def discover_network(network, api_list):
    # First, load all of the previous influencers and relationships.
    influencers, infl_header = helpers.load_csv(
        dirs.dirs_dict["discoveries"][network])
    influencer_user_ids = set(helpers.list_of_keys(influencers, 'user_id'))
    relationships_loaded, rel_header = helpers.load_csv(
        dirs.dirs_dict["relationships"][network])

    # Next, check which users we've already searched.
    user_ids_searched = set(
        int(rel["follower_id"]) for rel in relationships_loaded)

    # We'll only search influencers whom we haven't searched yet. Use a
    # list comprehension, not filter(): the result must support append()
    # and remove() below, which a filter iterator does not.
    influencers_to_search = [
        i for i in influencers
        if i["user_id"] and int(i["user_id"]) not in user_ids_searched
    ]

    # While there are influencers to search, search the highest-ranked one.
    # Only expand while the degree is at most MAX_EXPANSION_DEGREE, so that
    # we don't run into low-quality people.
    while influencers_to_search:
        infl = max(influencers_to_search, key=helpers.influencer_norm)
        if infl['degree'] > configs.MAX_EXPANSION_DEGREE:
            break
        print("chose: " + str(infl))
        # This holds true if we first exhaust all degree 0's and then go
        # to degree 1's, 2's, etc.
        new_degree = infl['degree'] + 1
        try:
            follows = random.choice(api_list).get_follows(infl["user_id"])
            print("this person follows: " + str(follows))
            profile = social_apis.Profile(network, infl, infl['degree'],
                                          follows_list=follows)
            relationships_loaded.extend(profile.get_follows())
            profile.flush_follows()
            for fid in follows:
                flushed_dict = flush_followed_user(
                    random.choice(api_list), network, fid,
                    influencer_user_ids, new_degree)
                if not flushed_dict:
                    continue
                influencers.append(flushed_dict)
                influencer_user_ids.add(flushed_dict['user_id'])
                influencers_to_search.append(flushed_dict)
        except Exception:
            # Catch broadly so one bad profile doesn't stop the crawl.
            print("An error occurred - onto the next one")
        # Remove on success as well as failure, or this loop never advances.
        influencers_to_search.remove(infl)
    return None
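# Usage sketch: a minimal, hypothetical driver for discover_network. How the
# API clients are constructed is an assumption (social_apis.get_api is not
# shown anywhere in this code); only the discover_network call is real.
if __name__ == "__main__":
    apis = [social_apis.get_api("instagram")]  # hypothetical constructor
    discover_network("instagram", apis)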
def test_load_data(self):
    # Converter for the Prediction column: 0 if background ('b'), 1 if signal.
    convertfunc = lambda x: 0 if b'b' in x else 1
    converters = {"Prediction": convertfunc}
    data = load_csv(self.path, converters=converters)
    self.assertEqual(data[0]['Id'], data[0][0])  # test dtype naming
    self.assertEqual(data[0]["Prediction"], 1)  # test conversion of Prediction strings
    self.assertEqual(data.shape, (10, ))
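# A plausible load_csv implementation consistent with the test above (an
# assumption, not verified project code): numpy.genfromtxt with names=True
# returns a structured array, so rows support both data[0]['Id'] and
# data[0][0], and byte-string converters like the one above apply per column.
import numpy as np

def load_csv(path, converters=None):
    # dtype=None lets numpy infer each column's type from the data.
    return np.genfromtxt(path, delimiter=',', names=True,
                         converters=converters, dtype=None)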
def drop_nones():
    rows, header = helpers.load_csv(dirs.dirs_dict["discoveries"]["instagram"])
    new_rows = list()
    for row in rows:
        if row["username"] and row["user_id"]:
            new_rows.append(row)
    helpers.write_csv(dirs.dirs_dict["discoveries"]["instagram"], new_rows, header)
    return
def write_ranks(rks_dict):
    discovery_list, discovery_header = helpers.load_csv(
        dirs.dirs_dict["discoveries"]["instagram"])
    # Fixes a bug where some influencers are reported as None. Use a list
    # comprehension, not filter(): the result is indexed and sorted below.
    discovery_list = [infl for infl in discovery_list if infl['username']]
    for i, el in enumerate(discovery_list):
        discovery_list[i]["pagerank"] = rks_dict[el["user_id"]]
    discovery_header.append("pagerank")
    discovery_list.sort(key=lambda k: k["pagerank"], reverse=True)
    helpers.write_csv(
        dirs.dirs_dict["discoveries"]["instagram"] + "-pageranked",
        discovery_list, discovery_header)
    return None
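# A minimal sketch of how an rks_dict could be produced for write_ranks (an
# assumption: the ranking step is not shown in this code). It builds a
# directed graph from the relationships rows and runs PageRank over it; the
# "followed_id" column name is hypothetical ("follower_id" appears above).
import networkx as nx

def compute_ranks(relationships):
    graph = nx.DiGraph()
    for rel in relationships:
        # Edge direction: the follower endorses the followed account.
        graph.add_edge(rel["follower_id"], rel["followed_id"])
    return nx.pagerank(graph)  # dict mapping user_id -> PageRank score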
def dedup(folder, network, on_keys):
    rows, header = helpers.load_csv(dirs.dirs_dict[folder][network])
    if not rows:
        return
    stored_keys = set()
    new_rows = list()
    for row in rows:
        row_key = tuple(row[on_key] for on_key in on_keys)
        if row_key not in stored_keys:
            new_rows.append(row)
            stored_keys.add(row_key)
    helpers.write_csv(dirs.dirs_dict[folder][network], new_rows, header)
    return
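# Usage sketch for dedup; the key names are assumptions based on the
# "follower_id" field used elsewhere in this code:
# dedup("relationships", "instagram", ("follower_id", "followed_id"))
# dedup("discoveries", "instagram", ("user_id",))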
def main():
    t0 = time.time()

    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]
    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]
    ts_y_raw = load_txt("data/test_set_y.txt")
    ts_y = ts_y_raw[1]
    ts_y = ts_y[0:len(ts_y) - 1]  # because there's a newline at the end

    if compare_datasets:
        # Check our test labels against Eysteinn's.
        ts_y_alternate = load_csv("data/test_dataset.csv")
        different = []
        for i in range(len(ts_y_alternate)):
            # Compare values with !=, not object identity ("is not").
            if ts_y[i] != ts_y_alternate[i]:
                different.append(i)
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)

    print("Creating MultinomialNB classifier..")
    clf = MultinomialNB()
    clf.fit(tr_vectors, tr_y)
    ts_x_featurized = vectorizer.transform(ts_x)

    print("Making predictions..")
    predictions = clf.predict(ts_x_featurized)
    t1 = time.time()
    dt = t1 - t0

    correct_predictions = 0
    for i, row in enumerate(predictions):
        if row == ts_y[i]:
            correct_predictions += 1
    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n" %
          (correct_predictions, len(predictions),
           100. * correct_predictions / len(predictions), dt))
    print(classification_report(ts_y, predictions))
def load_data(file_name):
    global data_df
    global features_list
    global source
    try:
        read_data_df = load_csv(os.path.join(csv_data_dir, file_name))
    except Exception as e:
        print(e)
        return  # bail out so we don't use an undefined read_data_df below
    data_df = read_data_df
    features_list = [
        i for i in list(data_df.columns)
        if i not in ['tsne_x', 'tsne_y', 'image_path']
    ]
    # source = ColumnDataSource(data_df)
    cr.data_source.data = data_df
    toggle_class_select.options = features_list
    toggle_class_select.value = features_list[0]
    update_toggle_class('value', None, features_list[0])
    color_class_select.options = features_list
    color_class_select.value = features_list[0]
    update_color_class('value', None, features_list[0])
    update_class_selection('value', None, [])
    hover_tip_tool.tooltips = generate_tooltip_html()
data_df = load_csv('./embedding_visualization/data/show.csv')
features_list = [
    i for i in list(data_df.columns) if i not in ['tsne_x', 'tsne_y']
]
source = ColumnDataSource(data_df)
cls_color_mapper, color_cls_list, _ = get_color_mapper(features_list[0])

p = figure(plot_width=800,
           plot_height=800,
           match_aspect=True,
           tools=['pan', 'box_zoom', 'reset'],
           title='',
           sizing_mode='scale_height',
           output_backend="webgl")
cr = p.circle(x='tsne_x', y='tsne_y', color=cls_color_mapper, source=source)
cr.selection_glyph = Circle(fill_color=cls_color_mapper,
def test_load_predict_data(self):
    data = load_csv('test/unlabeled_dummy.csv')
def test_split_data(self):
    data = load_csv(self.path)
    trainset, testset = split_dataset(data, test_ratio=0.4)
    self.assertEqual(trainset.shape, (6, ))
    self.assertEqual(testset.shape, (4, ))
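# A plausible split_dataset consistent with the test above (an assumption,
# not verified project code): shuffle the row indices and slice off
# test_ratio of them for the test set. With 10 rows and test_ratio=0.4 this
# yields shapes (6,) and (4,), matching the assertions.
import numpy as np

def split_dataset(data, test_ratio=0.4, seed=0):
    rng = np.random.RandomState(seed)
    idx = rng.permutation(len(data))
    n_test = int(round(len(data) * test_ratio))
    return data[idx[n_test:]], data[idx[:n_test]]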
def test_write_csv(self):
    # Converter for the Prediction column: 0 if background ('b'), 1 if signal.
    convertfunc = lambda x: 0 if b'b' in x else 1
    converters = {"Prediction": convertfunc}
    data = load_csv(self.path, converters=converters)
    write_csv(data, "test/test_write.csv")
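# A plausible write_csv counterpart to the load_csv sketch earlier (again an
# assumption, not verified project code): write the structured array back
# out, taking the header row from the dtype's field names.
import csv

def write_csv(data, path):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(data.dtype.names)  # header from the structured dtype
        writer.writerows(data.tolist())    # rows as plain tuples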
def main():
    t0 = time.time()

    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]
    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]
    ts_y_raw = load_txt("data/test_set_y.txt")
    ts_y = ts_y_raw[1]
    ts_y = ts_y[0:len(ts_y) - 1]  # because there's a newline at the end

    if compare_datasets:
        # Check our test labels against Eysteinn's.
        ts_y_alternate = load_csv("data/test_dataset.csv")
        different = []
        for i in range(len(ts_y_alternate)):
            # Compare values with !=, not object identity ("is not").
            if ts_y[i] != ts_y_alternate[i]:
                different.append(i)
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)
    # Featurize the test set once, outside the grid-search loops.
    ts_x_featurized = vectorizer.transform(ts_x)

    print("Grid searching params for SVM classifier..")
    params = {
        'kernel': ('linear', 'poly', 'rbf'),
        'C': [1, 10],
        'degree': [2, 3, 4, 5],
        'coef0': [5, 7, 10, 15, 17, 20]
    }
    # params = {'kernel': ['poly'], 'C': [10], 'degree': [3], 'coef0': [5]}
    bestclf = None
    bestRes = 0
    bestPredictions = None
    for kernel in params['kernel']:
        for c in params['C']:
            for d in params['degree']:
                for coef in params['coef0']:
                    clf = svm.SVC(kernel=kernel, C=c, degree=d, coef0=coef)
                    clf.fit(tr_vectors, tr_y)
                    predictions = clf.predict(ts_x_featurized)
                    t1 = time.time()
                    correct_predictions = 0
                    for i, row in enumerate(predictions):
                        if row == ts_y[i]:
                            correct_predictions += 1
                    if correct_predictions > bestRes:
                        bestRes = correct_predictions
                        bestclf = clf
                        bestPredictions = predictions
                        print('kernel:', kernel, 'C:', c,
                              'degree:', d, 'coef0:', coef)
                        print('Numcorrect', bestRes)

    dt = t1 - t0
    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n" %
          (bestRes, len(bestPredictions),
           100. * bestRes / len(bestPredictions), dt))
    print(classification_report(ts_y, bestPredictions))
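# Design note: the nested loops above re-implement an exhaustive grid search.
# A minimal equivalent sketch using scikit-learn's GridSearchCV (selection
# here is by cross-validation on the training set, rather than by test-set
# accuracy as above):
from sklearn.model_selection import GridSearchCV

def grid_search_svm(tr_vectors, tr_y, params):
    search = GridSearchCV(svm.SVC(), params)
    search.fit(tr_vectors, tr_y)
    return search.best_estimator_, search.best_params_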
    # Tail of a get_or_create helper; the start of the function is not
    # shown in this snippet.
    session.add(instance)
    session.commit()
    return instance, True


engine = create_engine(configs.DB_NAME)

# Bind the engine to the metadata of the Base class so that the
# declaratives can be accessed through a DBSession instance.
Base.metadata.bind = engine
DBSession = sessionmaker(bind=engine)

# A DBSession() instance establishes all conversations with the database
# and represents a "staging zone" for all the objects loaded into the
# database session object. Any change made against the objects in the
# session won't be persisted into the database until you call
# session.commit(). If you're not happy about the changes, you can
# revert all of them back to the last commit by calling
# session.rollback().
session = DBSession()

influencers, infl_header = helpers.load_csv(
    dirs.dirs_dict["discoveries"]["instagram"])

# Insert a Profile in the profile table for each influencer.
for influencer in influencers:
    for key in ("time_pulled",):
        influencer.pop(key)
    profile = Profile(**influencer)
    profile_record, created = get_or_create(session, Profile,
                                            defaults=influencer,
                                            user_id=influencer["user_id"])
    print("created" if created else "--existed")
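# A plausible reconstruction of the full helper whose tail appears above,
# based on the common SQLAlchemy get-or-create pattern (an assumption, not
# code recovered from this project):
def get_or_create(session, model, defaults=None, **kwargs):
    instance = session.query(model).filter_by(**kwargs).first()
    if instance:
        return instance, False
    params = dict(defaults or {})
    params.update(kwargs)
    instance = model(**params)
    session.add(instance)
    session.commit()
    return instance, True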
def vaccines():
    """Displays vaccine tracker"""
    vaccines = {
        "MCV1": "The percentage of children under 1 year of age who have received at least one dose of measles-containing vaccine in a given year. For countries recommending the first dose of measles vaccine in children over 12 months of age, the indicator is calculated as the proportion of children less than 12-23 months of age receiving one dose of measles-containing vaccine.",
        "MCV2": "The percentage of children who have received two doses of measles containing vaccine (MCV2) in a given year, according to the nationally recommended schedule.",
        "BCG": "The percentage of 1-year-olds who have received one dose of bacille Calmette-Guérin (BCG) vaccine in a given year.",
        "DTP3": "The percentage of 1-year-olds who have received three doses of the combined diphtheria, tetanus toxoid and pertussis vaccine in a given year.",
        "PAB": "The proportion of neonates in a given year that can be considered as having been protected against tetanus as a result of maternal immunization.",
        "PCV3": "The percentage of 1-year-olds who have received three doses of pneumococcal conjugate vaccine (PCV3) in a given year.",
        "HepB3": "The percentage of 1-year-olds who have received three doses of hepatitis B vaccine in a given year.",
        "Pol3": "The percentage of 1-year-olds who have received three doses of polio vaccine in a given year.",
        "Hib3": "The percentage of 1-year-olds who have received three doses of Haemophilus influenzae type B vaccine in a given year.",
        "ROTAC": "The percentage of surviving infants who received the final recommended dose of rotavirus vaccine, which can be either the 2nd or the 3rd dose depending on the vaccine in a given year."
    }

    # Dictionary of dictionaries for vaccine data
    all_vaccines = {}
    for vaccine in vaccines:
        all_vaccines[vaccine] = load_csv(vaccine)

    if request.method == "GET":
        return render_template("vaccines.html", all_vaccines=all_vaccines)

    if request.method == "POST":
        if "search" in request.form:
            # Check that the user entered a vaccine abbreviation
            if not request.form.get("abbr"):
                flash("Must enter vaccine abbr.")
                return redirect("/vaccines")
            # Check that the input is an actual vaccine
            abbr = request.form.get("abbr")
            if abbr not in vaccines:
                flash("Data unavailable")
                return redirect("/vaccines")
            else:
                return render_template("vaccinedata.html",
                                       vaccine=abbr,
                                       vaccine_data=all_vaccines[abbr],
                                       vaccine_info=vaccines[abbr],
                                       startyear=1980,
                                       endyear=2018)

        if "datasearch" in request.form:
            vaccine = request.form["datasearch"]
            # If nothing is inputted, refresh the original page
            if not request.form.get("country") and not request.form.get(
                    "startyear") and not request.form.get("endyear"):
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)

            # Check that the country exists
            country = request.form["country"]
            if country != "" and country not in all_vaccines[vaccine]:
                flash("Country data not available")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)

            # Check that the years are valid
            startyear = request.form["startyear"]
            endyear = request.form["endyear"]
            if startyear != "" and (int(startyear) > 2018
                                    or int(startyear) < 1980):
                flash("Invalid start year")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)
            if endyear != "" and (int(endyear) > 2018
                                  or int(endyear) < 1980):
                flash("Invalid end year")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)
            if startyear != "" and endyear != "" and int(startyear) > int(endyear):
                flash("End year must be later than start year")
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)

            start = 1980
            if startyear != "":
                start = int(startyear)
            end = 2018
            if endyear != "":
                end = int(endyear)

            spliced_dict = {}
            if country != "":
                spliced_dict[country] = all_vaccines[vaccine][country]
            else:
                spliced_dict = all_vaccines[vaccine]
            # Values are ordered newest-first (2018 down to 1980), so the
            # slice below keeps the years from `end` down to `start`.
            for country in spliced_dict:
                spliced_dict[country] = spliced_dict[country][2018 - end:2019 - start]
            return render_template("vaccinedata.html",
                                   vaccine=vaccine,
                                   vaccine_data=spliced_dict,
                                   vaccine_info=vaccines[vaccine],
                                   startyear=start,
                                   endyear=end)

        for vaccine in vaccines:
            if vaccine in request.form:
                return render_template("vaccinedata.html",
                                       vaccine=vaccine,
                                       vaccine_data=all_vaccines[vaccine],
                                       vaccine_info=vaccines[vaccine],
                                       startyear=1980,
                                       endyear=2018)
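# Worked example of the year slicing above. It assumes each country's list is
# ordered newest-first, 2018 down to 1980 (implied by the indexing, not
# stated in this code): for start=2000 and end=2010 the slice is [8:19],
# i.e. the eleven entries covering 2010 down to 2000 inclusive.
years = list(range(2018, 1979, -1))  # hypothetical newest-first ordering
start, end = 2000, 2010
assert years[2018 - end:2019 - start] == list(range(2010, 1999, -1))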