def feature_selection(activity_threshold=3):
    """Rank features by how well they separate active DonorsChoose districts.

    Builds one feature table for the districts known to DonorsChoose by
    placing the census columns next to a fixed set of NCES columns, labels
    each district as active when its ``activity`` exceeds
    ``activity_threshold``, and passes the numeric features and the labels to
    ``feature_importance.importance``.

    INPUT:
        activity_threshold: districts whose ``activity`` value is strictly
            greater than this number get the label True.

    OUTPUT:
        Whatever ``feature_importance.importance`` returns; the original notes
        describe it as a list of the most important columns (TODO confirm
        against that helper).
    """
    dc_districts = get_donorschoose.districts()

    # Restrict the census table to the districts DonorsChoose knows about.
    # NOTE(review): label-based selection with ids that are absent from the
    # table raises KeyError on recent pandas releases -- confirm every id is
    # present, or switch these two lookups to ``reindex``.
    census = get_census.all_states()
    census = census.loc[dc_districts.index].copy()

    nces_columns = [
        "STNAME", "LATCOD", "LONCOD",
        "TOTALREV", "TFEDREV", "TSTREV", "TLOCREV",
        "TOTALEXP", "TCURSSVC", "TCAPOUT",
        "Z32", "Z34", "Z35",
        "HR1", "HE1", "HE2",
    ]
    nces = get_nces.districts(columns=nces_columns, nonneg=True)

    # Side-by-side join on the district index; rows with any missing value
    # are discarded so features and labels stay complete.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    label = dc_districts.activity > activity_threshold
    label = label.loc[data.index]

    # Report the class balance. The parenthesised single-argument form is
    # valid both as a Python 2 print statement and a Python 3 call.
    print(label.value_counts())

    # Previously the result was computed and thrown away, so callers always
    # received None despite the documented OUTPUT.
    return feature_importance.importance(data._get_numeric_data(), label)
def feature_selection(activity_threshold=3):
    """Rank features by how well they separate active DonorsChoose districts.

    Builds one feature table for the districts known to DonorsChoose by
    placing the census columns next to a fixed set of NCES columns, labels
    each district as active when its ``activity`` exceeds
    ``activity_threshold``, and passes the numeric features and the labels to
    ``feature_importance.importance``.

    INPUT:
        activity_threshold: districts whose ``activity`` value is strictly
            greater than this number get the label True.

    OUTPUT:
        Whatever ``feature_importance.importance`` returns; the original notes
        describe it as a list of the most important columns (TODO confirm
        against that helper).
    """
    dc_districts = get_donorschoose.districts()

    # Restrict the census table to the districts DonorsChoose knows about.
    # NOTE(review): label-based selection with ids that are absent from the
    # table raises KeyError on recent pandas releases -- confirm every id is
    # present, or switch these two lookups to ``reindex``.
    census = get_census.all_states()
    census = census.loc[dc_districts.index].copy()

    nces_columns = [
        "STNAME", "LATCOD", "LONCOD",
        "TOTALREV", "TFEDREV", "TSTREV", "TLOCREV",
        "TOTALEXP", "TCURSSVC", "TCAPOUT",
        "Z32", "Z34", "Z35",
        "HR1", "HE1", "HE2",
    ]
    nces = get_nces.districts(columns=nces_columns, nonneg=True)

    # Side-by-side join on the district index; rows with any missing value
    # are discarded so features and labels stay complete.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    label = dc_districts.activity > activity_threshold
    label = label.loc[data.index]

    # Report the class balance. The parenthesised single-argument form is
    # valid both as a Python 2 print statement and a Python 3 call.
    print(label.value_counts())

    # Previously the result was computed and thrown away, so callers always
    # received None despite the documented OUTPUT.
    return feature_importance.importance(data._get_numeric_data(), label)
def _district_tooltip(sim, dc_districts, active_districts, leaid, name):
    """Build the HTML tooltip text for one recommended district.

    The tooltip names the district and its student count, followed by the
    most similar active district with its student count, project count and
    donations per project. Lines are joined with ``<br/>``.
    """
    lines = [
        "{}".format(name),
        "students: {}".format(
            sim.data.loc[leaid, "Total Students"].astype(int)),
        "",
    ]

    # Keep only the similar districts that are active. The caller passes a
    # non-active ``leaid``, so this filter also removes the district itself;
    # the former ``most_sim.drop(leaid)`` discarded its result and did nothing.
    most_sim = sim.most_similar(leaid)
    active_matches = [d for d in most_sim.index if d in active_districts]
    if not active_matches:
        # No active district to compare against (this used to raise
        # IndexError); fall back to the district's own information.
        return "<br/>".join(lines)

    most_sim = most_sim.loc[active_matches]
    closest = most_sim.index[0]

    lines.append("Most similar to {}, {}".format(
        most_sim.loc[closest, "District Name"],
        most_sim.loc[closest, "State"]))
    lines.append("students: {}".format(
        sim.data.loc[closest, "Total Students"].astype(int)))
    lines.append("projects: {}".format(
        dc_districts.loc[closest, "projects"].astype(int)))
    donation_per_project = (dc_districts.loc[closest, "total_donations"] /
                            dc_districts.loc[closest, "projects"])
    lines.append(
        "received donations/project: ${:.2f}".format(donation_per_project))
    return "<br/>".join(lines)


def potential_districts(sim, n_potential=15, activity_threshold=3):
    """Find potentially active districts outside of the DonorsChoose network.

    Districts in ``sim.data`` that are not active on DonorsChoose are scored
    with ``sim.rms_score`` against the active ones, the best ``n_potential``
    per state are kept, a tooltip is attached to each, and the result is
    passed to ``to_geojson``.

    INPUT:
        sim: object exposing a ``data`` DataFrame indexed by district id and
            the methods ``rms_score`` and ``most_similar``.
        n_potential: maximum number of recommendations per state.
        activity_threshold: a DonorsChoose district counts as active when its
            ``activity`` is strictly greater than this value.

    OUTPUT:
        pandas DataFrame of recommended districts with the columns
        "District Name", "STNAME", "State", "LATCOD", "LONCOD", "score" and
        "info". The original notes also list ``districts.topo.json`` and
        ``district.json``, presumably written by ``to_geojson`` (TODO
        confirm).
    """
    dc_districts = get_donorschoose.districts()

    # Builtin ``int`` is what the ``np.int`` alias resolved to.
    is_active = dc_districts.activity > activity_threshold
    active_districts = set(dc_districts[is_active].index.values.astype(int))
    all_districts = set(sim.data.index.values.astype(int))
    potential = all_districts - (active_districts & all_districts)

    # NOTE(review): pairing ids with scores below assumes ``rms_score``
    # returns one score per candidate in the iteration order of ``potential``.
    rms = sim.rms_score(potential, active_districts, normalize=True)

    # Candidates most similar to active districts first. Passing ``columns``
    # to the constructor keeps this working when there are no candidates.
    ranked = sorted(zip(potential, rms), key=lambda pair: pair[1],
                    reverse=True)
    potential_df = pd.DataFrame(ranked, columns=["leaid", "score"])
    potential_df = potential_df.set_index("leaid")
    potential_df["State"] = sim.data["State"].loc[potential_df.index]

    # Pick at most n_potential recommendations for each state.
    recommend = []
    for state in sim.data.State.value_counts().index:
        in_state = potential_df[potential_df.State == state]
        recommend.extend(in_state.head(n_potential).index.values)

    # Work on an explicit copy so the column assignments below act on an
    # independent frame.
    rec_columns = ["District Name", "STNAME", "State", "LATCOD", "LONCOD"]
    rec_df = sim.data[rec_columns].loc[recommend].copy()
    rec_df["score"] = potential_df.score.loc[recommend]

    n_before = len(rec_df)
    rec_df.dropna(inplace=True)
    print("NaNs: drop {} districts".format(n_before - len(rec_df)))

    # Build the tooltip text, one entry per remaining recommendation.
    district_info = [
        _district_tooltip(sim, dc_districts, active_districts, leaid,
                          rec_df.loc[leaid, "District Name"])
        for leaid in rec_df.index
    ]
    rec_df["info"] = pd.Series(district_info, index=rec_df.index)

    to_geojson(rec_df)
    return rec_df
def _district_tooltip(sim, dc_districts, active_districts, leaid, name):
    """Build the HTML tooltip text for one recommended district.

    The tooltip names the district and its student count, followed by the
    most similar active district with its student count, project count and
    donations per project. Lines are joined with ``<br/>``.
    """
    lines = [
        "{}".format(name),
        "students: {}".format(
            sim.data.loc[leaid, "Total Students"].astype(int)),
        "",
    ]

    # Keep only the similar districts that are active. The caller passes a
    # non-active ``leaid``, so this filter also removes the district itself;
    # the former ``most_sim.drop(leaid)`` discarded its result and did nothing.
    most_sim = sim.most_similar(leaid)
    active_matches = [d for d in most_sim.index if d in active_districts]
    if not active_matches:
        # No active district to compare against (this used to raise
        # IndexError); fall back to the district's own information.
        return "<br/>".join(lines)

    most_sim = most_sim.loc[active_matches]
    closest = most_sim.index[0]

    lines.append("Most similar to {}, {}".format(
        most_sim.loc[closest, "District Name"],
        most_sim.loc[closest, "State"]))
    lines.append("students: {}".format(
        sim.data.loc[closest, "Total Students"].astype(int)))
    lines.append("projects: {}".format(
        dc_districts.loc[closest, "projects"].astype(int)))
    donation_per_project = (dc_districts.loc[closest, "total_donations"] /
                            dc_districts.loc[closest, "projects"])
    lines.append(
        "received donations/project: ${:.2f}".format(donation_per_project))
    return "<br/>".join(lines)


def potential_districts(sim, n_potential=15, activity_threshold=3):
    """Find potentially active districts outside of the DonorsChoose network.

    Districts in ``sim.data`` that are not active on DonorsChoose are scored
    with ``sim.rms_score`` against the active ones, the best ``n_potential``
    per state are kept, a tooltip is attached to each, and the result is
    passed to ``to_geojson``.

    INPUT:
        sim: object exposing a ``data`` DataFrame indexed by district id and
            the methods ``rms_score`` and ``most_similar``.
        n_potential: maximum number of recommendations per state.
        activity_threshold: a DonorsChoose district counts as active when its
            ``activity`` is strictly greater than this value.

    OUTPUT:
        pandas DataFrame of recommended districts with the columns
        "District Name", "STNAME", "State", "LATCOD", "LONCOD", "score" and
        "info". The original notes also list ``districts.topo.json`` and
        ``district.json``, presumably written by ``to_geojson`` (TODO
        confirm).
    """
    dc_districts = get_donorschoose.districts()

    # Builtin ``int`` is what the ``np.int`` alias resolved to.
    is_active = dc_districts.activity > activity_threshold
    active_districts = set(dc_districts[is_active].index.values.astype(int))
    all_districts = set(sim.data.index.values.astype(int))
    potential = all_districts - (active_districts & all_districts)

    # NOTE(review): pairing ids with scores below assumes ``rms_score``
    # returns one score per candidate in the iteration order of ``potential``.
    rms = sim.rms_score(potential, active_districts, normalize=True)

    # Candidates most similar to active districts first. Passing ``columns``
    # to the constructor keeps this working when there are no candidates.
    ranked = sorted(zip(potential, rms), key=lambda pair: pair[1],
                    reverse=True)
    potential_df = pd.DataFrame(ranked, columns=["leaid", "score"])
    potential_df = potential_df.set_index("leaid")
    potential_df["State"] = sim.data["State"].loc[potential_df.index]

    # Pick at most n_potential recommendations for each state.
    recommend = []
    for state in sim.data.State.value_counts().index:
        in_state = potential_df[potential_df.State == state]
        recommend.extend(in_state.head(n_potential).index.values)

    # Work on an explicit copy so the column assignments below act on an
    # independent frame.
    rec_columns = ["District Name", "STNAME", "State", "LATCOD", "LONCOD"]
    rec_df = sim.data[rec_columns].loc[recommend].copy()
    rec_df["score"] = potential_df.score.loc[recommend]

    n_before = len(rec_df)
    rec_df.dropna(inplace=True)
    print("NaNs: drop {} districts".format(n_before - len(rec_df)))

    # Build the tooltip text, one entry per remaining recommendation.
    district_info = [
        _district_tooltip(sim, dc_districts, active_districts, leaid,
                          rec_df.loc[leaid, "District Name"])
        for leaid in rec_df.index
    ]
    rec_df["info"] = pd.Series(district_info, index=rec_df.index)

    to_geojson(rec_df)
    return rec_df