def feature_selection(activity_threshold=3):
    """Train classifier on DonorsChoose set to choose most important features.

    Census and NCES district data are restricted to the districts present in
    the DonorsChoose set, rows with missing values are dropped, and each
    remaining district gets a boolean label: True when its DonorsChoose
    ``activity`` is strictly greater than ``activity_threshold``.

    INPUT:
        activity_threshold: districts with activity above this value are
            labelled True (default 3).
    OUTPUT:
        The value returned by ``feature_importance.importance`` for the
        numeric columns and the label (documented as the list of most
        important columns -- confirm against that helper).
    """
    dc_districts = get_donorschoose.districts()
    dc_index = dc_districts.index

    # Keep only the census rows for districts that appear in DonorsChoose.
    # NOTE(review): on recent pandas, .loc raises KeyError when any label is
    # missing -- confirm every DonorsChoose district exists in the census set.
    census = get_census.all_states()
    census = census.loc[dc_index].copy()

    columns = [
        "STNAME", "LATCOD", "LONCOD", "TOTALREV", "TFEDREV", "TSTREV",
        "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT", "Z32", "Z34", "Z35",
        "HR1", "HE1", "HE2",
    ]
    nces = get_nces.districts(columns=columns, nonneg=True)

    # Put census and NCES columns side by side, aligned on the census index,
    # then discard any district with a missing value.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    # Boolean label, restricted to the districts that survived dropna.
    label = dc_districts.activity > activity_threshold
    label = label.loc[data.index]

    # Show the class balance; the parenthesised form prints the same text on
    # Python 2 and is valid syntax on Python 3.
    print(label.value_counts())

    # Return the result instead of discarding it, as the docstring promises.
    return feature_importance.importance(data._get_numeric_data(), label)
def feature_selection(activity_threshold=3):
    """Train classifier on DonorsChoose set to choose most important features.

    Census and NCES district data are restricted to the districts present in
    the DonorsChoose set, rows with missing values are dropped, and each
    remaining district gets a boolean label: True when its DonorsChoose
    ``activity`` is strictly greater than ``activity_threshold``.

    INPUT:
        activity_threshold: districts with activity above this value are
            labelled True (default 3).
    OUTPUT:
        The value returned by ``feature_importance.importance`` for the
        numeric columns and the label (documented as the list of most
        important columns -- confirm against that helper).
    """
    dc_districts = get_donorschoose.districts()
    dc_index = dc_districts.index

    # Keep only the census rows for districts that appear in DonorsChoose.
    # NOTE(review): on recent pandas, .loc raises KeyError when any label is
    # missing -- confirm every DonorsChoose district exists in the census set.
    census = get_census.all_states()
    census = census.loc[dc_index].copy()

    columns = [
        "STNAME", "LATCOD", "LONCOD", "TOTALREV", "TFEDREV", "TSTREV",
        "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT", "Z32", "Z34", "Z35",
        "HR1", "HE1", "HE2",
    ]
    nces = get_nces.districts(columns=columns, nonneg=True)

    # Put census and NCES columns side by side, aligned on the census index,
    # then discard any district with a missing value.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    # Boolean label, restricted to the districts that survived dropna.
    label = dc_districts.activity > activity_threshold
    label = label.loc[data.index]

    # Show the class balance; the parenthesised form prints the same text on
    # Python 2 and is valid syntax on Python 3.
    print(label.value_counts())

    # Return the result instead of discarding it, as the docstring promises.
    return feature_importance.importance(data._get_numeric_data(), label)
def district_similarity():
    """Build the district similarity object from census and NCES data.

    Loads the census data for all states, fetches the selected NCES district
    columns (requested with ``nonneg=True``), joins the two column-wise on
    the census index, and wraps the combined frame in a
    ``similarity.Similarity``.

    OUTPUT: Similarity object
    """
    census_frame = get_census.all_states()

    nces_columns = [
        "STNAME", "LATCOD", "LONCOD", "TOTALREV", "TFEDREV", "TSTREV",
        "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT", "HR1", "HE1", "HE2",
    ]
    nces_frame = get_nces.districts(columns=nces_columns, nonneg=True)

    # Select the NCES rows by the census index so both frames line up
    # row-for-row before the column-wise concatenation.
    nces_aligned = nces_frame.loc[census_frame.index]
    combined = pd.concat([census_frame, nces_aligned], axis=1)

    # Columns handed to Similarity as ref_columns.
    reference_columns = ["District Name", "State", "STNAME", "LATCOD", "LONCOD"]
    return similarity.Similarity(combined, ref_columns=reference_columns)
def district_similarity():
    """Build the district similarity object from census and NCES data.

    Loads the census data for all states, fetches the selected NCES district
    columns (requested with ``nonneg=True``), joins the two column-wise on
    the census index, and wraps the combined frame in a
    ``similarity.Similarity``.

    OUTPUT: Similarity object
    """
    census_frame = get_census.all_states()

    nces_columns = [
        "STNAME", "LATCOD", "LONCOD", "TOTALREV", "TFEDREV", "TSTREV",
        "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT", "HR1", "HE1", "HE2",
    ]
    nces_frame = get_nces.districts(columns=nces_columns, nonneg=True)

    # Select the NCES rows by the census index so both frames line up
    # row-for-row before the column-wise concatenation.
    nces_aligned = nces_frame.loc[census_frame.index]
    combined = pd.concat([census_frame, nces_aligned], axis=1)

    # Columns handed to Similarity as ref_columns.
    reference_columns = ["District Name", "State", "STNAME", "LATCOD", "LONCOD"]
    return similarity.Similarity(combined, ref_columns=reference_columns)