Beispiel #1
0
def feature_selection(activity_threshold=3):
    """Rank census/NCES features by how well they predict DonorsChoose activity.

    Census rows are selected for the DonorsChoose district index, joined
    column-wise with the requested NCES columns, and rows containing any
    missing value are dropped.  Each remaining district is labelled by whether
    its ``activity`` exceeds ``activity_threshold``; the numeric columns and
    that label are handed to ``feature_importance.importance``.

    INPUT: activity_threshold -- a district is labelled True when its
           DonorsChoose ``activity`` is strictly greater than this value
    OUTPUT: the result of ``feature_importance.importance`` (presumably the
            list of most important columns -- confirm against that helper)
    """

    dc_districts = get_donorschoose.districts()
    dc_index = dc_districts.index

    # Restrict the census table to the districts indexed by DonorsChoose.
    census = get_census.all_states()
    census = census.loc[dc_index].copy()

    columns = ["STNAME", "LATCOD", "LONCOD",
               "TOTALREV", "TFEDREV", "TSTREV", "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT",
               "Z32", "Z34", "Z35", "HR1", "HE1", "HE2"]
    nces = get_nces.districts(columns=columns, nonneg=True)

    # Line NCES rows up with the census index, then discard incomplete rows.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    # Boolean label, narrowed to the districts that survived dropna().
    label = dc_districts.activity > activity_threshold
    label = label.loc[data.index]

    # Report how many districts fall on each side of the threshold.  The call
    # form of print behaves the same on Python 2 and Python 3.
    print(label.value_counts())

    # Only numeric columns are passed on.  The result used to be discarded
    # even though the docstring advertises an output, so return it.
    return feature_importance.importance(data._get_numeric_data(), label)
Beispiel #2
0
def feature_selection(activity_threshold=3):
    """Train a classifier on DonorsChoose districts to surface important features.

    The feature table is the census data for the DonorsChoose district index
    concatenated column-wise with a fixed set of NCES columns; rows with any
    missing value are dropped.  The label is True for districts whose
    ``activity`` is strictly greater than ``activity_threshold``.

    INPUT: activity_threshold -- cut-off applied to the DonorsChoose
           ``activity`` column to build the boolean label
    OUTPUT: whatever ``feature_importance.importance`` returns for the numeric
            feature columns and the label (presumably the list of most
            important columns -- TODO confirm against that helper)
    """

    dc_districts = get_donorschoose.districts()
    dc_index = dc_districts.index

    # Keep only census rows for districts that appear in DonorsChoose.
    census = get_census.all_states()
    census = census.loc[dc_index].copy()

    columns = [
        "STNAME", "LATCOD", "LONCOD", "TOTALREV", "TFEDREV", "TSTREV",
        "TLOCREV", "TOTALEXP", "TCURSSVC", "TCAPOUT", "Z32", "Z34", "Z35",
        "HR1", "HE1", "HE2"
    ]
    nces = get_nces.districts(columns=columns, nonneg=True)

    # Join census and NCES side by side on the census index; drop rows that
    # have a missing value in any column.
    data = pd.concat([census, nces.loc[census.index]], axis=1)
    data.dropna(inplace=True)

    # Label is restricted to the rows still present in the feature table.
    label = dc_districts.activity > activity_threshold
    label = label.loc[data.index]

    # Print the True/False counts.  print(...) with a single argument is
    # valid and equivalent on both Python 2 and Python 3.
    print(label.value_counts())

    # Return the helper's result instead of dropping it, so the documented
    # OUTPUT is actually available to callers.
    return feature_importance.importance(data._get_numeric_data(), label)
Beispiel #3
0
def district_similarity():
    """Build a Similarity object from the combined census and NCES tables.

    The census table for all states is concatenated column-wise with the
    selected NCES columns, with the NCES rows looked up by the census index.

    OUTPUT: similarity.Similarity built from the combined table
    """
    nces_fields = [
        "STNAME", "LATCOD", "LONCOD",
        "TOTALREV", "TFEDREV", "TSTREV", "TLOCREV",
        "TOTALEXP", "TCURSSVC", "TCAPOUT",
        "HR1", "HE1", "HE2",
    ]
    # Passed through as ref_columns; presumably identifying/location columns
    # rather than features -- confirm against similarity.Similarity.
    reference_fields = ["District Name", "State", "STNAME", "LATCOD", "LONCOD"]

    census_frame = get_census.all_states()
    nces_frame = get_nces.districts(columns=nces_fields, nonneg=True)

    # Select NCES rows in census order so both frames share one index.
    nces_aligned = nces_frame.loc[census_frame.index]
    combined = pd.concat([census_frame, nces_aligned], axis=1)

    return similarity.Similarity(combined, ref_columns=reference_fields)
Beispiel #4
0
def district_similarity():
    """Create the district Similarity object from census plus NCES data.

    OUTPUT: Similarity object constructed over the census table joined
            column-wise with the requested NCES columns
    """
    all_census = get_census.all_states()

    # NCES columns to pull in alongside the census data.
    wanted = ["STNAME", "LATCOD", "LONCOD"]
    wanted += ["TOTALREV", "TFEDREV", "TSTREV", "TLOCREV"]
    wanted += ["TOTALEXP", "TCURSSVC", "TCAPOUT"]
    wanted += ["HR1", "HE1", "HE2"]
    all_nces = get_nces.districts(columns=wanted, nonneg=True)

    # NCES rows are looked up by the census index before the two tables are
    # placed side by side.
    frames = [all_census, all_nces.loc[all_census.index]]
    joined = pd.concat(frames, axis=1)

    ref_cols = ["District Name", "State", "STNAME", "LATCOD", "LONCOD"]
    return similarity.Similarity(joined, ref_columns=ref_cols)