Beispiel #1
0
def compdistr_crossval(alist, blist, kfold):
    """Evaluate compdistr over kfold independent cross-validation rounds.

    Both sample sets are split with their own shuffled KFold; in each round
    the two train folds feed compdistr, which is evaluated against the
    stacked test folds of both sets.

    Args:
        alist: array-like of samples, first set (indexed by row).
        blist: array-like of samples, second set (indexed by row).
        kfold: number of folds / rounds.

    Returns:
        The compdistr result of the middle fold (index kfold // 2).
    """
    afold = iter(KFold(alist.shape[0], n_folds=kfold, shuffle=True))
    bfold = iter(KFold(blist.shape[0], n_folds=kfold, shuffle=True))
    res = []
    for _ in range(kfold):
        # next() instead of the Python-2-only .next() method
        atrain, atest = next(afold)
        btrain, btest = next(bfold)
        a = alist[atrain]
        b = blist[btrain]

        # Test pool combines the held-out rows of both sets.
        test = vstack((alist[atest], blist[btest]))

        res.append(compdistr(a, b, test))
    # Floor division: kfold / 2 is a float in Python 3 and cannot index a list.
    return res[kfold // 2]
Beispiel #2
0
def import_data():
    """Load the full elections CSV, label a 60/20/20 split, and dump the raw splits.

    Assigns a 'split' column (0 = train, 1 = test, 2 = validation) using two
    test folds of a shuffled 5-fold partition, writes each raw split to
    ./data/output/, and returns the labelled frame.

    Returns:
        pd.DataFrame: the full data with the extra integer 'split' column.
    """
    # For .read_csv, always use header=0 when you know row 0 is the header row
    df = pd.read_csv("./data/ElectionsData-full.csv", header=0)

    df['split'] = 0
    # Iterate KFold's public (train, test) pairs instead of the private
    # _iter_test_indices(); each test fold is ~20% of the rows.
    test_folds = (test_idx for _, test_idx in
                  KFold(n=len(df), n_folds=5, shuffle=True))
    # .loc avoids chained assignment (df['split'][idx] = ...), which can
    # silently write to a temporary copy, and next() replaces Py2's .next().
    df.loc[next(test_folds), 'split'] = 1
    df.loc[next(test_folds), 'split'] = 2
    raw_data = df.copy()

    raw_data[raw_data['split'] == 0].drop('split', axis=1).to_csv(
        './data/output/raw_train.csv', index=False, sep=',')
    raw_data[raw_data['split'] == 1].drop('split', axis=1).to_csv(
        './data/output/raw_test.csv', index=False, sep=',')
    raw_data[raw_data['split'] == 2].drop('split', axis=1).to_csv(
        './data/output/raw_validation.csv', index=False)

    return df
def main():
    """Preprocessing pipeline for the elections dataset.

    Loads the CSV, labels a 60/20/20 train/test/validation split, writes the
    raw splits, cleans and imputes the selected features, scales and encodes
    them, persists the label encoder, and writes the transformed splits.
    """
    df = pd.read_csv('dataset/ElectionsData.csv')

    # 'split': 0 = train (3 folds), 1 = test, 2 = validation (one fold each).
    df['split'] = 0
    # Iterate KFold's public (train, test) pairs instead of the private
    # _iter_test_indices(); .loc avoids chained assignment, which can write
    # to a temporary copy, and next() replaces Py2's .next().
    test_folds = (test_idx for _, test_idx in
                  KFold(n=len(df), n_folds=5, shuffle=True))
    df.loc[next(test_folds), 'split'] = 1
    df.loc[next(test_folds), 'split'] = 2
    raw_data = df.copy()

    raw_data[raw_data['split'] == 0].drop('split', axis=1).to_csv('dataset/raw_train.csv', index=False)
    raw_data[raw_data['split'] == 1].drop('split', axis=1).to_csv('dataset/raw_test.csv', index=False)
    raw_data[raw_data['split'] == 2].drop('split', axis=1).to_csv('dataset/raw_validation.csv', index=False)

    all_features, discrete_features, continuous_features, categorical_features, numeric_features = split_features_by_type(df)
    features_to_keep = {'Yearly_ExpensesK', 'Yearly_IncomeK', 'Overall_happiness_score', 'Most_Important_Issue',
                        'Avg_Residancy_Altitude', 'Will_vote_only_large_party', 'Financial_agenda_matters'}
    df = mark_negative_values_as_nan(df)
    df = outlier_detection(df, continuous_features)

    # Fill missing values from correlated features.
    fill_f1_by_f2_linear(df, 'Yearly_ExpensesK', 'Avg_monthly_expense_on_pets_or_plants')
    fill_f1_by_f2_linear(df, 'Yearly_IncomeK', 'Avg_size_per_room')
    fill_f1_by_f2_linear(df, 'Overall_happiness_score', 'Political_interest_Total_Score')  # not perfectly correlated, but better than nothing
    fill_f1_by_f2_discrete(df, 'Most_Important_Issue', 'Last_school_grades')
    fill_f1_by_f2_linear(df, 'Avg_Residancy_Altitude', 'Avg_monthly_expense_when_under_age_21')
    fill_f1_by_f2_discrete(df, 'Will_vote_only_large_party', 'Looking_at_poles_results')
    fill_f1_by_f2_discrete(df, 'Financial_agenda_matters', 'Vote')
    # Remaining NaNs: impute with the mean of the rows sharing the same
    # 'Vote' value. groupby/transform replaces the per-row Python loop and
    # the unsafe chained assignment (df[c][row] = ...); the isnull() guard
    # keeps the original behavior of only computing means when needed.
    for c in features_to_keep:
        if df[c].isnull().any():
            df[c] = df[c].fillna(df.groupby('Vote')[c].transform('mean'))

    df = df[list(features_to_keep) + ['Vote', 'split']]
    reduce_Most_Important_Issue(df)
    z_score_scaling(df, list(features_to_keep.intersection(set(continuous_features))))
    l_encoder = label_encoder(df)
    df = categorical_features_transformation(df)

    # pickle needs a binary handle ('w' breaks under Python 3); the context
    # manager also guarantees the file is closed.
    with open('encoder.pickle', 'wb') as fh:
        pickle.dump(l_encoder, fh)
    df[df['split'] == 0].drop('split', axis=1).to_csv('dataset/transformed_train.csv', index=False)
    df[df['split'] == 1].drop('split', axis=1).to_csv('dataset/transformed_test.csv', index=False)
    df[df['split'] == 2].drop('split', axis=1).to_csv('dataset/transformed_validation.csv', index=False)