def compdistr_crossval(alist, blist, kfold):
    """Cross-validated distribution comparison of two sample sets.

    Independently splits ``alist`` and ``blist`` into ``kfold`` shuffled
    folds, runs ``compdistr`` on each train/held-out split (the held-out
    samples of both sets are pooled into one test matrix), and returns
    the result of the middle fold.

    Parameters
    ----------
    alist, blist : array-like, rows are observations.
    kfold : int, number of cross-validation folds.
    """
    # NOTE(review): this is the pre-0.18 sklearn KFold API
    # (KFold(n, n_folds=...)) — confirm against the installed version.
    afold = iter(KFold(alist.shape[0], n_folds=kfold, shuffle=True))
    bfold = iter(KFold(blist.shape[0], n_folds=kfold, shuffle=True))
    res = []
    for _ in range(kfold):
        # next(it) instead of the Python-2-only it.next() method.
        atrain, atest = next(afold)
        btrain, btest = next(bfold)
        a = alist[atrain]
        b = blist[btrain]
        # Pool the held-out samples from both distributions.
        test = vstack((alist[atest], blist[btest]))
        res.append(compdistr(a, b, test))
    # Floor division: kfold / 2 is a float in Python 3 and would raise
    # TypeError when used as a list index.
    return res[kfold // 2]
def import_data():
    """Load the full elections CSV and split it 60/20/20 into
    train / test / validation files under ./data/output/.

    Returns
    -------
    pandas.DataFrame
        The full frame with an added 'split' column
        (0 = train, 1 = test, 2 = validation).
    """
    # header=0: row 0 of the CSV is the header row.
    df = pd.read_csv("./data/ElectionsData-full.csv", header=0)
    df['split'] = 0
    # NOTE(review): _iter_test_indices is a private sklearn API and may
    # break across versions — confirm or replace with the public split().
    indices = KFold(n=len(df), n_folds=5, shuffle=True)._iter_test_indices()
    # .loc avoids chained assignment (which can silently write to a copy);
    # next(it) replaces the Python-2-only it.next().
    df.loc[next(indices), 'split'] = 1
    df.loc[next(indices), 'split'] = 2
    raw_data = df.copy()
    # One consistent writer for all three splits (sep=',' is the default,
    # so output is byte-identical to the original's mixed call styles).
    for split_value, name in ((0, 'raw_train'), (1, 'raw_test'), (2, 'raw_validation')):
        raw_data[raw_data['split'] == split_value].drop('split', axis=1).to_csv(
            './data/output/%s.csv' % name, index=False, sep=',')
    return df
def main():
    """End-to-end preprocessing pipeline for the elections dataset:
    split into train/test/validation, clean, impute, scale, encode,
    then write the raw and transformed splits under dataset/.
    """
    df = pd.read_csv('dataset/ElectionsData.csv')

    # --- 60/20/20 split: 0 = train, 1 = test, 2 = validation ---
    df['split'] = 0
    # NOTE(review): _iter_test_indices is a private sklearn API and may
    # break across versions — confirm or replace with the public split().
    indices = KFold(n=len(df), n_folds=5, shuffle=True)._iter_test_indices()
    # next(it) replaces the Python-2-only it.next(); .loc avoids chained
    # assignment, which can silently write to a copy of the frame.
    df.loc[next(indices), 'split'] = 1
    df.loc[next(indices), 'split'] = 2

    raw_data = df.copy()
    for split_value, name in ((0, 'raw_train'), (1, 'raw_test'), (2, 'raw_validation')):
        raw_data[raw_data['split'] == split_value].drop('split', axis=1).to_csv(
            'dataset/%s.csv' % name, index=False)

    all_features, discrete_features, continuous_features, categorical_features, numeric_features = \
        split_features_by_type(df)

    features_to_keep = {'Yearly_ExpensesK', 'Yearly_IncomeK',
                        'Overall_happiness_score', 'Most_Important_Issue',
                        'Avg_Residancy_Altitude', 'Will_vote_only_large_party',
                        'Financial_agenda_matters'}

    df = mark_negative_values_as_nan(df)
    df = outlier_detection(df, continuous_features)

    # Fill missing values from strongly correlated features.
    fill_f1_by_f2_linear(df, 'Yearly_ExpensesK', 'Avg_monthly_expense_on_pets_or_plants')
    fill_f1_by_f2_linear(df, 'Yearly_IncomeK', 'Avg_size_per_room')
    fill_f1_by_f2_linear(df, 'Overall_happiness_score', 'Political_interest_Total_Score')
    # Not perfectly correlated, but better than nothing.
    fill_f1_by_f2_discrete(df, 'Most_Important_Issue', 'Last_school_grades')
    fill_f1_by_f2_linear(df, 'Avg_Residancy_Altitude', 'Avg_monthly_expense_when_under_age_21')
    fill_f1_by_f2_discrete(df, 'Will_vote_only_large_party', 'Looking_at_poles_results')
    fill_f1_by_f2_discrete(df, 'Financial_agenda_matters', 'Vote')

    # Remaining NaNs: impute with the mean of the same 'Vote' group.
    # Iterate actual index labels (the original enumerated positions,
    # which is only correct for a default RangeIndex) and write via
    # .loc so the assignment hits the frame, not a copy.
    for c in features_to_keep:
        for row in df.index[df[c].isnull()]:
            df.loc[row, c] = df[df.Vote == df.Vote[row]][c].mean()

    df = df[list(features_to_keep) + ['Vote', 'split']]
    reduce_Most_Important_Issue(df)
    z_score_scaling(df, list(features_to_keep.intersection(set(continuous_features))))
    l_encoder = label_encoder(df)
    df = categorical_features_transformation(df)

    # pickle requires binary mode in Python 3 ('w' breaks it), and the
    # original never closed the file handle — 'with' fixes both.
    with open('encoder.pickle', 'wb') as fh:
        pickle.dump(l_encoder, fh)

    for split_value, name in ((0, 'transformed_train'),
                              (1, 'transformed_test'),
                              (2, 'transformed_validation')):
        df[df['split'] == split_value].drop('split', axis=1).to_csv(
            'dataset/%s.csv' % name, index=False)