def benchmark_solution():
    """Cross-validate the baseline random forest on the raw training columns.

    Loads ``train.csv``, replaces the ``target`` column with its
    label-encoded form, and hands every column except ``target`` and ``id``
    to ``cross_v`` together with a fresh estimator from ``get_rf()``.

    No predictions are produced here, so the test set is not read.
    """
    train = load_data('train.csv')
    train['target'] = preprocessing.LabelEncoder().fit_transform(train['target'])

    feature_cols = [col for col in train.columns if col not in ('target', 'id')]

    print("benchmark solution")
    # Figure noted next to this call by the original author (presumably the
    # cross-validation score): 0.596256539386
    cross_v(get_rf(), train[feature_cols].values, train['target'].values)
def feature_engineering_solution():
    """Cross-validate the random forest on engineered training features.

    Loads ``train.csv``, replaces the ``target`` column with its
    label-encoded form, runs every column except ``target`` and ``id``
    through ``feature_engineering``, and hands the result to ``cross_v``
    together with a fresh estimator from ``get_rf()``.

    No predictions are produced here, so the test set is neither read nor
    transformed.
    """
    train = load_data('train.csv')
    train['target'] = preprocessing.LabelEncoder().fit_transform(train['target'])

    raw_cols = [col for col in train.columns if col not in ('target', 'id')]
    # Figures noted by the original author at this step (presumably scores
    # obtained with individual engineered features):
    #   std 0.607958003167   mean 0.615741311533
    X_train = feature_engineering(train[raw_cols])
    y = train['target']

    print('feature_engineering_solution')
    # Figure noted next to this call by the original author: 0.600017926514
    cross_v(get_rf(), X_train.values, y.values)
def parameter_tuning_solution():
    """Score the tuned random forest, then fit it and write a submission.

    Steps, in order:
      1. Load ``train.csv`` and ``test.csv``.
      2. Replace ``train['target']`` with its label-encoded form.
      3. Use every training column except ``target`` and ``id`` as features,
         selecting the same columns from the test frame.
      4. Cross-validate one estimator from ``get_tuned_rf()`` via ``cross_v``.
      5. Fit a second, fresh estimator on the training features and write its
         ``predict_proba`` output for the test set with ``write_submission``.
    """
    train = load_data('train.csv')
    test = load_data('test.csv')

    encoder = preprocessing.LabelEncoder()
    train['target'] = encoder.fit_transform(train['target'])

    excluded = ('target', 'id')
    feature_cols = [name for name in train.columns if name not in excluded]
    train_features = train[feature_cols]
    test_features = test[feature_cols]
    labels = train['target']
    submission_ids = test['id']

    print('parameter_tuning_solution800_6')
    # Figure noted next to this call by the original author: 0.546637992781
    cross_v(get_tuned_rf(), train_features.values, labels.values)

    # A separate estimator instance is fitted for the submission.
    model = get_tuned_rf()
    model.fit(train_features, labels)
    probabilities = model.predict_proba(test_features)
    write_submission(
        submission_ids,
        probabilities,
        'submissions/parameter_tuning_solution800_6.csv',
    )
def clf_score(models, X_train, y_train):
    """Cross-validate each model and collect the results in a DataFrame.

    Args:
        models: Iterable of sequences whose first item is a label and whose
            second item is the estimator handed to ``cross_v``.
        X_train: Training features. If the object has a ``.values``
            attribute (pandas objects do) that is used; anything else is
            passed to ``cross_v`` unchanged.
        y_train: Training targets, same convention as ``X_train``.

    Returns:
        A ``pd.DataFrame`` with one row per model, indexed by the model
        labels, holding whatever ``cross_v`` returned for that model. An
        empty ``models`` yields an empty frame.
    """
    # Unwrap once rather than on every iteration; inputs without ``.values``
    # (e.g. plain arrays) go through as they are.
    features = getattr(X_train, 'values', X_train)
    targets = getattr(y_train, 'values', y_train)

    labels = []
    scores = []
    for entry in models:
        # Positional access (not unpacking) so entries carrying extra items
        # keep working as before.
        label, estimator = entry[0], entry[1]
        result = cross_v(estimator, features, targets)
        print(label)
        print(result)
        labels.append(label)
        scores.append(result)
    return pd.DataFrame(scores, index=labels)
def feature_selection_solution():
    """Score and submit a random forest trained on a pruned feature set.

    Steps, in order:
      1. Load ``train.csv`` and ``test.csv``.
      2. Replace ``train['target']`` with its label-encoded form.
      3. Run every column except ``target`` and ``id`` of both frames
         through ``feature_engineering``.
      4. Drop the columns ``mean``, ``std``, ``nonzero``, ``feat_6``,
         ``feat_82`` and ``feat_84`` from both results.
      5. Cross-validate one estimator from ``get_rf()`` via ``cross_v``.
      6. Fit a second, fresh estimator and write its ``predict_proba``
         output for the test set with ``write_submission``.
    """
    train = load_data('train.csv')
    test = load_data('test.csv')

    encoder = preprocessing.LabelEncoder()
    train['target'] = encoder.fit_transform(train['target'])

    raw_cols = [name for name in train.columns if name not in ('target', 'id')]
    X_train = feature_engineering(train[raw_cols])
    X_test = feature_engineering(test[raw_cols])

    dropped = ('mean', 'std', 'nonzero', 'feat_6', 'feat_82', 'feat_84')
    kept_cols = [name for name in X_train.columns if name not in dropped]
    X_train = X_train[kept_cols]
    X_test = X_test[kept_cols]
    print(X_train.columns)

    y = train['target']
    test_ids = test['id']

    print('feature_selection_solution')
    # Figures noted next to this call by the original author:
    #   mean 0.595288515439   std 0.593551044059   nonzero 0.597406303207
    #   no fg 6 82 84 0.603600594376
    #   0.600058535601
    cross_v(get_rf(), X_train.values, y.values)

    # A separate estimator instance is fitted for the submission.
    model = get_rf()
    model.fit(X_train, y)
    probabilities = model.predict_proba(X_test)
    write_submission(
        test_ids,
        probabilities,
        'submissions/feature_selection_rf100_84_82_6_nofg.csv',
    )
# NOTE(review): the next three statements are the tail of a definition whose
# header lies outside the reviewed span (they read `clfs`, `name` and `params`
# before anything here binds them). They are kept as found apart from line
# breaks and the print call form; re-indent them to sit inside that enclosing
# body. TODO confirm against the full file.
clf=clfs[name]['est'](**params)
print(name)
#cross_v(clf,X_train.values,y.values)
clf.fit(X_train.values,y.values)


def main():
    """Run ``grid_search`` over the classifiers returned by ``get_clfs()``.

    Loads ``train.csv``, runs every column except ``target`` and ``id``
    through ``feature_engineering``, and passes the result, the ``target``
    column and ``get_clfs()`` to ``grid_search``.
    """
    train = load_data('train.csv')
    feature_cols = [col for col in train.columns if col not in ('target', 'id')]
    X_train = feature_engineering(train[feature_cols])
    # NOTE(review): unlike the *_solution functions in this file, the target
    # is not label-encoded here - confirm grid_search expects raw labels.
    y = train['target']
    grid_search(X_train, y, get_clfs())


# The entry-point guard is left commented out, as in the original, so main()
# is not invoked; the statements below run whenever this module is executed
# or imported.
#if __name__ == '__main__':
#    main()

train = load_data('train.csv')
le = preprocessing.LabelEncoder()
le.fit(train['target'])
train['target'] = le.transform(train['target'])
feature_cols = [col for col in train.columns if col not in ('target', 'id')]
X_train = train[feature_cols]
y = train['target']

# Cross-validate every parameter combination of every entry from get_gb().
# Each entry is read through its 'est' (estimator) and 'grid' keys.
clfs = get_gb()
for name, spec in clfs.items():
    print(name)
    param_list = list(ParameterGrid(spec['grid']))
    for params in param_list:
        # The estimator stored under 'est' is reconfigured on each round
        # rather than rebuilt.
        clf = spec['est'].set_params(**params)
        print(clf)
        print(params)
        cross_v(clf, X_train.values, y.values)