def main(): preds, clfs = [], [] X_train, X_val, y_train, y_val = common.load_train_dummies() for fn in glob.glob('pickles/*.pkl'): print fn clf = joblib.load(fn) y_pred = clf.predict(X_val) preds.append(y_pred) clfs.append(clf) avgd_pred = np.average(preds, axis=0) meta = xgb.XGBClassifier(n_estimators=90, subsample=0.6, colsample_bytree=0.5, learning_rate=0.05) predsa = np.column_stack(preds) Xx = np.column_stack((X_val, predsa)) meta_pred = cross_val_predict(meta, Xx, y_val, cv=3) meta_auc = roc_auc_score(y_val, meta_pred) avg_auc = roc_auc_score(y_val, avgd_pred) common.compare_to_history(avg_auc, fn='validation_metrics.json') common.compare_to_history(meta_auc.mean(), fn='validation_metrics.json') meta.fit(Xx, y_val) yolo = (avgd_pred + meta_pred) / 2.0 print 'yolo', roc_auc_score(y_val, yolo) dft = common.load_tourn_dummies() X_dum_cols = [c for c in dft if c[0]=='f' or c.startswith('c1_')] X_t = dft.loc[:, X_dum_cols] y_pred_t = avg_pred(clfs, X_t) X_m_t = np.column_stack((X_t, y_pred_t)) y_pred_m = meta.predict_proba(X_m_t) out_df = pd.DataFrame(dict(probability=y_pred_m[:, 1]), index=dft.index) out_df.to_csv('preds_2015_12_23.csv')
def main(): preds, clfs = [], [] X_train, X_val, y_train, y_val = common.load_train_dummies() for fn in glob.glob('pickles/*.pkl'): print fn clf = joblib.load(fn) y_pred = clf.predict(X_val) preds.append(y_pred) clfs.append(clf) avgd_pred = np.average(preds, axis=0) meta = xgb.XGBClassifier(n_estimators=90, subsample=0.6, colsample_bytree=0.5, learning_rate=0.05) predsa = np.column_stack(preds) Xx = np.column_stack((X_val, predsa)) meta_pred = cross_val_predict(meta, Xx, y_val, cv=3) meta_auc = roc_auc_score(y_val, meta_pred) avg_auc = roc_auc_score(y_val, avgd_pred) common.compare_to_history(avg_auc, fn='validation_metrics.json') common.compare_to_history(meta_auc.mean(), fn='validation_metrics.json') meta.fit(Xx, y_val) yolo = (avgd_pred + meta_pred) / 2.0 print 'yolo', roc_auc_score(y_val, yolo) dft = common.load_tourn_dummies() X_dum_cols = [c for c in dft if c[0] == 'f' or c.startswith('c1_')] X_t = dft.loc[:, X_dum_cols] y_pred_t = avg_pred(clfs, X_t) X_m_t = np.column_stack((X_t, y_pred_t)) y_pred_m = meta.predict_proba(X_m_t) out_df = pd.DataFrame(dict(probability=y_pred_m[:, 1]), index=dft.index) out_df.to_csv('preds_2015_12_23.csv')
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() print X_train.shape clf = xgb.XGBClassifier( n_estimators=200, max_depth=5, colsample_bytree=0.7, learning_rate=0.002, subsample=0.1, seed=484313) clf = Pipeline([ ('vec', MinMaxScaler()), ('v2', FeatureUnion([ ('vec', FunctionTransformer()), ('km', KMeans(n_clusters=7)), ]) ), ('clf', clf), ]) params = dict( clf__n_estimators=[50, 100], clf__max_depth=[3, 5], clf__learning_rate=[0.02, 0.1], clf__subsample=[0.5], ) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True) common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) joblib.dump(clf, 'pickles/xg.pkl')
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() slr = make_pipeline(MinMaxScaler(), LogisticRegression()) plr = make_pipeline(PCA(), LogisticRegression()) nb_bag = BaggingClassifier(base_estimator=GaussianNB()) clfs = ( GaussianNB(), #GridSearchCV(slr, dict(logisticregression__C=[1.0, 0.8])), make_pipeline(PCA(), GaussianNB()), GridSearchCV(plr, dict(pca__n_components=[None, 3, 8], logisticregression__C=[1.0, 0.7]), scoring='roc_auc'), GridSearchCV(nb_bag, dict(max_samples=[0.2, 0.4, 0.6], max_features=[0.3, 0.7]), scoring='roc_auc'), xgb.XGBClassifier(n_estimators=20, max_depth=3, colsample_bytree=0.7, subsample=0.6, learning_rate=0.1), #make_pipeline(KMeans(), GaussianNB()), #GridSearchCV( # BaggingClassifier(), # dict(base_estimator=[None, GaussianNB(), LogisticRegression()], # n_estimators=[7, 10, 14], # max_samples=[0.3, 0.6])), #GridSearchCV(xgb.XGBClassifier(), dict(n_estimators=[2, 3, 4], learning_rate=[0.01, 0.1], subsample=[0.5, 0.9])), #BaggingClassifier(base_estimator=SVC(), max_features=0.8, max_samples=2500, n_estimators=5), ) preds = [] for clf in clfs: print clf clf.fit(X_train, y_train) val_pred = clf.predict(X_val) print roc_auc_score(y_val, val_pred) clf.fit(X_val, y_val) train_pred = clf.predict(X_train) preds.append(np.concatenate((train_pred, val_pred))) print roc_auc_score(y_train, train_pred) print y_all = np.concatenate((y_train, y_val)) preds = np.column_stack(preds) gm = gmean(preds, axis=1) hm = hmean(preds + 1, axis=1) preds = np.column_stack((preds, gm, hm)) print 'GM', roc_auc_score(y_all, gm) print 'HM', roc_auc_score(y_all, hm) meta = GaussianNB() meta = GridSearchCV(xgb.XGBClassifier(), dict(max_depth=[2, 3, 4], learning_rate=[0.01, 0.05, 0.1], n_estimators=[20, 40, 60]), scoring='roc_auc') meta.fit(preds, y_all) scores = cross_val_score(meta, preds, y_all, scoring='roc_auc', cv=5) print scores print scores.mean()
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() print X_train.shape clf = xgb.XGBClassifier(n_estimators=200, max_depth=5, colsample_bytree=0.7, learning_rate=0.002, subsample=0.1, seed=484313) clf = Pipeline([ ('vec', MinMaxScaler()), ('v2', FeatureUnion([ ('vec', FunctionTransformer()), ('km', KMeans(n_clusters=7)), ])), ('clf', clf), ]) params = dict( clf__n_estimators=[50, 100], clf__max_depth=[3, 5], clf__learning_rate=[0.02, 0.1], clf__subsample=[0.5], ) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True) common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) joblib.dump(clf, 'pickles/xg.pkl')
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() clf = make_pipeline(KMeans(), GaussianNB()) clf = Pipeline([('km', KMeans()), ('clf', xgb.XGBClassifier())]) params = dict(km__n_clusters=[7, 10, 20], clf__n_estimators=[15, 30]) clf = GridSearchCV(clf, params, verbose=True) clf.fit(X_train, y_train) y_pred = clf.predict(X_val) print roc_auc_score(y_val, y_pred) print clf.best_params_
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() est = LogisticRegression() clf = Pipeline([('vec', PolynomialFeatures(interaction_only=True)), ('pca', PCA()), ('clf', est)]) params = dict(pca__n_components=[None, 7, 15], ) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5) common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) print clf.best_params_ fn = 'pca_lr_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main():
    """Tune the number of PCA components feeding Gaussian naive Bayes;
    print the validation report via the shared helper."""
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    pipeline = Pipeline([('pca', PCA()), ('clf', GaussianNB())])
    grid = dict(pca__n_components=[None, 2, 3, 4, 7, 9])
    search = GridSearchCV(pipeline, grid, scoring='roc_auc', verbose=True)
    common.predict_and_report_val(search, X_train, X_val, y_train, y_val)
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() train_clfs = [(x, 0.7) for x in fitto(X_train, y_train)] val_clfs = [(x, 0.3) for x in fitto(X_val, y_val)] val_aucs = evalshow(train_clfs, X_val, y_val) print train_aucs = evalshow(val_clfs, X_train, y_train) print print 'train_aucs', max(train_aucs) print 'val_aucs', max(val_aucs) predict_tourn(train_clfs+val_clfs)
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() train_clfs = [(x, 0.7) for x in fitto(X_train, y_train)] val_clfs = [(x, 0.3) for x in fitto(X_val, y_val)] val_aucs = evalshow(train_clfs, X_val, y_val) print train_aucs = evalshow(val_clfs, X_train, y_train) print print 'train_aucs', max(train_aucs) print 'val_aucs', max(val_aucs) predict_tourn(train_clfs + val_clfs)
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() params = dict( clf__n_neighbors=[1, 3, 5, 7], clf__weights=['distance', 'uniform'], ) clf = Pipeline([('vec', MinMaxScaler()), ('clf', KNeighborsClassifier())]) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3) #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) common.predict_and_report_val(clf, X_val, X_train, y_val, y_train) print clf.best_params_ fn = 'knn_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() params = dict( n_estimators=[50, 100, 150], max_depth=[None, 3, 5, 6], ) clf = RandomForestClassifier() clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3) #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) common.predict_and_report_val(clf, X_val, X_train, y_val, y_train) print clf.best_params_ fn = 'rf2_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() params = dict( clf__max_samples=[2000, 6000], clf__base_estimator__C=[1.5, 1.2, 1.0], ) est = BaggingClassifier(base_estimator=SVC(), n_estimators=3) clf = Pipeline([('vec', StandardScaler()), ('pca', PCA()), ('clf', est)]) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3) #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) common.predict_and_report_val(clf, X_val, X_train, y_val, y_train) print clf.best_params_ fn = 'svm_hack_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() est = LogisticRegression() clf = Pipeline([ ('vec', PolynomialFeatures(interaction_only=True)), ('pca', PCA()), ('clf', est) ]) params = dict( pca__n_components=[None, 7, 15], ) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5) common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) print clf.best_params_ fn = 'pca_lr_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() slr = make_pipeline(MinMaxScaler(), LogisticRegression()) plr = make_pipeline(PCA(), LogisticRegression()) nb_bag = BaggingClassifier(base_estimator=GaussianNB()) clfs = ( GaussianNB(), #GridSearchCV(slr, dict(logisticregression__C=[1.0, 0.8])), make_pipeline(PCA(), GaussianNB()), GridSearchCV(plr, dict(pca__n_components=[None, 3, 8], logisticregression__C=[1.0, 0.7]), scoring='roc_auc'), GridSearchCV(nb_bag, dict(max_samples=[0.2, 0.4, 0.6], max_features=[0.3, 0.7]), scoring='roc_auc'), xgb.XGBClassifier(n_estimators=20, max_depth=3, colsample_bytree=0.7, subsample=0.6, learning_rate=0.1), #make_pipeline(KMeans(), GaussianNB()), #GridSearchCV( # BaggingClassifier(), # dict(base_estimator=[None, GaussianNB(), LogisticRegression()], # n_estimators=[7, 10, 14], # max_samples=[0.3, 0.6])), #GridSearchCV(xgb.XGBClassifier(), dict(n_estimators=[2, 3, 4], learning_rate=[0.01, 0.1], subsample=[0.5, 0.9])), #BaggingClassifier(base_estimator=SVC(), max_features=0.8, max_samples=2500, n_estimators=5), ) preds = [] for clf in clfs: print clf clf.fit(X_train, y_train) val_pred = clf.predict(X_val) print roc_auc_score(y_val, val_pred) clf.fit(X_val, y_val) train_pred = clf.predict(X_train) preds.append(np.concatenate((train_pred, val_pred))) print roc_auc_score(y_train, train_pred) print y_all = np.concatenate((y_train, y_val)) preds = np.column_stack(preds) gm = gmean(preds, axis=1) hm = hmean(preds+1, axis=1) preds = np.column_stack((preds, gm, hm)) print 'GM', roc_auc_score(y_all, gm) print 'HM', roc_auc_score(y_all, hm) meta = GaussianNB() meta = GridSearchCV(xgb.XGBClassifier(), dict(max_depth=[2, 3, 4], learning_rate=[0.01, 0.05, 0.1], n_estimators=[20, 40, 60]), scoring='roc_auc') meta.fit(preds, y_all) scores = cross_val_score(meta, preds, y_all, scoring='roc_auc', cv=5) print scores print scores.mean()
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() est = BaggingClassifier(base_estimator=GaussianNB()) clf = Pipeline([('pca', RandomizedPCA()), ('clf', est)]) params = dict( pca__n_components=[None, 4, 7, 9], pca__whiten=[True, False], clf__max_samples=[0.9], clf__max_features=[0.5, 0.9, 1.0], clf__bootstrap=[False], clf__n_estimators=[10, 15, 25], ) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5) #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) common.predict_and_report_val(clf, X_val, X_train, y_val, y_train) print clf.best_params_ fn = 'rpca_pca_hack_2_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() params = dict( clf__max_samples=[2000, 6000], clf__base_estimator__C=[1.5, 1.2, 1.0], ) est = BaggingClassifier(base_estimator=SVC(), n_estimators=3) clf = Pipeline([ ('vec', StandardScaler()), ('pca', PCA()), ('clf', est) ]) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3) #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) common.predict_and_report_val(clf, X_val, X_train, y_val, y_train) print clf.best_params_ fn = 'svm_hack_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() clf = Pipeline([ ('pca', PCA()), ('clf', StackingClassifier()) ]) clf.fit(X_train, y_train) fn = 'stack' joblib.dump(clf, 'pickles/%s.pkl' % (fn)) y_pred = clf.predict(X_val) print y_val[:10] print y_pred[:10] print roc_auc_score(y_val, y_pred) hack = joblib.load('pickles/pca_hack_2.pkl') h2 = joblib.load('pickles/rpca_pca_hack_2.pkl') y_hack = hack.predict(X_val) y_h2 = h2.predict(X_val) duh = (y_pred + y_hack + y_h2) / 3.0 print 'duh auc', roc_auc_score(y_val, duh)
def main(): X_train, X_val, y_train, y_val = common.load_train_dummies() est = BaggingClassifier(base_estimator=GaussianNB()) clf = Pipeline([ ('pca', RandomizedPCA()), ('clf', est) ]) params = dict( pca__n_components=[None, 4, 7, 9], pca__whiten=[True, False], clf__max_samples=[0.9], clf__max_features=[0.5, 0.9, 1.0], clf__bootstrap=[False], clf__n_estimators=[10, 15, 25], ) clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5) #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val) common.predict_and_report_val(clf, X_val, X_train, y_val, y_train) print clf.best_params_ fn = 'rpca_pca_hack_2_val' joblib.dump(clf, 'pickles/%s.pkl' % (fn))
def main():
    """Grid-search PCA width ahead of a Gaussian naive Bayes classifier
    and print the validation report."""
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    model = Pipeline([('pca', PCA()), ('clf', GaussianNB())])
    search = GridSearchCV(model,
                          {'pca__n_components': [None, 2, 3, 4, 7, 9]},
                          scoring='roc_auc', verbose=True)
    common.predict_and_report_val(search, X_train, X_val, y_train, y_val)