Example #1
def main():
    preds, clfs = [], []
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    for fn in glob.glob('pickles/*.pkl'):
        print fn
        clf = joblib.load(fn)
        y_pred = clf.predict(X_val)
        preds.append(y_pred)
        clfs.append(clf)

    avgd_pred = np.average(preds, axis=0)
    meta = xgb.XGBClassifier(n_estimators=90,
                             subsample=0.6,
                             colsample_bytree=0.5,
                             learning_rate=0.05)
    predsa = np.column_stack(preds)
    Xx = np.column_stack((X_val, predsa))
    meta_pred = cross_val_predict(meta, Xx, y_val, cv=3)
    meta_auc = roc_auc_score(y_val, meta_pred)
    avg_auc = roc_auc_score(y_val, avgd_pred)
    common.compare_to_history(avg_auc, fn='validation_metrics.json')
    common.compare_to_history(meta_auc, fn='validation_metrics.json')
    meta.fit(Xx, y_val)
    yolo = (avgd_pred + meta_pred) / 2.0
    print 'yolo', roc_auc_score(y_val, yolo)

    dft = common.load_tourn_dummies()
    X_dum_cols = [c for c in dft if c[0]=='f' or c.startswith('c1_')]
    X_t = dft.loc[:, X_dum_cols]
    y_pred_t = avg_pred(clfs, X_t)
    X_m_t = np.column_stack((X_t, y_pred_t))
    y_pred_m = meta.predict_proba(X_m_t)
    out_df = pd.DataFrame(dict(probability=y_pred_m[:, 1]), index=dft.index)
    out_df.to_csv('preds_2015_12_23.csv')
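Examples #1 and #2 are the same stacking script (the second copy is just reformatted). Neither shows its imports or the project-local common module; a minimal sketch of what they appear to assume, where the file name and the 'c1'/'target' column names are guesses inferred from the c1_*/f* feature filters used further down:

import glob

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.externals import joblib  # plain `import joblib` on newer stacks
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import cross_val_predict, train_test_split  # sklearn.model_selection in 0.18+


def load_train_dummies(path='training_data.csv'):
    # Hypothetical sketch of common.load_train_dummies(): read the training
    # file, one-hot encode the categorical column into the c1_* dummies the
    # examples filter on, and hold out a validation split.
    df = pd.read_csv(path)
    df = pd.get_dummies(df, columns=['c1'])
    feature_cols = [c for c in df if c[0] == 'f' or c.startswith('c1_')]
    X = df.loc[:, feature_cols]
    y = df['target']
    return train_test_split(X, y, test_size=0.3, random_state=0)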
Example #2
def main():
    preds, clfs = [], []
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    for fn in glob.glob('pickles/*.pkl'):
        print fn
        clf = joblib.load(fn)
        y_pred = clf.predict(X_val)
        preds.append(y_pred)
        clfs.append(clf)

    avgd_pred = np.average(preds, axis=0)
    meta = xgb.XGBClassifier(n_estimators=90,
                             subsample=0.6,
                             colsample_bytree=0.5,
                             learning_rate=0.05)
    predsa = np.column_stack(preds)
    Xx = np.column_stack((X_val, predsa))
    meta_pred = cross_val_predict(meta, Xx, y_val, cv=3)
    meta_auc = roc_auc_score(y_val, meta_pred)
    avg_auc = roc_auc_score(y_val, avgd_pred)
    common.compare_to_history(avg_auc, fn='validation_metrics.json')
    common.compare_to_history(meta_auc, fn='validation_metrics.json')
    meta.fit(Xx, y_val)
    yolo = (avgd_pred + meta_pred) / 2.0
    print 'yolo', roc_auc_score(y_val, yolo)

    dft = common.load_tourn_dummies()
    X_dum_cols = [c for c in dft if c[0] == 'f' or c.startswith('c1_')]
    X_t = dft.loc[:, X_dum_cols]
    y_pred_t = avg_pred(clfs, X_t)
    X_m_t = np.column_stack((X_t, y_pred_t))
    y_pred_m = meta.predict_proba(X_m_t)
    out_df = pd.DataFrame(dict(probability=y_pred_m[:, 1]), index=dft.index)
    out_df.to_csv('preds_2015_12_23.csv')
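Both copies also lean on two helpers that are not shown: avg_pred and common.compare_to_history. For meta.predict_proba(X_m_t) to see the same number of columns it was fitted on (X_val plus one column per pickled classifier), avg_pred would have to return one prediction column per classifier rather than a single averaged column; a hypothetical sketch under that assumption:

import json

import numpy as np


def avg_pred(clfs, X):
    # One prediction column per fitted classifier, matching the layout of
    # `predsa` above; a single averaged column would give meta.predict_proba
    # the wrong feature count.
    return np.column_stack([clf.predict(X) for clf in clfs])


def compare_to_history(score, fn='validation_metrics.json'):
    # Hypothetical: append the score to a JSON log so each run can be compared
    # with earlier ones.
    try:
        with open(fn) as f:
            history = json.load(f)
    except IOError:
        history = []
    history.append(float(score))
    with open(fn, 'w') as f:
        json.dump(history, f, indent=2)
    print 'best so far:', max(history)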
Example #3
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    print X_train.shape
    clf = xgb.XGBClassifier(
            n_estimators=200,
            max_depth=5,
            colsample_bytree=0.7,
            learning_rate=0.002,
            subsample=0.1,
            seed=484313)
    clf = Pipeline([
        ('vec', MinMaxScaler()),
        ('v2', FeatureUnion([
            ('vec', FunctionTransformer()),
            ('km', KMeans(n_clusters=7)),
            ])
        ),
        ('clf', clf),
    ])
    params = dict(
            clf__n_estimators=[50, 100],
            clf__max_depth=[3, 5],
            clf__learning_rate=[0.02, 0.1],
            clf__subsample=[0.5],
    )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True)
    common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    joblib.dump(clf, 'pickles/xg.pkl')
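common.predict_and_report_val appears in most of the remaining examples but is never shown. Judging from the call sites (an estimator, a fit split, and a scoring split), it presumably fits on the first pair and reports ROC AUC on the second; a hypothetical sketch:

from sklearn.metrics import roc_auc_score


def predict_and_report_val(clf, X_fit, X_eval, y_fit, y_eval):
    # Hypothetical: fit on one split, report ROC AUC on the other, using class
    # probabilities where the estimator provides them.
    clf.fit(X_fit, y_fit)
    if hasattr(clf, 'predict_proba'):
        y_pred = clf.predict_proba(X_eval)[:, 1]
    else:
        y_pred = clf.predict(X_eval)
    auc = roc_auc_score(y_eval, y_pred)
    print 'val ROC AUC:', auc
    return auc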
Example #4
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    slr = make_pipeline(MinMaxScaler(), LogisticRegression())
    plr = make_pipeline(PCA(), LogisticRegression())
    nb_bag = BaggingClassifier(base_estimator=GaussianNB())
    clfs = (
        GaussianNB(),
        #GridSearchCV(slr, dict(logisticregression__C=[1.0, 0.8])),
        make_pipeline(PCA(), GaussianNB()),
        GridSearchCV(plr,
                     dict(pca__n_components=[None, 3, 8],
                          logisticregression__C=[1.0, 0.7]),
                     scoring='roc_auc'),
        GridSearchCV(nb_bag,
                     dict(max_samples=[0.2, 0.4, 0.6], max_features=[0.3,
                                                                     0.7]),
                     scoring='roc_auc'),
        xgb.XGBClassifier(n_estimators=20,
                          max_depth=3,
                          colsample_bytree=0.7,
                          subsample=0.6,
                          learning_rate=0.1),
        #make_pipeline(KMeans(), GaussianNB()),
        #GridSearchCV(
        #    BaggingClassifier(),
        #    dict(base_estimator=[None, GaussianNB(), LogisticRegression()],
        #        n_estimators=[7, 10, 14],
        #        max_samples=[0.3, 0.6])),
        #GridSearchCV(xgb.XGBClassifier(), dict(n_estimators=[2, 3, 4], learning_rate=[0.01, 0.1], subsample=[0.5, 0.9])),
        #BaggingClassifier(base_estimator=SVC(), max_features=0.8, max_samples=2500, n_estimators=5),
    )
    preds = []
    for clf in clfs:
        print clf
        clf.fit(X_train, y_train)
        val_pred = clf.predict(X_val)
        print roc_auc_score(y_val, val_pred)
        clf.fit(X_val, y_val)
        train_pred = clf.predict(X_train)
        preds.append(np.concatenate((train_pred, val_pred)))
        print roc_auc_score(y_train, train_pred)
        print

    y_all = np.concatenate((y_train, y_val))
    preds = np.column_stack(preds)
    gm = gmean(preds, axis=1)
    hm = hmean(preds + 1, axis=1)
    preds = np.column_stack((preds, gm, hm))
    print 'GM', roc_auc_score(y_all, gm)
    print 'HM', roc_auc_score(y_all, hm)
    meta = GaussianNB()
    meta = GridSearchCV(xgb.XGBClassifier(),
                        dict(max_depth=[2, 3, 4],
                             learning_rate=[0.01, 0.05, 0.1],
                             n_estimators=[20, 40, 60]),
                        scoring='roc_auc')
    meta.fit(preds, y_all)
    scores = cross_val_score(meta, preds, y_all, scoring='roc_auc', cv=5)
    print scores
    print scores.mean()
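Examples #4 and #16 are the same two-fold stacking experiment: each half of the data is predicted by models fitted on the other half, so the concatenated meta-features are out-of-sample. Note, though, that the base predictions are hard 0/1 labels from predict(), which throw away the ranking information ROC AUC and the geometric/harmonic-mean blends are meant to exploit. A possible variation of the inner loop (same variable names as above; every classifier in clfs supports predict_proba):

    for clf in clfs:
        clf.fit(X_train, y_train)
        val_pred = clf.predict_proba(X_val)[:, 1]  # P(class 1) instead of a hard label
        clf.fit(X_val, y_val)
        train_pred = clf.predict_proba(X_train)[:, 1]
        preds.append(np.concatenate((train_pred, val_pred)))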
Example #5
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    print X_train.shape
    clf = xgb.XGBClassifier(n_estimators=200,
                            max_depth=5,
                            colsample_bytree=0.7,
                            learning_rate=0.002,
                            subsample=0.1,
                            seed=484313)
    clf = Pipeline([
        ('vec', MinMaxScaler()),
        ('v2',
         FeatureUnion([
             ('vec', FunctionTransformer()),
             ('km', KMeans(n_clusters=7)),
         ])),
        ('clf', clf),
    ])
    params = dict(
        clf__n_estimators=[50, 100],
        clf__max_depth=[3, 5],
        clf__learning_rate=[0.02, 0.1],
        clf__subsample=[0.5],
    )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True)
    common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    joblib.dump(clf, 'pickles/xg.pkl')
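Examples #3 and #5 are the same grid search. The FeatureUnion works because KMeans doubles as a transformer: its transform() returns each row's distance to the n_clusters centroids, so the XGB step sees the MinMax-scaled features passed through unchanged by FunctionTransformer plus seven cluster-distance columns. A quick shape check on toy data (array sizes are arbitrary):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

X_toy = np.random.rand(100, 21)
union = FeatureUnion([('vec', FunctionTransformer()),
                      ('km', KMeans(n_clusters=7))])
print union.fit_transform(X_toy).shape  # (100, 21 + 7)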
Example #6
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    clf = make_pipeline(KMeans(), GaussianNB())
    clf = Pipeline([('km', KMeans()), ('clf', xgb.XGBClassifier())])
    params = dict(km__n_clusters=[7, 10, 20], clf__n_estimators=[15, 30])
    clf = GridSearchCV(clf, params, verbose=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)
    print roc_auc_score(y_val, y_pred)
    print clf.best_params_
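Two small notes on Example #6: the make_pipeline(KMeans(), GaussianNB()) assignment is immediately overwritten, so only the KMeans-into-XGB pipeline is actually searched, and this is the one grid search in the set left at GridSearchCV's default scoring (the classifier's accuracy) rather than ROC AUC. If AUC is the metric of interest here too, the search would presumably be built as:

clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True)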
Example #7
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    est = LogisticRegression()
    clf = Pipeline([('vec', PolynomialFeatures(interaction_only=True)),
                    ('pca', PCA()), ('clf', est)])
    params = dict(pca__n_components=[None, 7, 15], )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5)
    common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    print clf.best_params_
    fn = 'pca_lr_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
Example #8
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    clf = Pipeline([
        ('pca', PCA()),
        ('clf', GaussianNB())
    ])
    params = dict(
            pca__n_components=[None, 2, 3, 4, 7, 9],
    )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True)
    common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
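PCA in front of GaussianNB (here and in Example #21) is a sensible pairing: the principal components are uncorrelated, which sits closer to the naive-Bayes independence assumption than the raw feature and dummy columns do. If the grid were extended, whitening is the other PCA knob worth trying, e.g.:

params = dict(
    pca__n_components=[None, 2, 3, 4, 7, 9],
    pca__whiten=[True, False],
)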
Example #9
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    train_clfs = [(x, 0.7) for x in fitto(X_train, y_train)]
    val_clfs = [(x, 0.3) for x in fitto(X_val, y_val)]
    val_aucs = evalshow(train_clfs, X_val, y_val)
    print
    train_aucs = evalshow(val_clfs, X_train, y_train)
    print
    print 'train_aucs', max(train_aucs)
    print 'val_aucs', max(val_aucs)
    predict_tourn(train_clfs+val_clfs)
Example #10
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    train_clfs = [(x, 0.7) for x in fitto(X_train, y_train)]
    val_clfs = [(x, 0.3) for x in fitto(X_val, y_val)]
    val_aucs = evalshow(train_clfs, X_val, y_val)
    print
    train_aucs = evalshow(val_clfs, X_train, y_train)
    print
    print 'train_aucs', max(train_aucs)
    print 'val_aucs', max(val_aucs)
    predict_tourn(train_clfs + val_clfs)
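Examples #9 and #10 are the same script. fitto, evalshow and predict_tourn are project helpers that are not shown; from the call sites, fitto apparently fits a pool of classifiers on one split, evalshow reports each one's ROC AUC on the other split, and predict_tourn blends the (classifier, weight) pairs over the tournament data, with the 0.7/0.3 weights presumably favouring the train-fitted pool. A hypothetical sketch of the first two:

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import GaussianNB


def fitto(X, y):
    # Hypothetical: fit a small pool of classifiers on one split.
    clfs = [GaussianNB(), LogisticRegression(), xgb.XGBClassifier(n_estimators=50)]
    return [clf.fit(X, y) for clf in clfs]


def evalshow(weighted_clfs, X, y):
    # Hypothetical: report and collect ROC AUC for each (classifier, weight)
    # pair on the held-out split.
    aucs = []
    for clf, weight in weighted_clfs:
        auc = roc_auc_score(y, clf.predict(X))
        print clf.__class__.__name__, weight, auc
        aucs.append(auc)
    return aucs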
Example #11
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    params = dict(
        clf__n_neighbors=[1, 3, 5, 7],
        clf__weights=['distance', 'uniform'],
    )
    clf = Pipeline([('vec', MinMaxScaler()), ('clf', KNeighborsClassifier())])
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3)
    # note: the live call fits on the validation half and reports on the
    # training half (hence the *_val pickle name); the usual order is kept
    # commented out for reference
    #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    common.predict_and_report_val(clf, X_val, X_train, y_val, y_train)
    print clf.best_params_
    fn = 'knn_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
Example #12
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    params = dict(
            n_estimators=[50, 100, 150],
            max_depth=[None, 3, 5, 6],
    )
    clf = RandomForestClassifier()
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3)
    #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    common.predict_and_report_val(clf, X_val, X_train, y_val, y_train)
    print clf.best_params_
    fn = 'rf2_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
Example #13
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    params = dict(
        n_estimators=[50, 100, 150],
        max_depth=[None, 3, 5, 6],
    )
    clf = RandomForestClassifier()
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3)
    #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    common.predict_and_report_val(clf, X_val, X_train, y_val, y_train)
    print clf.best_params_
    fn = 'rf2_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
Example #14
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    params = dict(
        clf__max_samples=[2000, 6000],
        clf__base_estimator__C=[1.5, 1.2, 1.0],
    )
    est = BaggingClassifier(base_estimator=SVC(), n_estimators=3)
    clf = Pipeline([('vec', StandardScaler()), ('pca', PCA()), ('clf', est)])
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3)
    #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    common.predict_and_report_val(clf, X_val, X_train, y_val, y_train)
    print clf.best_params_
    fn = 'svm_hack_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
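Examples #14 and #18 are the same script. clf__max_samples here is an absolute row count (2000 or 6000 rows per bag), which is what keeps the SVC fits tractable, and clf__base_estimator__C reaches through the bagging wrapper to the underlying SVC. Because SVC is built without probability estimates, the reported AUC comes from hard labels; if probabilities are wanted instead, the estimator could be constructed as:

from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

est = BaggingClassifier(base_estimator=SVC(probability=True), n_estimators=3)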
Example #15
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    est = LogisticRegression()
    clf = Pipeline([
        ('vec', PolynomialFeatures(interaction_only=True)),
        ('pca', PCA()),
        ('clf', est)
    ])
    params = dict(
            pca__n_components=[None, 7, 15],
    )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5)
    common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    print clf.best_params_
    fn = 'pca_lr_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
Example #16
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    slr = make_pipeline(MinMaxScaler(), LogisticRegression())
    plr = make_pipeline(PCA(), LogisticRegression())
    nb_bag = BaggingClassifier(base_estimator=GaussianNB())
    clfs = (
            GaussianNB(),
            #GridSearchCV(slr, dict(logisticregression__C=[1.0, 0.8])),
            make_pipeline(PCA(), GaussianNB()),
            GridSearchCV(plr,
                         dict(pca__n_components=[None, 3, 8],
                              logisticregression__C=[1.0, 0.7]),
                         scoring='roc_auc'),
            GridSearchCV(nb_bag,
                         dict(max_samples=[0.2, 0.4, 0.6],
                              max_features=[0.3, 0.7]),
                         scoring='roc_auc'),
            xgb.XGBClassifier(n_estimators=20,
                              max_depth=3,
                              colsample_bytree=0.7,
                              subsample=0.6,
                              learning_rate=0.1),
            #make_pipeline(KMeans(), GaussianNB()),
            #GridSearchCV(
            #    BaggingClassifier(),
            #    dict(base_estimator=[None, GaussianNB(), LogisticRegression()],
            #        n_estimators=[7, 10, 14],
            #        max_samples=[0.3, 0.6])),
            #GridSearchCV(xgb.XGBClassifier(), dict(n_estimators=[2, 3, 4], learning_rate=[0.01, 0.1], subsample=[0.5, 0.9])),
            #BaggingClassifier(base_estimator=SVC(), max_features=0.8, max_samples=2500, n_estimators=5),
    )
    preds = []
    for clf in clfs:
        print clf
        clf.fit(X_train, y_train)
        val_pred = clf.predict(X_val)
        print roc_auc_score(y_val, val_pred)
        clf.fit(X_val, y_val)
        train_pred = clf.predict(X_train)
        preds.append(np.concatenate((train_pred, val_pred)))
        print roc_auc_score(y_train, train_pred)
        print

    y_all = np.concatenate((y_train, y_val))
    preds = np.column_stack(preds)
    gm = gmean(preds, axis=1)
    hm = hmean(preds+1, axis=1)
    preds = np.column_stack((preds, gm, hm))
    print 'GM', roc_auc_score(y_all, gm)
    print 'HM', roc_auc_score(y_all, hm)
    meta = GaussianNB()
    meta = GridSearchCV(xgb.XGBClassifier(),
                        dict(max_depth=[2, 3, 4],
                             learning_rate=[0.01, 0.05, 0.1],
                             n_estimators=[20, 40, 60]),
                        scoring='roc_auc')
    meta.fit(preds, y_all)
    scores = cross_val_score(meta, preds, y_all, scoring='roc_auc', cv=5)
    print scores
    print scores.mean()
Example #17
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    est = BaggingClassifier(base_estimator=GaussianNB())
    clf = Pipeline([('pca', RandomizedPCA()), ('clf', est)])
    params = dict(
        pca__n_components=[None, 4, 7, 9],
        pca__whiten=[True, False],
        clf__max_samples=[0.9],
        clf__max_features=[0.5, 0.9, 1.0],
        clf__bootstrap=[False],
        clf__n_estimators=[10, 15, 25],
    )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5)
    #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    common.predict_and_report_val(clf, X_val, X_train, y_val, y_train)
    print clf.best_params_
    fn = 'rpca_pca_hack_2_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
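Examples #17 and #20 are the same script. RandomizedPCA was deprecated in scikit-learn 0.18 and removed in later releases; on current versions the equivalent pipeline head is PCA with the randomized SVD solver, and the pca__n_components / pca__whiten grid keys carry over unchanged:

from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

est = BaggingClassifier(base_estimator=GaussianNB())
clf = Pipeline([('pca', PCA(svd_solver='randomized')), ('clf', est)])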
Example #18
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    params = dict(
            clf__max_samples=[2000, 6000],
            clf__base_estimator__C=[1.5, 1.2, 1.0],
    )
    est = BaggingClassifier(base_estimator=SVC(), n_estimators=3)
    clf = Pipeline([
        ('vec', StandardScaler()),
        ('pca', PCA()),
        ('clf', est)
    ])
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=3)
    #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    common.predict_and_report_val(clf, X_val, X_train, y_val, y_train)
    print clf.best_params_
    fn = 'svm_hack_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
Example #19
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    clf = Pipeline([
        ('pca', PCA()),
        ('clf', StackingClassifier())
    ])
    clf.fit(X_train, y_train)
    fn = 'stack'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
    y_pred = clf.predict(X_val)
    print y_val[:10]
    print y_pred[:10]
    print roc_auc_score(y_val, y_pred)
    hack = joblib.load('pickles/pca_hack_2.pkl')
    h2 = joblib.load('pickles/rpca_pca_hack_2.pkl')
    y_hack = hack.predict(X_val)
    y_h2 = h2.predict(X_val)
    duh = (y_pred + y_hack + y_h2) / 3.0
    print 'duh auc', roc_auc_score(y_val, duh)
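Example #19 constructs StackingClassifier() with no arguments. scikit-learn had no stacking estimator at the time (its own StackingClassifier arrived much later and requires an estimators list), so this is presumably a project-local or third-party class that is not shown. A hypothetical minimal stand-in, just to make the shape of such a class concrete (base and meta estimators chosen arbitrarily, and no out-of-fold predictions, so the meta model sees optimistic inputs):

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


class StackingClassifier(BaseEstimator, ClassifierMixin):
    # Hypothetical stand-in: fit each base model, then fit a meta model on
    # their predicted probabilities.
    def __init__(self, base_estimators=None, meta_estimator=None):
        self.base_estimators = base_estimators
        self.meta_estimator = meta_estimator

    def _meta_features(self, X):
        return np.column_stack([est.predict_proba(X)[:, 1]
                                for est in self.fitted_])

    def fit(self, X, y):
        base = self.base_estimators or [GaussianNB(), LogisticRegression()]
        self.fitted_ = [clone(est).fit(X, y) for est in base]
        meta = self.meta_estimator or LogisticRegression()
        self.meta_ = clone(meta).fit(self._meta_features(X), y)
        return self

    def predict(self, X):
        return self.meta_.predict(self._meta_features(X))

    def predict_proba(self, X):
        return self.meta_.predict_proba(self._meta_features(X))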
Example #20
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    est = BaggingClassifier(base_estimator=GaussianNB())
    clf = Pipeline([
        ('pca', RandomizedPCA()),
        ('clf', est)
    ])
    params = dict(
            pca__n_components=[None, 4, 7, 9],
            pca__whiten=[True, False],
            clf__max_samples=[0.9],
            clf__max_features=[0.5, 0.9, 1.0],
            clf__bootstrap=[False],
            clf__n_estimators=[10, 15, 25],
    )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True, cv=5)
    #common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)
    common.predict_and_report_val(clf, X_val, X_train, y_val, y_train)
    print clf.best_params_
    fn = 'rpca_pca_hack_2_val'
    joblib.dump(clf, 'pickles/%s.pkl' % (fn))
Example #21
def main():
    X_train, X_val, y_train, y_val = common.load_train_dummies()
    clf = Pipeline([('pca', PCA()), ('clf', GaussianNB())])
    params = dict(pca__n_components=[None, 2, 3, 4, 7, 9], )
    clf = GridSearchCV(clf, params, scoring='roc_auc', verbose=True)
    common.predict_and_report_val(clf, X_train, X_val, y_train, y_val)