# Example 1
def train_nusvc(df_train, df_test, n_splits=25, pca=False):
    """Train one NuSVC per 'wheezy-copper-turtle-magic' value.

    For each of the 512 magic values, subsets the data, reduces the feature
    space (PCA or variance threshold), and fits a polynomial-kernel NuSVC
    with stratified K-fold CV.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training frame containing 'id', 'target' and
        'wheezy-copper-turtle-magic' alongside the feature columns.
    df_test : pd.DataFrame
        Test frame with the same feature columns.
    n_splits : int, default 25
        Number of stratified CV folds per subset.
    pca : bool, default False
        If True, reduce with PCA (n_components='mle', fit on train+test of
        the subset) then standard-scale; otherwise keep only features whose
        variance exceeds 1.5.

    Returns
    -------
    tuple of np.ndarray
        (oof, preds): out-of-fold probabilities aligned with df_train and
        fold-averaged test probabilities aligned with df_test.
    """
    train = df_train.copy()
    test = df_test.copy()

    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    cols = [
        c for c in train.columns
        if c not in ['id', 'target', 'wheezy-copper-turtle-magic']
    ]

    for i in range(512):
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        # Skip magic values absent from the training data: StratifiedKFold
        # raises on an empty subset (same guard as train_all uses).
        if len(train2) == 0:
            continue
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)

        if pca:
            # Fit PCA on the combined train+test rows of this subset.
            data = pd.concat(
                [pd.DataFrame(train2[cols]),
                 pd.DataFrame(test2[cols])])
            data2 = StandardScaler().fit_transform(
                PCA(svd_solver='full',
                    n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
        else:
            # Keep only high-variance (informative) features.
            sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
            train3 = sel.transform(train2[cols])
            test3 = sel.transform(test2[cols])

        skf = StratifiedKFold(n_splits=n_splits, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):

            clf = Pipeline([('scaler', StandardScaler()),
                            ('svn',
                             NuSVC(probability=True,
                                   kernel='poly',
                                   degree=4,
                                   gamma='auto',
                                   random_state=745,
                                   nu=0.59,
                                   coef0=0.053))])

            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            # Average the fold models' test predictions.
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    ut.report_oof(df_train, oof)

    return oof, preds
def train_svc(df_train, df_test):
    """Train one polynomial-kernel SVC per 'wheezy-copper-turtle-magic' value.

    Subsets the data by magic value, keeps high-variance features, and fits a
    scaled SVC with 25-fold stratified CV per subset.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training frame containing 'id', 'target' and
        'wheezy-copper-turtle-magic' alongside the feature columns.
    df_test : pd.DataFrame
        Test frame with the same feature columns.

    Returns
    -------
    tuple of np.ndarray
        (oof, preds): out-of-fold probabilities aligned with df_train and
        fold-averaged test probabilities aligned with df_test.
    """
    train = df_train.copy()
    test = df_test.copy()

    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    cols = [
        c for c in train.columns
        if c not in ['id', 'target', 'wheezy-copper-turtle-magic']
    ]

    for i in range(512):
        # .copy() so the in-place reset_index below does not mutate a view
        # of `train`/`test` (avoids SettingWithCopyWarning; consistent with
        # train_nusvc / train_all).
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        # Skip magic values with no training rows: StratifiedKFold raises
        # on an empty subset.
        if len(train2) == 0:
            continue
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)

        # Keep only high-variance (informative) features.
        sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])

        skf = StratifiedKFold(n_splits=25, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):

            clf = Pipeline([('scaler', StandardScaler()),
                            ('svc',
                             SVC(probability=True,
                                 kernel='poly',
                                 degree=4,
                                 gamma='auto'))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            # Average the fold models' test predictions.
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

        # Progress indicator every 25 magic values.
        if i % 25 == 0:
            print(i)

    ut.report_oof(df_train, oof)

    return oof, preds
def train_logit(df_train, df_test):
    """Train one L1 logistic regression per 'wheezy-copper-turtle-magic' value.

    NOTE(review): this definition is shadowed by the later ``train_logit``
    in this module, which generalizes it with ``C`` and ``pca`` parameters;
    consider removing this one.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training frame containing 'id', 'target' and
        'wheezy-copper-turtle-magic' alongside the feature columns.
    df_test : pd.DataFrame
        Test frame with the same feature columns.

    Returns
    -------
    tuple of np.ndarray
        (oof, preds): out-of-fold probabilities aligned with df_train and
        fold-averaged test probabilities aligned with df_test.
    """
    train = df_train.copy()
    test = df_test.copy()

    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    cols = [
        c for c in train.columns
        if c not in ['id', 'target', 'wheezy-copper-turtle-magic']
    ]

    for i in range(512):
        # .copy() so the in-place reset_index below does not mutate a view
        # of `train`/`test` (avoids SettingWithCopyWarning).
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        # Skip magic values with no training rows: StratifiedKFold raises
        # on an empty subset.
        if len(train2) == 0:
            continue
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)

        # Keep only high-variance (informative) features.
        sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
        train3 = sel.transform(train2[cols])
        test3 = sel.transform(test2[cols])

        skf = StratifiedKFold(n_splits=25, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):

            clf = Pipeline([('scaler', StandardScaler()),
                            ('logit',
                             LogisticRegression(solver='saga',
                                                penalty='l1',
                                                C=1))])

            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])

            oof[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            # Average the fold models' test predictions.
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    ut.report_oof(df_train, oof)

    return oof, preds
def train_logit(df_train, df_test, C=1, pca=False):
    """Train one L1 logistic regression per 'wheezy-copper-turtle-magic' value.

    Redefines (and supersedes) the earlier ``train_logit`` in this module,
    generalizing it with a tunable ``C`` and optional PCA reduction.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training frame containing 'id', 'target' and
        'wheezy-copper-turtle-magic' alongside the feature columns.
    df_test : pd.DataFrame
        Test frame with the same feature columns.
    C : float, default 1
        Inverse L1 regularization strength.
    pca : bool, default False
        If True, reduce with PCA (n_components='mle', fit on train+test of
        the subset) then standard-scale; otherwise keep only features whose
        variance exceeds 1.5.

    Returns
    -------
    tuple of np.ndarray
        (oof, preds): out-of-fold probabilities aligned with df_train and
        fold-averaged test probabilities aligned with df_test.
    """
    train = df_train.copy()
    test = df_test.copy()

    oof = np.zeros(len(train))
    preds = np.zeros(len(test))
    cols = [
        c for c in train.columns
        if c not in ['id', 'target', 'wheezy-copper-turtle-magic']
    ]

    for i in range(512):
        # .copy() so the in-place reset_index below does not mutate a view
        # of `train`/`test` (avoids SettingWithCopyWarning).
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        # Skip magic values with no training rows: StratifiedKFold raises
        # on an empty subset.
        if len(train2) == 0:
            continue
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)

        if pca:
            # Fit PCA on the combined train+test rows of this subset.
            data = pd.concat(
                [pd.DataFrame(train2[cols]),
                 pd.DataFrame(test2[cols])])
            data2 = StandardScaler().fit_transform(
                PCA(svd_solver='full',
                    n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
        else:
            # Keep only high-variance (informative) features.
            sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
            train3 = sel.transform(train2[cols])
            test3 = sel.transform(test2[cols])

        skf = StratifiedKFold(n_splits=25, random_state=15)
        for train_index, test_index in skf.split(train3, train2['target']):

            clf = Pipeline([('scaler', StandardScaler()),
                            ('logit',
                             LogisticRegression(solver='saga',
                                                penalty='l1',
                                                C=C))])

            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])

            oof[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            # Average the fold models' test predictions.
            preds[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    ut.report_oof(df_train, oof)

    return oof, preds
# Example 5
def train_lgb(df_train, df_test, kfolds):
    """Fit a LightGBM binary classifier with the supplied CV splitter.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training frame with a 'target' column.
    df_test : pd.DataFrame
        Test frame with an 'id' column.
    kfolds : cross-validation splitter
        Object exposing ``split`` and ``n_splits`` (e.g. KFold).

    Returns
    -------
    tuple
        (oof, sub): out-of-fold predictions aligned with df_train and a
        submission frame with 'id' and averaged 'target' predictions.
    """
    train = df_train.copy()
    test = df_test.copy()
    target = train.target.copy()
    sub = test[['id']].copy()

    train, test = ut.general_processing(train, test)

    oof = np.zeros(len(train))
    predictions = np.zeros(len(test))
    feature_importance_df = pd.DataFrame()

    # Hyper-parameters are identical for every fold, so build the dict once.
    param = {
        'bagging_freq': 3,
        'bagging_fraction': 0.8,
        'boost_from_average': 'false',
        'boost': 'gbdt',
        'feature_fraction': 0.9,
        'learning_rate': 0.01,
        'max_depth': 10,
        'metric': 'auc',
        'min_data_in_leaf': 82,
        'min_sum_hessian_in_leaf': 10.0,
        'num_leaves': 20,
        'objective': 'binary',
        'verbosity': 1,
    }

    for fold_, (trn_idx, val_idx) in enumerate(
            kfolds.split(train.values, target.values)):
        print(f"fold n°{fold_}")

        trn_data = lgb.Dataset(train.iloc[trn_idx], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train.iloc[val_idx], label=target.iloc[val_idx])

        # Very large round budget; early stopping on the validation set
        # decides the effective number of iterations.
        clf = lgb.train(param,
                        trn_data,
                        1000000,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500,
                        early_stopping_rounds=100)

        oof[val_idx] = clf.predict(train.iloc[val_idx],
                                   num_iteration=clf.best_iteration)

        # Record per-fold feature importances for later inspection.
        fold_importance = pd.DataFrame({
            "feature": train.columns,
            "importance": clf.feature_importance(),
            "fold": fold_ + 1,
        })
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance], axis=0)

        # Average each fold model's test predictions.
        predictions += clf.predict(
            test, num_iteration=clf.best_iteration) / kfolds.n_splits

    ut.report_oof(df_train, oof)

    sub['target'] = predictions

    return oof, sub
# Example 6
def train_all(df_train, df_test, n_folds, pca=False):
    """Train SVC, NuSVC, logistic regression, KNN and QDA per magic value.

    For each of the 512 'wheezy-copper-turtle-magic' values, subsets the
    data, reduces the feature space (PCA or variance threshold), and fits
    all five models with stratified K-fold CV on the same splits.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training frame containing 'id', 'target' and
        'wheezy-copper-turtle-magic' alongside the feature columns.
    df_test : pd.DataFrame
        Test frame with the same feature columns.
    n_folds : int
        Number of stratified CV folds per subset.
    pca : bool, default False
        If True, reduce with PCA (n_components='mle', fit on train+test of
        the subset) then standard-scale; otherwise keep only features whose
        variance exceeds 1.5.

    Returns
    -------
    tuple of np.ndarray
        (oof_svc, preds_svc, oof_nusvc, preds_nusvc, oof_logit, preds_logit,
        oof_knn, preds_knn, oof_qda, preds_qda).
    """
    train = df_train.copy()
    test = df_test.copy()

    oof_svc = np.zeros(len(train))
    oof_nusvc = np.zeros(len(train))
    oof_logit = np.zeros(len(train))
    oof_knn = np.zeros(len(train))
    oof_qda = np.zeros(len(train))
    preds_svc = np.zeros(len(test))
    preds_nusvc = np.zeros(len(test))
    preds_logit = np.zeros(len(test))
    preds_knn = np.zeros(len(test))
    preds_qda = np.zeros(len(test))

    cols = [
        c for c in train.columns
        if c not in ['id', 'target', 'wheezy-copper-turtle-magic']
    ]

    for i in range(512):
        train2 = train[train['wheezy-copper-turtle-magic'] == i].copy()
        test2 = test[test['wheezy-copper-turtle-magic'] == i].copy()
        # StratifiedKFold raises on an empty subset.
        if len(train2) == 0:
            continue
        idx1 = train2.index
        idx2 = test2.index
        train2.reset_index(drop=True, inplace=True)

        if pca:
            # Fit PCA on the combined train+test rows of this subset.
            data = pd.concat(
                [pd.DataFrame(train2[cols]),
                 pd.DataFrame(test2[cols])])
            data2 = StandardScaler().fit_transform(
                PCA(svd_solver='full',
                    n_components='mle').fit_transform(data[cols]))
            train3 = data2[:train2.shape[0]]
            test3 = data2[train2.shape[0]:]
        else:
            # Keep only high-variance (informative) features.
            sel = VarianceThreshold(threshold=1.5).fit(train2[cols])
            train3 = sel.transform(train2[cols])
            test3 = sel.transform(test2[cols])

        skf = StratifiedKFold(n_splits=n_folds, random_state=15)

        for train_index, test_index in skf.split(train3, train2['target']):

            clf = Pipeline([('scaler', StandardScaler()),
                            ('svn',
                             SVC(probability=True,
                                 kernel='poly',
                                 degree=4,
                                 gamma='auto'))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof_svc[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            preds_svc[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = Pipeline([('scaler', StandardScaler()),
                            ('svn',
                             NuSVC(probability=True,
                                   kernel='poly',
                                   degree=4,
                                   gamma='auto',
                                   random_state=745,
                                   nu=0.59,
                                   coef0=0.053))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof_nusvc[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            preds_nusvc[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = Pipeline([('scaler', StandardScaler()),
                            ('logit',
                             LogisticRegression(solver='saga',
                                                penalty='l1',
                                                C=0.5))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof_logit[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            preds_logit[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = Pipeline([('scaler', StandardScaler()),
                            ('KNN', KNeighborsClassifier(n_neighbors=17,
                                                         p=2.9))])
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof_knn[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            preds_knn[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

            clf = QuadraticDiscriminantAnalysis(reg_param=0.6)
            clf.fit(train3[train_index, :], train2.loc[train_index]['target'])
            oof_qda[idx1[test_index]] = clf.predict_proba(
                train3[test_index, :])[:, 1]
            preds_qda[idx2] += clf.predict_proba(test3)[:, 1] / skf.n_splits

    ut.report_oof(df_train, oof_svc)
    ut.report_oof(df_train, oof_nusvc)
    ut.report_oof(df_train, oof_logit)
    ut.report_oof(df_train, oof_knn)
    # Bug fix: original referenced undefined `oof_kda` (NameError at runtime).
    ut.report_oof(df_train, oof_qda)

    return oof_svc, preds_svc, oof_nusvc, preds_nusvc, oof_logit, preds_logit, oof_knn, preds_knn, oof_qda, preds_qda