Example #1
import gc

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn import metrics
from sklearn.model_selection import GroupKFold


def make_predictions_cat(tr_df, tt_df, y, features_columns, cat_params, NFOLDS=6):

    # Group folds by transaction month so a month never spans train and validation.
    folds = GroupKFold(n_splits=NFOLDS)
    split_groups = tr_df['DT_month']

    X = tr_df[features_columns]
    X_test = tt_df[features_columns]

    pred_df = pd.DataFrame()
    pred_df['TransactionID'] = tt_df.reset_index()['TransactionID']
    pred_df['isFraud'] = np.zeros(len(pred_df))

    predictions = np.zeros(len(tt_df))
    oof = np.zeros(len(tr_df))

    for fold_, (trn_idx,
                val_idx) in enumerate(folds.split(X, y, groups=split_groups)):
        tr_x, tr_y = X.iloc[trn_idx, :], y.iloc[trn_idx]
        vl_x, vl_y = X.iloc[val_idx, :], y.iloc[val_idx]

        print('Fold:', fold_, ' - ', len(tr_x), len(vl_x))
        
        # `categorical_features` is expected to be defined at module level.
        estimator = CatBoostClassifier(**cat_params)
        estimator.fit(
            tr_x, tr_y,
            eval_set=(vl_x, vl_y),
            cat_features=categorical_features,
            use_best_model=True,
            verbose=True)
        
        oof_preds = estimator.predict_proba(vl_x)[:, 1]
        # Min-max scale each fold's validation predictions before filling the OOF vector.
        oof[val_idx] = (oof_preds - oof_preds.min()) / (oof_preds.max() - oof_preds.min())
        
        # Average per-fold probabilities (predict_proba, not hard class labels).
        pp_p = estimator.predict_proba(X_test)[:, 1]
        predictions += pp_p / NFOLDS

        feature_imp = pd.DataFrame(
            sorted(zip(estimator.get_feature_importance(), X.columns)),
            columns=['Value', 'Feature'])
        print(feature_imp)

        del tr_x, tr_y, vl_x, vl_y
        gc.collect()

    pred_df['isFraud'] = predictions
    pred_df = pred_df.set_index('TransactionID')

    print('---------------------------------------')
    oof_auc = metrics.roc_auc_score(y, oof)
    print('OOF AUC:', oof_auc)

    return pred_df, oof_auc
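
A minimal usage sketch for Example #1 (the `train`/`test` frames, the column names, and the CatBoost parameters below are assumptions for illustration; the function itself expects `TransactionID`, `isFraud`, and `DT_month` columns plus a module-level `categorical_features`):

categorical_features = ['ProductCD', 'card4']  # hypothetical column names
cat_params = {'iterations': 2000, 'learning_rate': 0.05, 'eval_metric': 'AUC'}
features_columns = [c for c in train.columns
                    if c not in ('TransactionID', 'isFraud', 'DT_month')]
pred_df, oof_auc = make_predictions_cat(train, test, train['isFraud'],
                                        features_columns, cat_params, NFOLDS=6)
pred_df.to_csv('submission.csv')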
Example #2
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score


def train_model_generic(X,
                        X_test,
                        y,
                        params,
                        folds,
                        model_type='lgb',
                        n_fold=5,
                        plot_feature_importance=True,
                        averaging='usual',
                        model=None):
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.loc[train_index], X.loc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # `TEST_RUN` is expected to be a module-level flag. LightGBM < 4.0 API:
        # newer versions take `verbose_eval` and `early_stopping_rounds` as
        # callbacks (lgb.log_evaluation / lgb.early_stopping).
        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)

            model = lgb.train(params,
                              train_data,
                              num_boost_round=20000,
                              valid_sets=[train_data, valid_data],
                              verbose_eval=1000,
                              early_stopping_rounds=100 if TEST_RUN else 200)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X_train.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X_train.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=500,
                              params=params)
            # `ntree_limit` is deprecated in newer XGBoost releases; use
            # iteration_range=(0, model.best_iteration + 1) there instead.
            y_pred_valid = model.predict(
                xgb.DMatrix(X_valid, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(
                xgb.DMatrix(X_test, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            # predict_proba returns shape (n, 2); keep the positive-class column.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'glm':
            model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            model_results = model.fit()
            # predict() returns a pandas Series for pandas input; use plain arrays.
            y_pred_valid = np.asarray(model_results.predict(X_valid))
            y_pred = np.asarray(model_results.predict(X_test))

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000,
                                       learning_rate=0.05,
                                       loss_function='Logloss',
                                       eval_metric='AUC',
                                       **params)
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        oof[valid_index] = y_pred_valid.reshape(-1, )
        scores.append(roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            # Rank averaging keeps only the ordering of the test predictions.
            prediction += pd.Series(y_pred).rank().values

        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    # `n_fold` must match the number of splits in `folds` for a true average.
    prediction /= n_fold

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.show()

            # Return the averaged importances alongside the usual outputs.
            return oof, prediction, scores, feature_importance

    return oof, prediction, scores
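
A usage sketch for Example #2, assuming `X`, `X_test`, and `y` share a default RangeIndex (the function slices folds with `.loc`) and using illustrative LightGBM parameters; `n_fold` should match the fold object's `n_splits`:

from sklearn.model_selection import KFold

folds = KFold(n_splits=5, shuffle=True, random_state=42)
lgb_params = {'objective': 'binary', 'metric': 'auc',
              'learning_rate': 0.05, 'num_leaves': 256}
oof, prediction, scores = train_model_generic(X, X_test, y, lgb_params, folds,
                                              model_type='lgb', n_fold=5,
                                              plot_feature_importance=False)

With averaging='rank', the returned prediction is an average of per-fold ranks, so only its ordering is meaningful, not its probability scale.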
Example #3
import os
import pickle
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import StratifiedKFold

import src.models


def train_model(X,
                X_test,
                y,
                params,
                n_fold=5,
                shuffle_folds=True,
                model_type='lgb',
                plot_feature_importance=False,
                averaging='usual',
                model=None,
                folds_random_state=42):

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    auc_scores = []
    f1_scores = []
    recall_scores = []
    precision_scores = []
    accuracy_scores = []
    feature_importance = pd.DataFrame()
    folds = StratifiedKFold(n_splits=n_fold,
                            shuffle=shuffle_folds,
                            random_state=folds_random_state)
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        X_train, X_valid = X.loc[train_index], X.loc[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]

        # LightGBM < 4.0 API: newer versions take logging and early stopping as
        # callbacks (lgb.log_evaluation / lgb.early_stopping), both here and in
        # the sklearn wrapper below.
        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)

            model = lgb.train(params,
                              train_data,
                              num_boost_round=1000000,
                              valid_sets=[train_data, valid_data],
                              verbose_eval=1000,
                              early_stopping_rounds=3000)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        if model_type == 'lgb_sklearn':
            model = lgb.LGBMClassifier(**params, n_estimators=1000000)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      verbose=1000,
                      early_stopping_rounds=3000)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(
                X_test, num_iteration=model.best_iteration_)[:, 1]

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X_train.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X_train.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=500,
                              params=params)
            y_pred_valid = model.predict(
                xgb.DMatrix(X_valid, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(
                xgb.DMatrix(X_test, feature_names=X_train.columns),
                ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            # predict_proba returns shape (n, 2); keep the positive-class column.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'glm':
            model = sm.GLM(y_train, X_train, family=sm.families.Binomial())
            model_results = model.fit()
            # predict() returns a pandas Series for pandas input; use plain arrays.
            y_pred_valid = np.asarray(model_results.predict(X_valid))
            y_pred = np.asarray(model_results.predict(X_test))

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000,
                                       learning_rate=0.05,
                                       loss_function='Logloss',
                                       eval_metric='AUC',
                                       **params)
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        oof[valid_index] = y_pred_valid.reshape(-1, )

        # Search the probability threshold that maximises F1 on this fold.
        f1 = 0
        best_t = 0
        for t in np.arange(0.1, 1, 0.05):
            valid_pr = (y_pred_valid > t).astype(int)
            valid_f1 = f1_score(y_valid, valid_pr)
            if valid_f1 > f1:
                f1 = valid_f1
                best_t = t

        y_valid_pr = (y_pred_valid > best_t).astype(int)
        auc_scores.append(roc_auc_score(y_valid, y_pred_valid))
        f1_scores.append(f1_score(y_valid, y_valid_pr))
        precision_scores.append(precision_score(y_valid, y_valid_pr))
        recall_scores.append(recall_score(y_valid, y_valid_pr))
        accuracy_scores.append(accuracy_score(y_valid, y_valid_pr))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values

        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importance()
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    prediction /= n_fold
    # Note: only the last fold's fitted model is pickled.
    with open(os.path.join(src.models.__path__[0], 'model.pkl'), 'wb') as f:
        pickle.dump(model, f, protocol=2)

    scores = pd.DataFrame(
        {
            'precision_score': np.mean(precision_scores),
            'recall_score': np.mean(recall_scores),
            'f1_score': np.mean(f1_scores),
            'accuracy_score': np.mean(accuracy_scores),
            'auc_score': np.mean(auc_scores),
        },
        index=[0])

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(auc_scores), np.std(auc_scores)))

    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')
            plt.savefig('feature_importance.png')

    return oof, prediction, scores, feature_importance
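
A usage sketch for Example #3 with illustrative LightGBM parameters; note the side effect of pickling the last fold's model into the `src.models` package directory:

lgb_params = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.05}
oof, prediction, scores, feature_importance = train_model(
    X, X_test, y, lgb_params, n_fold=5, model_type='lgb',
    plot_feature_importance=True)
print(scores)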