Example 1
import gc
from datetime import datetime

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

# display_importances, display_roc_curve and display_precision_recall are
# plotting helpers defined elsewhere in this project.


def kfold_lightgbm(df, num_folds=5, stratified=False, debug=False):

    # Divide into train/valid and test data

    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()].copy()  # copy so predictions can be assigned later
    ids = train_df['SK_ID_CURR']

    print('Starting LightGBM. Train shape: {}, test shape: {}'.format(
        train_df.shape, test_df.shape))
    del df
    gc.collect()

    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=321)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=123)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])

    feature_importance_df = pd.DataFrame()
    feats = [
        f for f in train_df.columns if f not in
        ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    ]

    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['TARGET'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'TARGET'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'TARGET'].iloc[valid_idx]

        clf = LGBMClassifier(
            n_estimators=10000,
            learning_rate=0.01,
            num_leaves=30,
            colsample_bytree=.9,
            subsample=0.5,
            max_depth=2,
            reg_alpha=.04,
            reg_lambda=.07,
            min_split_gain=.02,
            min_child_weight=39,
            silent=-1,
            verbose=-1,
            n_jobs=-1,
        )

        clf.fit(
            train_x,
            train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc',
            verbose=100,
            early_stopping_rounds=100  # 30
        )

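        # out-of-fold prediction for this fold's validation rows; test-set
        # predictions are averaged across the folds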
        oof_preds[valid_idx] = clf.predict_proba(
            valid_x, num_iteration=clf.best_iteration_)[:, 1]
        sub_preds += clf.predict_proba(
            test_df[feats],
            num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
        del clf, train_x, train_y, valid_x, valid_y
        gc.collect()

    score = roc_auc_score(train_df['TARGET'], oof_preds)
    print('Full AUC score %.6f' % score)

    df_oof_preds = pd.DataFrame({
        'SK_ID_CURR': ids,
        'TARGET': train_df['TARGET'],
        'PREDICTION': oof_preds
    })
    df_oof_preds = df_oof_preds[['SK_ID_CURR', 'TARGET', 'PREDICTION']]

    if not debug:
        test_df['TARGET'] = sub_preds

        # Save test predictions
        now = datetime.now()
        created_time = now.strftime('%Y-%m-%d-%H-%M')
        score = str(round(score, 6)).replace('.', '')

        # submission file
        sub_file = f'../predictions/{created_time}_{score}_{num_folds}_fold-average-LGBClassifier_submission.csv'
        test_df[['SK_ID_CURR', 'TARGET']].to_csv(sub_file, index=False)

        # oof prediction file
        oof_file = f'../predictions/{created_time}_{score}_{num_folds}_fold-average-LGBClassifier_oof.csv'
        df_oof_preds.to_csv(oof_file, index=False)

        # Display a few plots
        vis_file = f'../visualization/{score}_{created_time}_'
        folds_idx = [(train_idx, valid_idx)
                     for train_idx, valid_idx in folds.split(
                         train_df[feats], train_df['TARGET'])]
        display_importances(feature_importance_df_=feature_importance_df,
                            vis_file=vis_file +
                            "_feature_importances_without_ext_source.png")
        display_roc_curve(y_=train_df['TARGET'],
                          oof_preds_=oof_preds,
                          folds_idx_=folds_idx,
                          vis_file=vis_file +
                          "_roc_curve_without_ext_source.png")
        display_precision_recall(y_=train_df['TARGET'],
                                 oof_preds_=oof_preds,
                                 folds_idx_=folds_idx,
                                 vis_file=vis_file +
                                 "_precision_recall_without_ext_source.png")

    return None
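
A minimal usage sketch, assuming df is the merged feature table produced by
upstream feature engineering (train rows carry TARGET, test rows have a null
TARGET); the pickle path is hypothetical:

if __name__ == '__main__':
    df = pd.read_pickle('../data/features.pkl')  # hypothetical feature file
    kfold_lightgbm(df, num_folds=5, stratified=True, debug=False)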
Example 2
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from sklearn.impute import SimpleImputer as Imputer  # replaces the removed sklearn.preprocessing.Imputer
from sklearn.metrics import roc_curve
from sklearn.preprocessing import MinMaxScaler

# missing_values_table, kde_plot, skew_plot, scale_minmax, OOFPreds and
# display_importances are project helpers defined elsewhere.


def main(debug=2000):
    if debug is not False:
        rows = debug
    else:
        rows = None

    app_train = pd.read_csv('../../input/application_train.csv', nrows=rows)
    app_test = pd.read_csv('../../input/application_test.csv', nrows=rows)

    test_skid = app_test[['SK_ID_CURR']]

    cols_label = ['TARGET']
    cols_basic = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']
    cols_amt = [
        'TARGET', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
        'AMT_GOODS_PRICE'
    ]

    cols_test = cols_basic + cols_amt

    # ----------------------------------------------------------------------------------------------------
    # Remove outliers

    print('1 Remove outliers')

    app_train = app_train.loc[:, cols_test]
    # app_test has no TARGET column, so select only the feature columns
    app_test = app_test.loc[:, [c for c in cols_test if c != 'TARGET']]
    origin_index = app_train.index

    train_length = app_train.shape[0]
    need_drop = ['AMT_INCOME_TOTAL']
    for col in need_drop:
        col_mean = app_train[col].mean()
        col_std = app_train[col].std()
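        # z-score against the column's own mean/std; rows beyond 8 standard
        # deviations are treated as outliers and dropped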
        z = (app_train[col] - col_mean) / col_std
        outlier = z[abs(z) > 8].index
        app_train = app_train.drop(outlier)

    # print(app_train.index)
    # drop_index = [index for index in origin_index.tolist() if index not in app_train.index.tolist()]
    # print(drop_index)

    print('Number of outliers dropped: {}'.format(train_length - app_train.shape[0]))

    app_all = pd.concat([app_train, app_test], axis=0)
    print('shape: {}'.format(app_all.shape))

    missing_values = missing_values_table(app_train)
    print(missing_values)

    # ----------------------------------------------------------------------------------------------------
    print('2 Build new features')
    # ----------------------------------------------------------------------------------------------------

    features = [
        'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE'
    ]

    # fill missing values with the column median
    for col in features:
        app_all[col] = app_all[col].fillna(app_all[col].median())

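    # build a ratio feature for every pair of the four AMT columns (6 in total)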
    new_features = []
    for i in range(4):
        for j in range(i + 1, 4):
            app_all[features[i] + ' / ' +
                    features[j]] = app_all[features[i]] / app_all[features[j]]
            new_features.append(features[i] + ' / ' + features[j])

    print(new_features)

    features = features + new_features

    train_len = app_train.shape[0]
    print(train_len)
    app_train = app_all.iloc[:train_len, :]
    app_test = app_all.iloc[train_len:, :]

    # plot

    kde_plot(app_train, features, 'amt_4_pic/4_orgin_persent_kde.png')
    skew_plot(app_train, features, 'amt_4_pic/4_origin_persent_skew.png')

    # Prepare training data

    # ----------------------------------------------------------------------------------------------------
    print('3 Prepare training data')
    # ----------------------------------------------------------------------------------------------------

    label = app_train['TARGET']
    print('label len {}'.format(len(label)))

    # Keep only the engineered feature columns so train and test line up
    train = app_train[features].copy()

    # Copy of the testing data
    test = app_test[features].copy()

    print(train.shape, test.shape)

    imputer = Imputer(strategy='median')
    scaler = MinMaxScaler(feature_range=(0, 1))

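    # fit the imputer and scaler on the training rows only, then apply the
    # same transform to train and test to avoid leakage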
    imputer.fit(train)
    train = imputer.transform(train)
    test = imputer.transform(test)

    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    print(train.shape[0], test.shape[0])

    train_feature = pd.DataFrame(train, columns=features, index=label.index)

    # print('label shape, train_feature shape')
    # print(label.shape, train_feature.shape)
    # print(label.head(), train_feature.head())
    # print((label.index == train_feature.index).sum())
    app_train = pd.concat([label, train_feature], axis=1)
    app_test = pd.DataFrame(test, columns=features)
    # print('app_train {}, app_test {} '.format(app_train.shape, app_test.shape))

    # plot

    kde_plot(app_train, features,
             'amt_4_pic/4_amt_persent_fillna_scaler_kde.png')
    skew_plot(app_train, features,
              'amt_4_pic/4_amt_persent_fillna_scaler_skew.png')

    # Model training

    print('4 LightGBM training')

    params_lgb = {
        'nthread': 4,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1,
    }

    oof_preds, sub_preds, feature_importance, metrics = OOFPreds(train_feature,
                                                                 label,
                                                                 app_test,
                                                                 params_lgb,
                                                                 clf='lgb')

    # display_importances(feature_importance, num_features=4, filename='./basic_4_pic/feature_importance.png')

    print(metrics)

    sub_preds = pd.concat([test_skid, sub_preds], axis=1)

    sub_preds.to_csv('lgb_4amt_persent_-fillmedian-minmax-val-180628.csv',
                     index=False)

    # ----------------------------------------------------------------------------------------------------
    print('5 box_cox')
    # ----------------------------------------------------------------------------------------------------

    train_len = app_train.shape[0]
    app_all = pd.concat([app_train, app_test], axis=0)
    print(train_len)

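    # Box-Cox is only defined for positive inputs, so each column is min-max
    # scaled, shifted by +1, transformed, then rescaled to [0, 1]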
    for col in features:
        col_trans = scale_minmax(app_all.loc[:, col])
        app_all.loc[:, col], _ = stats.boxcox(col_trans + 1)
        app_all.loc[:, col] = scale_minmax(app_all.loc[:, col])

    app_train = app_all.iloc[:train_len, :]
    app_test = app_all.iloc[train_len:, :]

    kde_plot(app_train, features, 'amt_4_pic/4_persent_boxcox_kde.png')
    skew_plot(app_train, features, 'amt_4_pic/4_persent_boxcox_skew.png')

    # ----------------------------------------------------------------------------------------------------
    print('6 LightGBM training')
    # ----------------------------------------------------------------------------------------------------

    params_lgb = {
        'nthread': 4,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1,
    }

    print(app_train.shape, app_test.shape)
    train_feature = app_train[features]
    test = app_test[features]

    oof_preds, sub_preds, feature_importance, metrics = OOFPreds(train_feature,
                                                                 label,
                                                                 test,
                                                                 params_lgb,
                                                                 clf='lgb')

    display_importances(feature_importance,
                        num_features=4,
                        filename='feature_importance.png')

    print(metrics)

    fpr, tpr, _ = roc_curve(label, oof_preds)
    print(fpr, tpr)
    plt.figure(figsize=(8, 8))
    plt.plot(fpr,
             tpr,
             lw=1,
             alpha=0.3,
             label='ROC (AUC = %0.4f)' % (metrics.iloc[5, 2]))

    plt.plot([0, 1], [0, 1],
             linestyle='--',
             lw=2,
             color='r',
             label='Luck',
             alpha=.8)
    plt.legend(loc='lower right')  # render the labels set above
    plt.savefig('roc_persent.png')

    sub_preds = pd.concat([test_skid, sub_preds], axis=1)

    sub_preds.to_csv('lgb_4amt_persent-boxcox-val-180628.csv', index=False)
Example 3
import gc
import logging

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

# FEATS_EXCLUDED, rmse and display_importances are project helpers defined
# elsewhere.


def kfold_lightgbm(train_df,
                   test_df,
                   num_folds,
                   submission_file_name,
                   stratified=False,
                   debug=False):
    logger = logging.getLogger('lgbm_train')
    logger.info("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=326)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=326)

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df[
            'target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df[
            'target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)

        # params optimized by optuna
        params = {
            'device': 'gpu',
            'task': 'train',
            'objective': 'regression',
            'metric': 'rmse',
            'boosting': 'gbdt',
            'learning_rate': 0.01,
            'subsample': 0.718509060213284,
            'max_depth': 8,
            'top_rate': 0.8076614306859368,
            'num_leaves': 45,
            'min_child_weight': 59.174950161115106,
            'other_rate': 0.0721768246018207,
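            # top_rate / other_rate are GOSS sampling parameters and have no
            # effect with boosting='gbdt'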
            'reg_alpha': 17.018862389097798,
            'reg_lambda': 24.20636870149939,
            'colsample_bytree': 0.667864732544997,
            'min_split_gain': 8.021790442813048,
            'min_data_in_leaf': 30,
            'verbose': -1,
            'seed': int(2**n_fold),
            'bagging_seed': int(2**n_fold),
            'drop_seed': int(2**n_fold)
        }

        reg = lgb.train(params=params,
                        train_set=lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds=200,
                        verbose_eval=100)

        oof_preds[valid_idx] = reg.predict(valid_x,
                                           num_iteration=reg.best_iteration)
        sub_preds += reg.predict(
            test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(
            reg.feature_importance(importance_type='gain',
                                   iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        logger.info('Fold %2d RMSE : %.6f' %
                    (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()

    # display importances
    display_importances(feature_importance_df)

    if not debug:
        # save submission file
        test_df.loc[:, 'target'] = sub_preds
        test_df = test_df.reset_index()
        test_df[['card_id', 'target']].to_csv(submission_file_name,
                                              index=False)
Example 4
def get_model(df, feats):
    # lightgbm here is a project-local training helper; config and utils are
    # project modules
    res, feature_importance_df = lightgbm(df, feats)
    res.to_csv(config.SUBMISSION_FILE_NAME, index=False)
    utils.display_importances(feature_importance_df, config.IMPORTANCE_IMAGE_PATH)
Example 5
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer as Imputer  # replaces the removed sklearn.preprocessing.Imputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# missing_values_table, OOFPreds and display_importances are project helpers
# defined elsewhere.


def main(debug=2000):
    if debug is not False:
        rows = debug
    else:
        rows = None

    app_train = pd.read_csv('../input/application_train.csv', nrows=rows)
    app_test = pd.read_csv('../input/application_test.csv', nrows=rows)

    id_train = app_train.index
    id_test = app_test.index
    test_skid = app_test[['SK_ID_CURR']]

    app_all = pd.concat([app_train, app_test], axis=0)
    print('shape: {}'.format(app_all.shape))

    y = app_train['TARGET']
    ids = app_train['SK_ID_CURR']

    del app_train['SK_ID_CURR']

    missing_values = missing_values_table(app_train)
    # print(missing_values.head(40))
    # print(missing_values.tail(10))

    # ----------------------------------------------------------------------------------------------------

    # Split features by type: continuous numeric, discrete numeric, categorical

    # ----------------------------------------------------------------------------------------------------
    '''
    The column names already encode the feature type.

    Categorical:
    ---------
    TYPE CODE

    Continuous numeric:
    ----------
    DAYS
    CNT
    RELATIVE (relative values, i.e. already standardized)
    AMT

    Discrete numeric:
    ----------
    FLAG
    RATING
    NOT
    '''
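    # A rough sketch of the name-pattern typing described above; the pattern
    # lists are assumptions read off the docstring, not part of the original
    # script.
    def split_feature_types(df):
        cat_patterns = ('TYPE', 'CODE')
        cont_patterns = ('DAYS', 'CNT', 'RELATIVE', 'AMT')
        disc_patterns = ('FLAG', 'RATING', 'NOT')
        cat_cols = [c for c in df.columns if any(p in c for p in cat_patterns)]
        cont_cols = [c for c in df.columns
                     if any(p in c for p in cont_patterns)]
        disc_cols = [c for c in df.columns
                     if any(p in c for p in disc_patterns)]
        return cat_cols, cont_cols, disc_cols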
    # print('train shape {}'.format(app_train.shape))
    #
    # num_cols = app_train.select_dtypes(include=[np.number]).columns.tolist()
    # cat_cols = app_train.select_dtypes(include=[np.object]).columns.tolist()
    #
    # print("\nnum_cols count:{}".format(len(num_cols)))
    # print("\ncat_cols count:{}".format(len(cat_cols)))
    #
    # print("\nnum_cols:{}".format(num_cols))
    # print("\ncat_cols:{}".format(cat_cols))
    #
    # cat_flag_cols = [col for col in cat_cols if app_train[col].nunique() == 2]
    # print('\ncat flag feature count: {}'.format(len(cat_flag_cols)))
    # print('cat flag cols {}'.format(cat_flag_cols))
    #
    # # columns with a single value
    # num_1_cols = [col for col in num_cols if app_train[col].nunique() == 1]
    # print('\nnum 1 feature count: {}'.format(len(num_1_cols)))
    # print('{}'.format(num_1_cols))
    #
    # num_flag_cols = [col for col in num_cols if app_train[col].nunique() == 2]
    # print('\nnum flag feature count: {}'.format(len(num_flag_cols)))
    # print('num flag cols {}'.format(num_flag_cols))
    #
    # num_discreet_cols = [col for col in num_cols if 2 < app_train[col].nunique() <= 20]
    # print('\ndiscrete numeric feature count: {}'.format(len(num_discreet_cols)))
    # print('discrete numeric {}'.format(num_discreet_cols))
    #
    # num_continuous_cols = [col for col in num_cols if 20 < app_train[col].nunique()]
    # print('\ncontinuous numeric feature count: {}'.format(len(num_continuous_cols)))
    # print('continuous numeric {}'.format(num_continuous_cols))

    # for col in num_cols:
    #     print(app_train[col].nunique())

    # ----------------------------------------------------------------------------------------------------

    # Exploratory data visualization

    # ----------------------------------------------------------------------------------------------------

    # ----------------------------------------------------------------------------------------------------
    # Outlier visualization

    # for col in num_continuous_cols:
    #     draw_feature_distribution(app_all, col)

    # ----------------------------------------------------------------------------------------------------
    #

    # fcols = 2
    # frows = len(num_continuous_cols)
    # plt.figure(figsize=(4 * fcols, 6 * frows))
    # i = 0
    #
    # for col in num_continuous_cols:
    #
    #     dat = app_train[[col, 'TARGET']].dropna()
    #
    #     i += 1
    #     plt.subplot(frows, fcols, i)
    #     sns.distplot(dat[col], fit=stats.norm)
    #     plt.title(col + ' Original')
    #     plt.xlabel('')
    #
    #     i += 1
    #     plt.subplot(frows, fcols, i)
    #     _ = stats.probplot(dat[col], plot=plt)
    #     plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    #     plt.xlabel('')
    #     plt.ylabel('')
    #
    # plt.tight_layout(h_pad=2.5)
    # plt.savefig('./pic/num_continue.png')
    # plt.show()

    # ----------------------------------------------------------------------------------------------------

    # Encoding

    # ----------------------------------------------------------------------------------------------------

    le = LabelEncoder()
    le_count = 0

    # Iterate through the columns
    for col in app_train.columns:
        if app_train[col].dtype == 'object':
            # If 2 or fewer unique categories
            if len(list(app_train[col].unique())) <= 2:
                # Train on the training data
                le.fit(app_train[col])
                # Transform both training and testing data
                app_train[col] = le.transform(app_train[col])
                app_test[col] = le.transform(app_test[col])

                # Keep track of how many columns were label encoded
                le_count += 1
                print(col)

    print('%d columns were label encoded.' % le_count)

    app_train = pd.get_dummies(app_train)
    app_test = pd.get_dummies(app_test)

    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)

    # Align train and test columns

    train_labels = app_train['TARGET']

    app_train, app_test = app_train.align(app_test, join='inner', axis=1)

    # Add the target back in
    app_train['TARGET'] = train_labels

    print('Training Features shape: ', app_train.shape)
    print('Testing Features shape: ', app_test.shape)

    # ----------------------------------------------------------------------------------------------------

    # Handle anomalous data: DAYS_EMPLOYED_ANOM

    # ----------------------------------------------------------------------------------------------------

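    # 365243 days (~1000 years) is a placeholder value in DAYS_EMPLOYED, not a
    # real employment duration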
    app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243

    # Replace the anomalous values with nan
    app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace({365243: np.nan})

    # app_train['DAYS_EMPLOYED'].plot.hist(title='Days Employment Histogram')
    # plt.xlabel('Days Employment')
    # plt.show()

    app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
    app_test['DAYS_EMPLOYED'] = app_test['DAYS_EMPLOYED'].replace({365243: np.nan})

    print('There are %d anomalies in the test data out of %d entries' %
          (app_test["DAYS_EMPLOYED_ANOM"].sum(), len(app_test)))

    # ----------------------------------------------------------------------------------------------------

    # baseline

    # ----------------------------------------------------------------------------------------------------

    # Drop the target from the training data
    if 'TARGET' in app_train:
        train = app_train.drop(columns=['TARGET'])
    else:
        train = app_train.copy()

    # Feature names
    features = list(train.columns)

    # Copy of the testing data
    test = app_test.copy()

    print(type(test))

    imputer = Imputer(strategy='median')
    scaler = MinMaxScaler(feature_range=(0, 1))

    imputer.fit(train)
    train = imputer.transform(train)
    test = imputer.transform(test)

    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    print('Training data shape: ', train.shape)
    print('Testing data shape: ', test.shape)

    train = pd.DataFrame(train, columns=features)
    test = pd.DataFrame(test, columns=features)

    # ----------------------------------------------------------------------------------------------------

    # Try tuning logistic regression with Bayesian optimization

    # ----------------------------------------------------------------------------------------------------
    print(train.shape)
    print(train_labels.to_numpy().reshape(-1, 1).shape)

    # ----------------------------------------------------------------------------------------------------

    # Train the model

    # ----------------------------------------------------------------------------------------------------
    params = {'C': 0.001}
    params_lgb = {
        'nthread': 4,
        'n_estimators': 10000,
        'learning_rate': 0.02,
        'num_leaves': 34,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 39.3259775,
        'silent': -1,
        'verbose': -1,
    }

    # oof_preds, sub_preds, feature_importance, metrics = OOFPreds(train, train_labels, test, params=params,
    #                                                              n_splits=2, clf=None)

    oof_preds, sub_preds, feature_importance, metrics = OOFPreds(
        train, train_labels, test, params=params_lgb, n_splits=5, clf='lgb')

    print('feature importance: {}'.format(feature_importance.shape))
    print(feature_importance.dtypes)
    display_importances(feature_importance,
                        num_features=20,
                        filename='lgb_20.png')

    print(metrics)

    sub_preds = pd.concat([test_skid, sub_preds], axis=1)

    sub_preds.to_csv('lgb_baseline-fillmedian-minmax-val-180627.csv',
                     index=False)

    # ----------------------------------------------------------------------------------------------------

    # Outlier columns that still need handling

    # ----------------------------------------------------------------------------------------------------
    outlier = [
        'YEARS_BEGINEXPLUATATION_AVG', 'DAYS_EMPLOYED', 'BASEMENTAREA_AVG'
    ]
Example 6
import gc

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold

# display_importances is a plotting helper defined elsewhere in this project.


def kfold_lightgbm(df, num_folds, stratified=False, debug=False):
    # Divide in training/validation and test data
    train_df = df[df['TARGET'].notnull()]
    test_df = df[df['TARGET'].isnull()]
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    del df
    gc.collect()
    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1001)
    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx],
                             label=train_df['TARGET'].iloc[train_idx],
                             free_raw_data=False, silent=True)
        dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx],
                             label=train_df['TARGET'].iloc[valid_idx],
                             free_raw_data=False, silent=True)

        # LightGBM parameters found by Bayesian optimization
        params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'nthread': 16,
            'learning_rate': 0.02,
            'num_leaves': 20,
            'colsample_bytree': 0.9497036,
            'subsample': 0.8715623,
            'subsample_freq': 1,
            'max_depth': 8,
            'reg_alpha': 0.041545473,
            'reg_lambda': 0.0735294,
            'min_split_gain': 0.0222415,
            'min_child_weight': 60,  # 39.3259775,
            'seed': 0,
            'verbose': -1,
            'metric': 'auc',
        }

        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=200,
            verbose_eval=False
        )

        oof_preds[valid_idx] = clf.predict(dvalid.data)
        sub_preds += clf.predict(test_df[feats]) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(dvalid.label, oof_preds[valid_idx])))
        del clf, dtrain, dvalid
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))
    # Write submission file and plot feature importance
    if not debug:
        sub_df = test_df[['SK_ID_CURR']].copy()
        sub_df['TARGET'] = sub_preds
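        # submission_file_name is assumed to be defined at module scope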
        sub_df[['SK_ID_CURR', 'TARGET']].to_csv(submission_file_name, index=False)
    if debug:
        display_importances(feature_importance_df)
    return feature_importance_df