import gc
import sys
import glob
import pandas as pd

# NOTE: project-specific pieces (utils, lgb_ex, Xray_Cal, logger) and run-time
# globals such as win_path, key, target, params, model_type, ignore_list,
# num_boost_round, early_stopping_rounds, start_time, fname, learning_rate,
# stack_name and HOME come from the surrounding project and are not shown here.

def main():

    #========================================================================
    # Data Load
    #========================================================================
    win_path_list = glob.glob(win_path)
    train_path_list = []
    test_path_list = []
    for path in win_path_list:
        if 'train' in path:
            train_path_list.append(path)
        elif 'test' in path:
            test_path_list.append(path)

    #  train_feature_list = utils.pararell_load_data(path_list=train_path_list, delimiter='gz')
    #  test_feature_list = utils.pararell_load_data(path_list=test_path_list, delimiter='gz')
    #  train = pd.concat(train_feature_list, axis=1)
    #  test = pd.concat(test_feature_list, axis=1)
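    # For reference, a minimal sketch of what a parallel loader such as
    # utils.pararell_load_data might look like (assumed implementation;
    # the real utils module is not part of this snippet):
    #
    #  from multiprocessing import Pool
    #
    #  def pararell_load_data_sketch(path_list, n_jobs=4):
    #      # pd.read_csv infers gzip compression from the .gz extension
    #      with Pool(n_jobs) as pool:
    #          return pool.map(pd.read_csv, path_list)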
    df = utils.read_df_pkl('../input/appli*')
    train = df[df[target] >= 0]
    test = df[df[target] == -1]

    metric = 'auc'
    fold = 5
    fold_type = 'stratified'
    group_col_name = ''
    dummie = 1
    oof_flg = True
    LGBM = lgb_ex(logger=logger,
                  metric=metric,
                  model_type=model_type,
                  ignore_list=ignore_list)

    train, _ = LGBM.data_check(df=train)
    test, drop_list = LGBM.data_check(df=test, test_flg=True)
    if len(drop_list):
        train.drop(drop_list, axis=1, inplace=True)
        test.drop(drop_list, axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_prediction(train=train,
                                 test=test,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 oof_flg=oof_flg)
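
    # lgb_ex is project code that is not shown here. Conceptually, cross_prediction
    # is assumed to run a standard OOF loop along these lines (an illustration,
    # not the actual implementation):
    #
    #  import lightgbm as lgb
    #  import numpy as np
    #  from sklearn.model_selection import StratifiedKFold
    #
    #  def cross_prediction_sketch(train, test, target, params, n_fold=5):
    #      X, y = train.drop(target, axis=1), train[target]
    #      oof = np.zeros(len(train))
    #      test_pred = np.zeros(len(test))
    #      folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1208)
    #      for trn_idx, val_idx in folds.split(X, y):
    #          booster = lgb.train(params,
    #                              lgb.Dataset(X.iloc[trn_idx], y.iloc[trn_idx]),
    #                              valid_sets=[lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])])
    #          oof[val_idx] = booster.predict(X.iloc[val_idx])
    #          test_pred += booster.predict(test[X.columns]) / n_fold
    #      return oof, test_pred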

    #========================================================================
    # Result
    #========================================================================
    cv_score = LGBM.cv_score
    result = LGBM.prediction
    cv_feim = LGBM.cv_feim
    feature_num = len(LGBM.use_cols)

    cv_feim.to_csv(
        f'../valid/{start_time[4:12]}_{model_type}_{fname}_feat{feature_num}_CV{cv_score}_lr{learning_rate}.csv'
    )

    #========================================================================
    # X-RAY computation and output
    # Args:
    #     model    : trained model
    #     train    : dataset used to train the model
    #     col_list : list of columns for which to compute X-RAY. If not
    #                specified, it is computed for every column in the
    #                dataset, but given the computation time, roughly 30
    #                columns at most is recommended.
    #========================================================================
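    # Xray_Cal is project code that is not shown here; X-RAY appears to be a
    # partial-dependence-style measure. A rough sketch under that assumption
    # (sample the data, sweep one column over representative values, average
    # the model's predictions at each point):
    #
    #  def xray_sketch(model, data, col, max_point=30):
    #      points = data[col].drop_duplicates().sample(
    #          n=min(max_point, data[col].nunique()), random_state=1208)
    #      rows = []
    #      for v in points:
    #          tmp = data.copy()
    #          tmp[col] = v
    #          rows.append({'feature': col, 'value': v,
    #                       'xray': model.predict(tmp).mean()})
    #      return pd.DataFrame(rows)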
    xray = False
    if xray:
        train.reset_index(inplace=True)
        train = train[LGBM.use_cols]
        result_xray = pd.DataFrame()
        N_sample = 150000
        max_point = 30
        for fold_num in range(fold):
            model = LGBM.fold_model_list[fold_num]
            if fold_num == 0:
                xray_obj = Xray_Cal(logger=logger,
                                    ignore_list=ignore_list,
                                    model=model)
            xray_obj, tmp_xray = xray_obj.get_xray(base_xray=train,
                                                   col_list=train.columns,
                                                   fold_num=fold_num,
                                                   N_sample=N_sample,
                                                   max_point=max_point)
            tmp_xray.rename(columns={'xray': f'xray_{fold_num}'}, inplace=True)

            if len(result_xray):
                # merge returns a new frame, so the result must be reassigned
                result_xray = result_xray.merge(tmp_xray.drop('N', axis=1),
                                                on=['feature', 'value'],
                                                how='inner')
            else:
                result_xray = tmp_xray.copy()
            del tmp_xray
            gc.collect()

        xray_col = [col for col in result_xray.columns if 'xray' in col]
        result_xray['xray_avg'] = result_xray[xray_col].mean(axis=1)
        result_xray.to_csv(
            f'../output/{start_time[4:10]}_xray_{model_type}_CV{LGBM.cv_score}.csv'
        )
        sys.exit()

    submit = pd.read_csv('../input/sample_submission.csv')
    #  submit = []

    #========================================================================
    # STACKING
    #========================================================================
    if len(stack_name) > 0:
        logger.info(f'result_stack shape: {LGBM.result_stack.shape}')
        utils.to_pkl(
            path=f"../stack/{start_time[4:12]}_{stack_name}_{model_type}_CV{str(cv_score).replace('.', '-')}_{feature_num}features.fp",
            obj=LGBM.result_stack)
    logger.info(
        f'FEATURE IMPORTANCE PATH: {HOME}/kaggle/home-credit-default-risk/output/cv_feature{feature_num}_importances_auc_{cv_score}.csv'
    )
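    # Read-back sketch (assumed): utils.to_pkl is project code, but the file is
    # a plain pickle, so a meta-model script could usually reload it with e.g.
    #
    #  stack_df = pd.read_pickle('../stack/<saved_name>.fp')
    #  meta_train = meta_train.merge(stack_df, on=key, how='left')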

    #========================================================================
    # Submission
    #========================================================================
    if len(submit) > 0:
        if stack_name == 'add_nest':
            test[target] = result
            # collapse to one prediction per key by averaging
            test = (test.reset_index()[[key, target]]
                    .groupby(key)[target].mean().reset_index())
            submit = submit[key].to_frame().merge(test, on=key, how='left')
            submit[target].fillna(0, inplace=True)
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{fname}_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
        else:
            submit[target] = result
            submit.to_csv(
                f'../submit/{start_time[4:12]}_submit_{model_type}_rate{learning_rate}_{feature_num}features_CV{cv_score}_LB.csv',
                index=False)
#  test = pd.concat([base_test, df_feat.iloc[len(base_train):, :].reset_index(drop=True)], axis=1)
test = []

#========================================================================
# LGBM Setting
seed = 1208
metric = 'rmse'
fold = 5  # assumed fold count; used below but not defined in the original snippet
fold_type = 'self'
group_col_name = ''
dummie = 1
oof_flg = True

#========================================================================
# Preprocessing
LGBM = lgb_ex(logger=logger,
              metric=metric,
              model_type=model_type,
              ignore_list=ignore_list)
#  train, test, drop_list = LGBM.data_check(train=train, test=test, target=target)
train, test, drop_list = LGBM.data_check(train=train, test=[], target=target)
if len(drop_list):
    train.drop(drop_list, axis=1, inplace=True)
    #  test.drop(drop_list, axis=1, inplace=True)
#========================================================================

#========================================================================
# Increase Valid Features
valid_feat_list = [''] + glob.glob('../features/1_first_valid/*.gz')  # '' presumably stands for a baseline run with no added feature
#========================================================================

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# self_kfold is passed to cross_validation below but was never defined in the
# original snippet. A plain KFold split is assumed here; the commented-out lines
# inside objective() suggest the author also tried a StratifiedKFold over an
# 'outliers' flag. Materialize the split so every Optuna trial can reuse it.
folds = KFold(n_splits=fold, shuffle=True, random_state=seed)
kfold = list(folds.split(train))
def objective(trial):

    #  subsample = trial.suggest_uniform('subsample', 0.9, 0.98)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.20, 0.33)
    num_leaves = trial.suggest_int('num_leaves', 54, 73)
    #  max_depth = trial.suggest_int('max_depth', 8, 12)
    min_child_samples = trial.suggest_int('min_child_samples', 30, 75)
    lambda_l2 = trial.suggest_uniform('lambda_l2', 3.0, 15.0)  # float range, so suggest_uniform rather than suggest_int

    params = {
        #  'num_threads': -1,
        'num_threads': 32,
        'num_leaves': num_leaves,
        'objective': 'regression',
        "boosting": "gbdt",
        #  'max_depth': max_depth,
        'max_depth': -1,
        'learning_rate': learning_rate,
        "min_child_samples": min_child_samples,
        "bagging_freq": 1,
        #  "subsample": subsample ,
        "subsample": 0.9,
        "colsample_bytree": colsample_bytree,
        #  "colsample_bytree": 0.9,
        "metric": 'rmse',
        "lambda_l1": 0.1,
        "lambda_l2": lambda_l2,
        #  "lambda_l2": 0.1,
        "verbosity": -1,
        'random_seed': seed,
        'bagging_seed': seed,
        'feature_fraction_seed': seed,
        'data_random_seed': seed
    }

    LGBM = lgb_ex(logger=logger,
                  metric=metric,
                  model_type=model_type,
                  ignore_list=ignore_list)
    LGBM.seed = seed

    #  train['outliers'] = train[target].map(lambda x: 1 if x<-30 else 0)
    #  folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    #  kfold = folds.split(train,train['outliers'].values)
    #  train.drop('outliers', axis=1, inplace=True)

    #========================================================================
    # Train & Prediction Start
    #========================================================================
    LGBM = LGBM.cross_validation(train=train,
                                 key=key,
                                 target=target,
                                 fold_type=fold_type,
                                 fold=fold,
                                 group_col_name=group_col_name,
                                 params=params,
                                 num_boost_round=num_boost_round,
                                 early_stopping_rounds=early_stopping_rounds,
                                 self_kfold=kfold,
                                 #  self_stop=thres_score_list,
                                 params_tune=True)

    cv_score = LGBM.cv_score
    pred_val = LGBM.prediction
    df_pred = train.reset_index()[key].to_frame()
    df_pred['prediction'] = pred_val

    # Compute a score for the outliers
    #  from sklearn.metrics import mean_squared_error
    #  train.reset_index(inplace=True)

    #  out_ids = train.loc[train.target<-30, key].values
    #  out_val = train.loc[train.target<-30, target].values
    #  out_pred = df_pred[df_pred[key].isin(out_ids)]['prediction'].values
    #  out_score = np.sqrt(mean_squared_error(out_val, out_pred))

    #  out_list.append(out_score)
    #  if len(out_list)%10==0:
    #      if len(out_list)>=10:
    #          print(out_list[-10:])
    #      else:
    #          print(out_list)

    #  # Compute a score for the non-outliers
    #  com_ids = train.loc[train.target>-30, key].values
    #  com_val = train.loc[train.target>-30, target].values
    #  com_pred = df_pred[df_pred[key].isin(com_ids)]['prediction'].values
    #  com_score = np.sqrt(mean_squared_error(com_val, com_pred))
    #  com_list.append(com_score)
    #  com_score -= 1.8404775225287757

    logger.info(f'''
    #========================================================================
    # CV SCORE: {cv_score}
    #========================================================================'''
                )
    #  if com_score<0:
    #      out_score += com_score*-2

    #  # Log the score progression
    #  LGBM.val_score_list.append(cv_score)
    #  LGBM.val_score_list.append(params)
    #  tmp = pd.Series(LGBM.val_score_list)
    #  valid_list.append(tmp.copy())

    return cv_score
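
#========================================================================
# Usage sketch (assumed; the snippet itself never launches the study).
# objective() is an Optuna objective, so a tuning run would look like:
#
#  import optuna
#  study = optuna.create_study()  # RMSE: lower is better, and minimize is the default
#  study.optimize(objective, n_trials=100)  # n_trials is an arbitrary example value
#  logger.info(f'best params: {study.best_params}  best CV: {study.best_value}')
#========================================================================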