Exemple #1
0
    def train_1fold(self, fold, params, params_custom):
        """Train a LightGBM model on one fold and record its outputs on ``self``.

        The fold's data is obtained from ``self.get_fold_data(fold)``. After
        training, two model files are written under ``self.models_path`` (one
        truncated at the best iteration, one with all iterations), and the
        evaluation history, predictions, metrics and feature importances are
        stored on the instance.

        Args:
            fold: Fold number (an int; it is formatted with ``:02d`` in file
                names and log messages and added to the seed).
            params: LightGBM parameter dict. It is deep-copied, so the
                caller's dict is never modified. When it holds a non-``None``
                ``"seed"``, the fold number is added to that seed; a missing
                or ``None`` seed is left untouched.
            params_custom: Forwarded to ``build_feval`` to construct the
                custom evaluation function.

        Returns:
            None. Results are appended to ``self.evals_df``,
            ``self.preds_train_all``, ``self.preds_test_all``, ``self.mets``
            and ``self.importance[...]``, and written into
            ``self.preds_valid_all``.
        """
        X_train, X_valid, y_train, y_valid, X_test, vdx, tdx = self.get_fold_data(fold)

        if fold == 0:
            # The column dtypes are dumped only once, on the first fold.
            X_train.dtypes.to_csv(self.models_path + "/dtypes.csv")
        logger.info(f"X_train.shape = {X_train.shape} f{fold:02d}")

        mprof_timestamp(f"lgb_dataset_f{fold}")
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid)

        feval = build_feval(params_custom)

        # Run training.
        evals = dict()
        params2 = copy.deepcopy(params)
        callbacks = [log_evaluation(logger, period=10)]
        # Offset the seed by the fold number so each fold trains with a
        # different seed. ``.get`` tolerates params without a "seed" entry,
        # which is treated the same as an explicit ``None``.
        base_seed = params2.get("seed")
        if base_seed is not None:
            params2["seed"] = base_seed + fold
            logger.info(f"Set lgbm train seed = {params2['seed']}")

        logger.info(f"Start train f{fold:02d}")
        mprof_timestamp(f"lgb_train_f{fold}")
        model = lgb.train(params2, lgb_train,
                            valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                            verbose_eval=100, evals_result=evals, feval=feval, callbacks=callbacks,
                            keep_training_booster=True)

        logger.info(f"current_iteration={model.current_iteration()}")
        logger.info(f"best_iteration={model.best_iteration}")

        mprof_timestamp(f"lgb_postproc_f{fold}")
        # Save the model twice: up to the best iteration, and with every
        # iteration (num_iteration=-1) as the "last" model.
        model.save_model(self.models_path + f"/model-lgbm-f{fold:02d}.txt", num_iteration=model.best_iteration)
        model.save_model(self.models_path + f"/model-lgbm-last-f{fold:02d}.txt", num_iteration=-1)

        # Per-iteration metric history; column order is logloss/accuracy for
        # train, then logloss/accuracy for valid.
        # NOTE(review): the "logloss"/"accuracy" keys are presumably produced
        # by the feval built above -- confirm against build_feval.
        evals_df = pd.DataFrame({
            f"{metric}_{split}_f{fold:02d}": evals[split][metric]
            for split in ("train", "valid")
            for metric in ("logloss", "accuracy")
        })
        self.evals_df.append(evals_df)

        # Store predictions.
        preds_valid = model.predict(X_valid, num_iteration=model.best_iteration)
        self.preds_valid_all.loc[vdx, "pred"] = preds_valid

        preds_train = model.predict(X_train, num_iteration=model.best_iteration)
        self.preds_train_all.append(pd.DataFrame({fold: preds_train}, index=tdx))

        preds_test = model.predict(X_test, num_iteration=model.best_iteration)
        self.preds_test_all.append(preds_test)

        # Store performance metrics.
        best = model.best_score
        ms = [fold, best["train"]["accuracy"], best["valid"]["accuracy"],
                best["train"]["logloss"], best["valid"]["logloss"], model.best_iteration]
        self.mets.append(ms)
        show_mets(*ms)

        # Feature importances at the best iteration, one Series per type,
        # named after the fold.
        for it in ["gain", "split"]:
            imp = pd.Series(model.feature_importance(importance_type=it, iteration=model.best_iteration),
                    index=model.feature_name())
            imp.name = fold
            imp.index.name = "feature"
            self.importance[it].append(imp)
Exemple #2
0
def create_tarenc_agg_features(mode_target_persons, mode_target_cols):
    """Run ``create_tarenc_agg_features_1fold`` in "tarenc" mode for every fold.

    One joblib task is created per fold in ``range(args.FOLD_NUM)`` and the
    tasks are executed in parallel with half of ``args.n_jobs`` workers.

    Args:
        mode_target_persons: Forwarded unchanged to the per-fold function and
            embedded in the log/profiling tag.
        mode_target_cols: Forwarded unchanged to the per-fold function and
            embedded in the log/profiling tag.

    Returns:
        None. The per-fold results returned by joblib are discarded.
    """
    tag = f"tarenc_agg_tp{mode_target_persons}_tc{mode_target_cols}"
    logger.info(tag)
    mprof_timestamp(tag)

    # Use half of the configured workers. Floor division turns an n_jobs of
    # 0 or 1 into 0, which joblib rejects with a ValueError, so fall back to
    # a single worker in that case. Negative values (e.g. -1 for "all CPUs")
    # pass through exactly as before.
    n_jobs = args.n_jobs // 2 or 1

    tasks = [
        delayed(create_tarenc_agg_features_1fold)(
            fold, mode_target_persons, mode_target_cols, "tarenc")
        for fold in range(args.FOLD_NUM)
    ]
    Parallel(n_jobs=n_jobs, verbose=args.verbose_joblib)(tasks)
Exemple #3
0
def create_freqenc_agg_features(mode_target_persons, mode_target_cols):
    """Run ``create_tarenc_agg_features_1fold`` once in "freqenc" mode.

    The call is made with ``None`` as the fold argument, after logging and
    profiling-stamping a tag built from the two mode arguments.

    Args:
        mode_target_persons: Forwarded unchanged to the per-fold function and
            embedded in the log/profiling tag.
        mode_target_cols: Forwarded unchanged to the per-fold function and
            embedded in the log/profiling tag.

    Returns:
        None.
    """
    tag = f"freqenc_agg_tp{mode_target_persons}_tc{mode_target_cols}"
    logger.info(tag)
    mprof_timestamp(tag)

    no_fold = None
    create_tarenc_agg_features_1fold(
        no_fold, mode_target_persons, mode_target_cols, "freqenc")
Exemple #4
0
    os.makedirs(args.features_path, exist_ok=False)
else:  # 既存のdirに出力
    if args.DO_TEST:
        args.features_path = "../features/test"
    os.makedirs(args.features_path, exist_ok=True)

# Point logging at a timestamped log file inside the features directory.
# NOTE(review): logging_config.init is a project helper; presumably it
# configures the handlers used by `logger` -- confirm.
logging_config.init(f"{args.features_path}/log_{timestr}.log")

logger.info(f"features_path = {args.features_path}")

# Record every parsed argument in the log (formatted via pp.pformat).
logger.info("args =\n" + pp.pformat(vars(args)))

# Keep a record of this run next to its outputs: the arguments as JSON and a
# copy of the ../src tree, both suffixed with the run's timestamp string.
# shutil.copytree raises if the destination directory already exists.
util.dump_json(vars(args), f"{args.features_path}/args_{timestr}.json")
shutil.copytree("../src", args.features_path + "/src_" + timestr)

# Profiling marker for the stage labelled "basic".
mprof_timestamp("basic")


def create_team_agg_features_wrp(merge):
    ma = merge[[
        c for c in merge.columns if re.match("A[1234]-(level|rank-int)", c)
    ]]
    mb = merge[[
        c for c in merge.columns if re.match("B[1234]-(level|rank-int)", c)
    ]]
    ma_agg = create_team_agg_features(ma, "A")
    ma234_agg = create_team_agg_features(ma, "A234")
    mb_agg = create_team_agg_features(mb, "B")

    m_diff = create_team_agg_diff_features(ma_agg, mb_agg)
    m_diff_a234b = create_team_agg_diff_features(ma234_agg, mb_agg)