def train_1fold(self, fold, params, params_custom):
    """Train a LightGBM model on one fold and record its artefacts.

    The fold's data comes from ``self.get_fold_data(fold)``. After training,
    the method saves two model files (best iteration and last iteration) under
    ``self.models_path`` and appends the per-iteration evaluation history,
    predictions, best scores and feature importances to the corresponding
    collections on ``self``.

    Args:
        fold: Fold number. Used to fetch the data, to offset the seed and to
            label files, columns and log messages.
        params: LightGBM parameters. Deep-copied before use, so the caller's
            dict is not modified. Must contain a ``"seed"`` key (may be None).
        params_custom: Passed to ``build_feval`` to build the custom
            evaluation function.

    Returns:
        None. All results are stored on ``self`` or written to disk.
    """
    X_train, X_valid, y_train, y_valid, X_test, vdx, tdx = self.get_fold_data(fold)

    # The column dtypes are written once, on the first fold only.
    if fold == 0:
        X_train.dtypes.to_csv(self.models_path + "/dtypes.csv")
    logger.info(f"X_train.shape = {X_train.shape} f{fold:02d}")

    mprof_timestamp(f"lgb_dataset_f{fold}")
    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)
    custom_eval = build_feval(params_custom)

    # Run training
    history = {}
    fold_params = copy.deepcopy(params)
    train_callbacks = [log_evaluation(logger, period=10)]
    if fold_params["seed"] is not None:
        # Offset the seed by the fold number so each fold trains differently.
        fold_params["seed"] += fold
        logger.info(f"Set lgbm train seed = {fold_params['seed']}")

    logger.info(f"Start train f{fold:02d}")
    mprof_timestamp(f"lgb_train_f{fold}")
    model = lgb.train(
        fold_params,
        dtrain,
        valid_names=["train", "valid"],
        valid_sets=[dtrain, dvalid],
        verbose_eval=100,
        evals_result=history,
        feval=custom_eval,
        callbacks=train_callbacks,
        keep_training_booster=True,
    )
    logger.info(f"current_iteration={model.current_iteration()}")
    logger.info(f"best_iteration={model.best_iteration}")
    best_iter = model.best_iteration

    mprof_timestamp(f"lgb_postproc_f{fold}")
    # Save the model twice: truncated at the best iteration, and in full.
    model.save_model(
        self.models_path + f"/model-lgbm-f{fold:02d}.txt",
        num_iteration=best_iter,
    )
    model.save_model(
        self.models_path + f"/model-lgbm-last-f{fold:02d}.txt",
        num_iteration=-1,
    )

    # Per-iteration metric history, one column per (metric, data set) pair.
    train_hist = history["train"]
    valid_hist = history["valid"]
    fold_history = pd.DataFrame({
        f"logloss_train_f{fold:02d}": train_hist["logloss"],
        f"accuracy_train_f{fold:02d}": train_hist["accuracy"],
        f"logloss_valid_f{fold:02d}": valid_hist["logloss"],
        f"accuracy_valid_f{fold:02d}": valid_hist["accuracy"],
    })
    self.evals_df.append(fold_history)

    # Save predictions
    self.preds_valid_all.loc[vdx, "pred"] = model.predict(
        X_valid, num_iteration=best_iter
    )
    train_pred = model.predict(X_train, num_iteration=best_iter)
    self.preds_train_all.append(pd.DataFrame({fold: train_pred}, index=tdx))
    self.preds_test_all.append(model.predict(X_test, num_iteration=best_iter))

    # Save performance metrics
    best = model.best_score
    metric_row = [
        fold,
        best["train"]["accuracy"],
        best["valid"]["accuracy"],
        best["train"]["logloss"],
        best["valid"]["logloss"],
        best_iter,
    ]
    self.mets.append(metric_row)
    show_mets(*metric_row)

    # Feature importances at the best iteration, for both importance types.
    for imp_type in ("gain", "split"):
        scores = model.feature_importance(
            importance_type=imp_type, iteration=best_iter
        )
        importance = pd.Series(scores, index=model.feature_name(), name=fold)
        importance.index.name = "feature"
        self.importance[imp_type].append(importance)
def create_tarenc_agg_features(mode_target_persons, mode_target_cols):
    """Run ``create_tarenc_agg_features_1fold`` in "tarenc" mode for every fold.

    One joblib task is dispatched per fold in ``range(args.FOLD_NUM)``. The
    results of the tasks are discarded.

    Args:
        mode_target_persons: Forwarded unchanged to the per-fold worker and
            embedded in the log / profiler tag.
        mode_target_cols: Forwarded unchanged to the per-fold worker and
            embedded in the log / profiler tag.
    """
    tag = f"tarenc_agg_tp{mode_target_persons}_tc{mode_target_cols}"
    logger.info(tag)
    mprof_timestamp(tag)

    # Only half of the configured workers are used here. With
    # ``args.n_jobs == 1`` the floor division yields 0, which joblib rejects
    # with a ValueError, so fall back to a single worker in that case.
    # Negative values (e.g. -1 for "all cores") stay negative under floor
    # division and pass through unchanged.
    n_workers = args.n_jobs // 2 or 1

    Parallel(n_jobs=n_workers, verbose=args.verbose_joblib)(
        delayed(create_tarenc_agg_features_1fold)(
            fold, mode_target_persons, mode_target_cols, "tarenc"
        )
        for fold in range(args.FOLD_NUM)
    )
def create_freqenc_agg_features(mode_target_persons, mode_target_cols):
    """Run ``create_tarenc_agg_features_1fold`` once in "freqenc" mode.

    The per-fold worker is called a single time with ``None`` as its fold
    argument; no parallel dispatch is involved.

    Args:
        mode_target_persons: Forwarded unchanged to the worker and embedded in
            the log / profiler tag.
        mode_target_cols: Forwarded unchanged to the worker and embedded in
            the log / profiler tag.
    """
    tag = f"freqenc_agg_tp{mode_target_persons}_tc{mode_target_cols}"
    logger.info(tag)
    mprof_timestamp(tag)
    create_tarenc_agg_features_1fold(
        None, mode_target_persons, mode_target_cols, "freqenc"
    )
os.makedirs(args.features_path, exist_ok=False) else: # 既存のdirに出力 if args.DO_TEST: args.features_path = "../features/test" os.makedirs(args.features_path, exist_ok=True) logging_config.init(f"{args.features_path}/log_{timestr}.log") logger.info(f"features_path = {args.features_path}") logger.info("args =\n" + pp.pformat(vars(args))) util.dump_json(vars(args), f"{args.features_path}/args_{timestr}.json") shutil.copytree("../src", args.features_path + "/src_" + timestr) mprof_timestamp("basic") def create_team_agg_features_wrp(merge): ma = merge[[ c for c in merge.columns if re.match("A[1234]-(level|rank-int)", c) ]] mb = merge[[ c for c in merge.columns if re.match("B[1234]-(level|rank-int)", c) ]] ma_agg = create_team_agg_features(ma, "A") ma234_agg = create_team_agg_features(ma, "A234") mb_agg = create_team_agg_features(mb, "B") m_diff = create_team_agg_diff_features(ma_agg, mb_agg) m_diff_a234b = create_team_agg_diff_features(ma234_agg, mb_agg)