def mk_submit():
    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
        axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # =============================================================================
    # remove old users
    # =============================================================================
    X_train['SK_ID_CURR'] = SK_ID_CURR
    y_train = y_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    X_train = X_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    oof_train = X_train[['SK_ID_CURR']]
    X_train.drop('SK_ID_CURR', axis=1, inplace=True)

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    CAT = list(set(X_train.columns) & set(utils_cat.ALL))
    COL = X_train.columns.tolist()

    # test
    X_test = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train, y_train,
                         categorical_feature=CAT, free_raw_data=False)
    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()
        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean}"
        print(result)
        utils.send_line(result)

    y_pred /= y_pred.max()
    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # save
    oof_train['oof'] = y_pred
    oof_train.to_csv('../output/onodera-last-oof.csv', index=False)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
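# -----------------------------------------------------------------------------
# A minimal, self-contained sketch of the rank-averaging used in the predict
# step above: because AUC depends only on the ordering of scores, each model's
# raw predictions are converted to ranks before averaging, so models with
# differently scaled outputs become comparable; dividing by the max just
# squeezes the blend back into (0, 1]. The toy Series are hypothetical.
# -----------------------------------------------------------------------------
import pandas as pd

pred_a = pd.Series([0.01, 0.90, 0.30])   # hypothetical model outputs
pred_b = pd.Series([10.0, 250.0, 40.0])  # same ordering, different scale

blend = pred_a.rank() + pred_b.rank()
blend /= blend.max()
print(blend.tolist())                    # [1/3, 1.0, 2/3] — ordering preserved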
EXE_SUBMIT = True

np.random.seed(SEED)
print('seed :', SEED)

# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_805'):
        break
    else:
        sleep(60 * 1)

utils.send_line(f'START {__file__}')

# =============================================================================
# load train
# =============================================================================
dtrain = lgb.Dataset('../data/dtrain.mt')
gc.collect()

# =============================================================================
# lightgbm
# =============================================================================
param = {
    'objective': 'binary',
    'metric': 'auc',
    'max_bin': 100,
                         verbose_eval=50,
                         seed=SEED)
    y_pred = ex.eval_oob(X[COL], y.values, models, SEED,
                         stratified=True, shuffle=True, n_class=True)
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['wloss-mean'])
    wloss_list.append(ret['wloss-mean'][-1])

nround_mean = int((nround_mean / 2) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)
utils.send_line(result)

for i, y_pred in enumerate(y_preds):
    y_pred = pd.DataFrame(utils_metric.softmax(y_pred.astype(float).values))
    if i == 0:
        tmp = y_pred
    else:
        tmp += y_pred
tmp /= len(y_preds)
y_preds = tmp.copy().values.astype(float)

a_score = utils_metric.akiyama_metric(y.values, y_preds)
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

    oof = np.zeros((len(train_df), len(cfg.models)))
    preds = np.zeros((len(test_df), len(cfg.models)))
    for i, m in enumerate(cfg.models):
        name = getattr(cfg.models, m).name
        log_dir = Path(f'../logs/{name}')
        model_oof = dh.load(log_dir / 'oof.npy')
        model_cfg = dh.load(log_dir / 'config.yml')
        if model_cfg.common.drop:
            drop_idxs = np.array([])
            for drop_name in model_cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            # fill all dropped rows, not only the last batch
            model_oof = factory.fill_dropped(model_oof, drop_idxs)
        model_preds = dh.load(f'../logs/{name}/raw_preds.npy')
        oof[:, i] = model_oof[:len(train_df)]
        preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idxs = np.array([])
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            train_df = train_df.drop(drop_idxs, axis=0).reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
            p_list = [0 for i in range(len(cfg.models))]
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(
                    f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i in range(oof.shape[1]):
                y_pred += oof[:, i] * p_list[i]
            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)
        best_params = list(study.best_params.values())
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i in range(len(best_weight)):
            ensemble_oof += oof[:, i] * best_weight[i]
            ensemble_preds += preds[:, i] * best_weight[i]

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

        cv = metric(y_true, ensemble_oof)
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.4f}')
        print(f'BEST WEIGHT: {best_weight}')
        print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
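# -----------------------------------------------------------------------------
# A minimal sketch of the simplex-constrained weight search above, assuming
# nothing beyond numpy and optuna: each trial draws a weight from the budget
# left over by the previous ones and the last weight is the remainder, so the
# blend weights always sum to 1. `toy_oof` / `toy_y` are hypothetical stand-ins
# for the real OOF matrix, and MSE stands in for the configured metric.
# -----------------------------------------------------------------------------
import numpy as np
import optuna

rng = np.random.default_rng(0)
toy_y = rng.integers(0, 2, 1000).astype(float)
toy_oof = np.stack([toy_y * 0.7 + rng.normal(0, 0.3, 1000),
                    toy_y * 0.5 + rng.normal(0, 0.5, 1000)], axis=1)

def objective(trial):
    w0 = trial.suggest_float('w0', 0.0, 1.0, step=0.01)
    w1 = round(1.0 - w0, 2)                  # remainder keeps sum == 1
    blend = toy_oof[:, 0] * w0 + toy_oof[:, 1] * w1
    return np.mean((toy_y - blend) ** 2)     # stand-in for the real metric

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print(study.best_params)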
#        'lambda_l1': 3,
#        'lambda_l2': 3,
         'seed': SEED
         }

# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_801'):
        break
    else:
        sleep(60 * 1)

utils.send_line('START {}'.format(__file__))

# =============================================================================
# load
# =============================================================================
imp = pd.read_csv('imp_802_importance_502-2.py.csv').set_index('index')
feature_all = imp[imp.weight != 0].index.tolist()

X_train = pd.read_feather('../data/X_train_mini.f')[feature_all]
y_train = pd.read_feather('../data/y_train_mini.f').is_attributed
gc.collect()

X_valid = pd.read_feather('../data/X_valid_mini.f')[feature_all]
y_valid = pd.read_feather('../data/y_valid_mini.f').is_attributed
gc.collect()
    gc.collect()
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD,
                         feval=utils_metric.lgb_multi_weighted_logloss_gal,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=SEED)
    model_all += models
    nround_mean += len(ret['multi_logloss-mean'])
    wloss_list.append(ret['wloss-mean'][-1])

nround_mean = int((nround_mean / LOOP) * 1.3)

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)
utils.send_line(result)

imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

print(imp.head(200).feature.map(lambda x: x.split('_')[0]).value_counts())

COL = imp.feature.tolist()[:3000]
X_gal = X_gal[COL]

# ======== for gal ========
X_gal['g'] = np.arange(X_gal.shape[0]) % NFOLD
    nround_mean += len(ret['auc-mean'])
    loss_list.append(ret['auc-mean'][-1])

nround_mean = int((nround_mean / LOOP) * 1.3)

imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

for i, y_pred in enumerate(y_preds):
    if i == 0:
        oof = y_pred
    else:
        oof += y_pred
oof /= len(y_preds)

imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)
pd.DataFrame(oof, columns=['oof']).to_csv(f'../data/oof_{__file__}.csv',
                                          index=False)
utils.savefig_imp(imp, f'LOG/imp_{__file__}.png', x='total')
utils.send_line(f'oof AUC({var}): {round(roc_auc_score(y_train, oof), 5)}')

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
def mk_submit():
    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
        axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))
    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # groupKfold
    # =============================================================================
    sk_tbl = pd.read_csv('../data/user_id_v8.csv.gz')  # TODO: check
    user_tbl = sk_tbl.user_id.drop_duplicates().reset_index(
        drop=True).to_frame()

    sub_train = pd.read_csv('../input/application_train.csv.zip',
                            usecols=['SK_ID_CURR']).set_index('SK_ID_CURR')
    sub_train['y'] = y_train.values

    group_kfold = GroupKFold(n_splits=NFOLD)

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train, y_train,
                             categorical_feature=CAT, free_raw_data=False)

        # shuffle fold
        ids = list(range(user_tbl.shape[0]))
        np.random.shuffle(ids)
        user_tbl['g'] = np.array(ids) % NFOLD
        sk_tbl_ = pd.merge(sk_tbl, user_tbl, on='user_id',
                           how='left').set_index('SK_ID_CURR')
        sub_train['g'] = sk_tbl_.g
        folds = group_kfold.split(X_train, sub_train['y'], sub_train['g'])

        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, folds=folds,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
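# -----------------------------------------------------------------------------
# A minimal sketch of the fold-shuffling trick above, under the assumption
# that several rows (SK_ID_CURR) can belong to one underlying user: every user
# gets a random fold id via a shuffled index modulo NFOLD, and GroupKFold on
# that id keeps all rows of a user in the same fold while re-randomizing the
# folds on every loop iteration. The toy frame is hypothetical.
# -----------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

NFOLD = 3
rows = pd.DataFrame({'SK_ID_CURR': range(12),
                     'user_id': [0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7]})

users = rows.user_id.drop_duplicates().reset_index(drop=True).to_frame()
users['g'] = np.random.permutation(len(users)) % NFOLD  # random, balanced ids

groups = rows.merge(users, on='user_id', how='left')['g']
for trn_idx, val_idx in GroupKFold(NFOLD).split(rows, groups=groups):
    # no user appears in both train and validation
    assert not set(rows.user_id.iloc[trn_idx]) & set(rows.user_id.iloc[val_idx])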
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param, dtrain, NROUND,
                         nfold=NFOLD, stratified=True, shuffle=True,
                         feval=ex.eval_auc,
                         early_stopping_rounds=ESR,
                         verbose_eval=VERBOSE_EVAL,
                         seed=SEED + i)
    p_train = ex.eval_oob(X_train[col], y_train.values, models, SEED + i,
                          stratified=True, shuffle=True)
    model_all += models
    nround_mean += len(ret['auc-mean'])
    loss_list.append(ret['auc-mean'][-1])
    utils.send_line(
        f'oof AUC({i}): {round(roc_auc_score(y_train, p_train), 5)}')

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
def mk_submit():
    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
        axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # remove old users
    X_train = X_train[new_train_users]
    y_train = y_train[new_train_users]

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))
    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train, y_train,
                             categorical_feature=CAT, free_raw_data=False)
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
    p_test = model.predict(X_test)
#    valid_idx_uq = pd.unique(valid_idx % 200300)
    for j in range(var_len):
#        id_y.loc[valid_idx, 'pred'] = p_valid
        oof[valid_idx] = p_valid
#        oof[valid_idx_uq, j] = p_valid[j*l:(j+1)*l]
        p_test_all[:, j, i] = p_test[j * 100000:(j + 1) * 100000]
    models.append(model)

id_y['pred'] = oof
oof = pd.pivot_table(id_y, index='id', columns='var',
                     values='pred').head(200000).values

auc = roc_auc_score(y_train, (9 * oof / (1 - oof)).prod(axis=1))
utils.send_line(f'AUC(all var): {auc}')

l = y_train.shape[0]
oof_odds = np.ones(l) * 1 / 9
for j in range(var_len):
    if roc_auc_score(y_train, oof[:, j]) >= 0.500:
        oof_odds *= (9 * oof[:, j] / (1 - oof[:, j]))
auc = roc_auc_score(y_train, oof_odds)
print(f'AUC(th0.5): {auc}')

sub_train = pd.DataFrame(zip(y_train, oof_odds), columns=['y', 'p'])
sub_train.sort_values('p', ascending=False, inplace=True)

for i in range(100, 2000, 100):
    sub_train_ = sub_train.head(i)
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=SEED + i)
    model_all += models
    nround_mean += len(ret['multi_logloss-mean'])
    wloss_list.append(ret['multi_logloss-mean'][-1])

nround_mean = int((nround_mean / LOOP) * 1.3)

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
utils.send_line(result)

imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

print(imp.head(100).feature.map(lambda x: x.split('_')[0]).value_counts())

imp.to_csv(f'LOG/imp_{__file__}-1.csv', index=False)

"""
__file__ = '816_cv_mlogloss.py'
                         free_raw_data=False)
    model = lgb.train(params, train_set=dtrain, num_boost_round=NROUND,
                      verbose_eval=100)
    l = valid_idx.shape[0]
    p_valid = model.predict(X_valid)
    p_test = model.predict(X_test)
    for j in range(var_len):
        oof[valid_idx, j] = p_valid[j * l:(j + 1) * l]
        p_test_all[:, j, i] = p_test[j * 100000:(j + 1) * 100000]
    models.append(model)

auc = roc_auc_score(y_train, (9 * oof / (1 - oof)).prod(axis=1))
utils.send_line(f'{ayasii} AUC(all var): {auc}, {AUC_bench1 - auc}')

l = y_train.shape[0]
oof_odds = np.ones(l) * 1 / 9
for j in range(var_len):
    if roc_auc_score(y_train, oof[:, j]) >= 0.500:
        oof_odds *= (9 * oof[:, j] / (1 - oof[:, j]))
auc = roc_auc_score(y_train, oof_odds)
utils.send_line(f'{ayasii} AUC(th0.5): {auc}, {AUC_bench2 - auc}')

#==============================================================================
utils.end(__file__)
utils.stop_instance()
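# -----------------------------------------------------------------------------
# A minimal sketch of the odds-product combination used above, assuming the
# per-variable predictions are (approximately) independent given the target
# and that the positive-class prior is 0.1 (prior odds 1/9): each factor
# 9 * p / (1 - p) is the likelihood ratio implied by one variable's
# prediction, and their product updates the prior odds variable by variable.
# The probabilities below are hypothetical.
# -----------------------------------------------------------------------------
import numpy as np

prior_odds = 1 / 9
p_per_var = np.array([0.12, 0.08, 0.30])     # hypothetical per-var P(y=1|x_j)

posterior_odds = prior_odds * np.prod(9 * p_per_var / (1 - p_per_var))
posterior_p = posterior_odds / (1 + posterior_odds)
print(posterior_odds, posterior_p)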
# =============================================================================
dtrain = lgb.Dataset(
    X, y,
    categorical_feature=list(set(X.columns) & set(categorical_feature)))
gc.collect()

ret = lgb.cv(param, dtrain, 9999, nfold=5,
             early_stopping_rounds=50, verbose_eval=10,
             seed=SEED)

print(f"CV auc-mean {ret['auc-mean'][-1]}")
best_score = ret['auc-mean'][-1]
utils.send_line(f'all features best_score: {best_score}')

# =============================================================================
#
# =============================================================================
dtrain = lgb.Dataset(
    X, y,
    categorical_feature=list(set(X.columns) & set(categorical_feature)))
model = lgb.train(param, dtrain, len(ret['auc-mean']))

imp = ex.getImp(model)
imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)

#==============================================================================
utils.end(__file__)
utils.stop_instance()
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x,
                                       train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x,
                                       train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df],
                            axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
        for i, y_pred in enumerate(y_preds):
            if i == 0:
                oof = y_pred
            else:
                oof += y_pred
        oof /= len(y_preds)

        auc = round(roc_auc_score(y_train, oof), 5)
        if auc_best < auc:
            auc_best = auc
            round_best = round_
            oof_best = oof
        utils.send_line(f'oof AUC({var, round_}): {auc}')
        result.append(auc)
    result_all.append(result)

    oof_best = pd.DataFrame(oof_best, columns=['oof'])
    oof_best.to_pickle(f'../data/806/oof_{__file__}_{var}_r{round_best}.pkl')

#    imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)
#    utils.savefig_imp(imp, f'LOG/imp_{__file__}.png', x='total')

result_all = pd.DataFrame(result_all,
                          columns=['r4', 'r3', 'r2', 'r1', 'r0'],
                          index=var_names)
result_all.to_csv(f'LOG/auc_{__file__}.csv')
    save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True)

with t.timer('save features importances'):
    save_importances(RUN_NAME, models, FEATURES)

with t.timer('make submission'):
    output_path = LOGGER_PATH / f'{METER_TYPE}.csv'
    make_submission(y_pred=np.mean(preds, axis=1),
                    target_name=TARGET_NAME,
                    sample_path=SAMPLE_SUB_PATH,
                    output_path=str(output_path),
                    comp=True)

LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}')

process_minutes = t.get_processing_time()

with t.timer('notify'):
    message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]'''
    send_line(NOTIFY_PARAMS['line']['token'], message)

    send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'],
                url=NOTIFY_PARAMS['notion']['url'],
                name=RUN_NAME,
                created=NOW,
                model=MODEL_NAME.split('_')[0],
                local_cv=round(np.mean(scores), 4),
                time_=process_minutes,
                comment=COMMENT)
    y_pred /= len(model_all)
    sub[f'class_{i}'] = y_pred

# Compute preds_99 as the proba of class not being any of the others
# preds_99 = 0.1 gives 1.769
sub.to_csv('../output/sub.csv.gz', index=False, compression='gzip')
preds_99 = np.ones(sub.shape[0])
for i in range(sub.shape[1] - 1):
    preds_99 *= (1 - sub.iloc[:, i + 1])
sub['class_99'] = preds_99

sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

png = f'LOG/sub_{__file__}.png'
utils.savefig_sub(sub, png)
utils.send_line('DONE!', png)

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    print('submit')
    utils.submit(SUBMIT_FILE_PATH, COMMENT)

#==============================================================================
utils.end(__file__)
utils.stop_instance()
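# -----------------------------------------------------------------------------
# A minimal sketch of the class_99 heuristic above (the "unknown" class): if
# p_k is the predicted probability of each known class, the probability of
# being none of them is approximated by the product of (1 - p_k). This would
# be exact only if the per-class predictions were independent; here it is
# simply a well-performing heuristic. The probabilities are hypothetical.
# -----------------------------------------------------------------------------
import numpy as np

p_known = np.array([0.6, 0.2, 0.1])      # hypothetical known-class probas
p_unknown = np.prod(1 - p_known)         # 0.4 * 0.8 * 0.9 = 0.288
print(p_unknown)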
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ], axis=0, sort=False, ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
                         verbose_eval=50,
                         seed=SEED)
    y_pred = ex.eval_oob(X[feature_set[i]], y.values, models, SEED,
                         stratified=True, shuffle=True, n_class=True)
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['multi_logloss-mean'])
    loss_list.append(ret['multi_logloss-mean'][-1])

nround_mean = int((nround_mean / MOD_N) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV multi_logloss: {np.mean(loss_list)} + {np.std(loss_list)}"
utils.send_line(result)

for i, y_pred in enumerate(y_preds):
    if i == 0:
        oof = y_pred
    else:
        oof += y_pred
oof /= len(y_preds)

oof.to_pickle(f'../data/oof_{__file__}.pkl')

oid_gal = pd.read_pickle('../data/tr_oid_gal.pkl')['object_id'].tolist()
oid_exgal = pd.read_pickle('../data/tr_oid_exgal.pkl')['object_id'].tolist()
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

gc.collect()

CAT = list(set(X.columns) & set(utils_cat.ALL))

# =============================================================================
# cv
# =============================================================================
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
gc.collect()

ret = lgb.cv(param, dtrain, 9999, nfold=7,
             early_stopping_rounds=100, verbose_eval=50,
             seed=SEED)

result = f"CV auc-mean({HEAD}): {ret['auc-mean'][-1]}"
print(result)
utils.send_line(result)

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
def mk_submit(HEAD):
    SUBMIT_FILE_PATH_ = SUBMIT_FILE_PATH.replace('feature', str(HEAD))
    files_tr = ('../feature/train_' + imp.head(HEAD).feature + '.f').tolist()
    files_te = ('../feature/test_' + imp.head(HEAD).feature + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)],
        axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH_.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    CAT = list(set(X_train.columns) & set(utils_cat.ALL))
    COL = X_train.columns.tolist()

    # test
    X_test = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train, y_train,
                         categorical_feature=CAT, free_raw_data=False)
    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

    y_pred /= y_pred.max()
    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean(feature {HEAD}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH_, index=False, compression='gzip')
print(result)

imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)

png = f'LOG/imp_{__file__}.png'
utils.savefig_imp(imp, png, x='total', title=f'{__file__}')
utils.send_line(result, png)

for i, y_pred in enumerate(y_preds):
    y_pred = pd.DataFrame(utils_metric.softmax(y_pred.astype(float).values))
    if i == 0:
        tmp = y_pred
    else:
        tmp += y_pred
tmp /= len(y_preds)
y_preds = tmp.copy().values.astype(float)

a_score = utils_metric.akiyama_metric(y.values, y_preds)
print('akiyama_metric:', a_score)

# =============================================================================
def mk_submit():
    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    col = [c for c in X_train.columns if c.startswith('f702_')]
    X_train.drop(col, axis=1, inplace=True)
    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))
    print('CAT :', CAT)
    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train, y_train,
                         categorical_feature=CAT, free_raw_data=False)
    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()
        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}"
        print(result)
        utils.send_line(result)

    y_pred /= y_pred.max()
    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
EXE_SUBMIT = True

np.random.seed(SEED)
print('seed :', SEED)

# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_802'):
        break
    else:
        sleep(60 * 1)

utils.send_line('{} start'.format(__file__))

# =============================================================================
# load train
# =============================================================================
dtrain = lgb.Dataset('../data/dtrain.mt')
gc.collect()

# =============================================================================
# lightgbm
# =============================================================================
param = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
        print(f'add {c}')
        dtrain = lgb.Dataset(X[features_new], y,
                             categorical_feature=list(
                                 set(features_new) & set(categorical_feature)))
        ret = lgb.cv(param, dtrain, 9999, nfold=5,
                     early_stopping_rounds=50, verbose_eval=None,
                     seed=SEED)
        score = ret['auc-mean'][-1]
        print(f"auc-mean {score}")

        if best_score < score:
            print(f'UPDATE! SCORE:{score:+.5f} DIFF:{score-best_score:+.5f}')
            print(f'features: {features_new}')
            best_score = score
            features_curr = features_new
            utils.send_line(f'{c}: {best_score}')

if False:
    # =============================================================================
    # best
    # =============================================================================
    features = ['app_001_AMT_ANNUITY', 'app_001_AMT_CREDIT',
                'app_001_AMT_GOODS_PRICE', 'app_001_APARTMENTS_AVG',
                'app_001_CODE_GENDER', 'app_001_COMMONAREA_AVG',
                'app_001_DAYS_BIRTH', 'app_001_DAYS_EMPLOYED',
                'app_001_DAYS_EMPLOYED-m-DAYS_BIRTH',
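# -----------------------------------------------------------------------------
# A minimal sketch of the greedy forward selection driving the loop above,
# with a toy scorer standing in for the lgb.cv AUC (the real code re-runs
# cross-validation for every candidate): starting from the current set, each
# candidate feature is kept only if it improves the best score so far.
# -----------------------------------------------------------------------------
def forward_select(candidates, score_fn):
    selected, best = [], float('-inf')
    for c in candidates:
        trial = selected + [c]
        s = score_fn(trial)
        if s > best:                 # keep the feature only on improvement
            best = s
            selected = trial
    return selected, best

# toy scorer: pretend only 'a' and 'c' help, with a small size penalty
toy_score = lambda feats: len(set(feats) & {'a', 'c'}) - 0.01 * len(feats)
print(forward_select(['a', 'b', 'c'], toy_score))    # (['a', 'c'], 1.98)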
#     pretrained: False
    num_classes = train_params.model_params.n_classes
    model_weight, oof_list, best_score, train_loss_list, val_loss_list, val_score_list = train_model(
        x_trn, x_val, train_params, num_classes, weights, device)

    np.save(f'../logs/{run_name}/oof_gr.npy', oof_list[0])
    np.save(f'../logs/{run_name}/oof_vo.npy', oof_list[1])
    np.save(f'../logs/{run_name}/oof_co.npy', oof_list[2])
    torch.save(model_weight, f'../logs/{run_name}/weight_best.pt')

    save_png(run_name, train_params, train_loss_list,
             val_loss_list, val_score_list)

logging.disable(logging.FATAL)
logger_path.rename(f'../logs/{run_name}_{best_score:.3f}')

process_minutes = t.get_processing_time(type='hour')
with t.timer('notify'):
    message = f'''{model_name}\ncv: {best_score:.3f}\ntime: {process_minutes:.2f}[h]'''
    send_line(notify_params.line.token, message)
    send_notion(token_v2=notify_params.notion.token_v2,
                url=notify_params.notion.url,
                name=run_name,
                created=now,
                model=train_params.model_params.model_name,
                local_cv=round(best_score, 4),
                time_=process_minutes,
                comment=comment)
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        root = Path(cfg.common.input_root)
        train_df = dh.load(root / cfg.common.img_file)

    with t.timer('create target dataframe'):
        ordinal_target = np.zeros((len(train_df), 6))
        for idx in train_df.index:
            target = train_df.loc[idx, 'isup_grade']
            ordinal_target[idx, :] = [
                1 if target >= i else 0 for i in range(6)
            ]
        target_df = pd.DataFrame(ordinal_target,
                                 columns=[f'target_{i}' for i in range(6)])

    with t.timer('drop several rows'):
        if cfg.common.drop.name is not None:
            drop_idx = dh.load(f'../pickle/{cfg.common.drop.name}.npy')
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            target_df = target_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('make folds'):
        train_x_all = train_df.drop('isup_grade', axis=1)
        train_y_all = train_df['isup_grade']
        if cfg.model.n_classes == 1:
            train_y_all = train_y_all.astype(float)

        trn_x, val_x, trn_y, val_y = train_test_split(
            train_x_all, target_df,
            test_size=0.2, shuffle=True,
            random_state=cfg.common.seed,
            stratify=train_df['isup_grade'])

    with t.timer('train model'):
        result = train_ordinal_reg(run_name, trn_x, val_x, trn_y, val_y, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
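# -----------------------------------------------------------------------------
# A minimal sketch of the ordinal-target encoding built above: an ISUP grade
# g in {0..5} becomes six cumulative binary targets [1 if g >= i], and a
# prediction is decoded back by counting thresholds passed. The decode step
# is an assumption for illustration, not shown in the original script.
# -----------------------------------------------------------------------------
import numpy as np

def encode(grade, n=6):
    return np.array([1.0 if grade >= i else 0.0 for i in range(n)])

def decode(probs, th=0.5):
    return int((np.asarray(probs) > th).sum() - 1)   # subtract the g >= 0 slot

assert encode(3).tolist() == [1, 1, 1, 1, 0, 0]
assert decode([0.9, 0.8, 0.7, 0.6, 0.2, 0.1]) == 3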
                      verbose_eval=100)
    l = valid_idx.shape[0]
    p_valid = model.predict(X_valid)
    p_test = model.predict(X_test)
    oof[valid_idx] = p_valid        # long-format oof: one write per fold
    for j in range(var_len):
        p_test_all[:, j, i] = p_test[j * 100000:(j + 1) * 100000]
    models.append(model)

id_y['pred'] = oof
oof = pd.pivot_table(id_y, index='id', columns='var', values='pred').values

auc = f'seed{SEED} AUC(all var): {roc_auc_score(y_train, (9 * oof / (1 - oof)).prod(axis=1))}'
utils.send_line(auc)

l = y_train.shape[0]
oof_odds = np.ones(l) * 1 / 9
for j in range(var_len):
    if roc_auc_score(y_train, oof[:, j]) >= 0.500:
        oof_odds *= (9 * oof[:, j] / (1 - oof[:, j]))
auc = f'seed{SEED} AUC(th0.5): {roc_auc_score(y_train, oof_odds)}'
utils.send_line(auc)

# save raw pred
np.save(f'../data/{__file__}_oof', oof)
np.save(f'../data/{__file__}_p_test_all', p_test_all)

# =============================================================================
                         seed=SEED)
    y_pred = ex.eval_oob(X_52_90, y_52_90.values, models, SEED,
                         stratified=True, shuffle=True)
    oofs.append(y_pred)
    model_all += models
    nround_mean += len(ret['auc-mean'])
    wloss_list.append(ret['auc-mean'][-1])

nround_mean = int((nround_mean / 2) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
utils.send_line(result)

for i, y_pred in enumerate(oofs):
    if i == 0:
        tmp = y_pred
    else:
        tmp += y_pred
tmp /= len(oofs)
oof = tmp.copy().values.astype(float)

X_oid_52_90['52vs90'] = oof

# =============================================================================