def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

    oof = np.zeros((len(train_df), len(cfg.models)))
    preds = np.zeros((len(test_df), len(cfg.models)))

    for i, m in enumerate(cfg.models):
        name = getattr(cfg.models, m).name
        log_dir = Path(f'../logs/{name}')

        model_oof = dh.load(log_dir / 'oof.npy')
        model_cfg = dh.load(log_dir / 'config.yml')

        # Restore rows that were dropped when this model was trained,
        # so every model's OOF has the same length.
        if model_cfg.common.drop:
            drop_idxs = np.array([])
            for drop_name in model_cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
                model_oof = factory.fill_dropped(model_oof, drop_idx)

        model_preds = dh.load(f'../logs/{name}/raw_preds.npy')

        oof[:, i] = model_oof[:len(train_df)]
        preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idxs = np.array([])
            # Fixed: iterate this run's drop list (was `model_cfg.common.drop`,
            # which referenced the last loaded model's config).
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            train_df = train_df.drop(drop_idxs, axis=0).reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
            # Sample the first n-1 weights on a shrinking interval so the
            # full weight vector sums to 1.
            p_list = [0 for _ in range(len(cfg.models))]
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i in range(oof.shape[1]):
                y_pred += oof[:, i] * p_list[i]

            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)

        best_params = list(study.best_params.values())
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i in range(len(best_weight)):
            ensemble_oof += oof[:, i] * best_weight[i]
            ensemble_preds += preds[:, i] * best_weight[i]

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

        cv = metric(y_true, ensemble_oof)
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.4f}')
        print(f'BEST WEIGHT: {best_weight}')
        print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
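# A minimal self-contained sketch of the weight search above, assuming synthetic
# data and RMSE as the metric (the real metric comes from factory.get_metrics).
# As in the objective, the first n-1 weights are sampled on a shrinking interval
# and the last takes the remainder, so the vector stays on the simplex.
import numpy as np
import optuna


def search_blend_weights(oof, y_true, n_trials=100):
    """Find convex-combination weights for the columns of `oof`."""
    n_models = oof.shape[1]

    def objective(trial):
        weights = []
        for i in range(n_models - 1):
            upper = 1.0 - sum(weights)
            weights.append(trial.suggest_float(f'p{i}', 0.0, upper, step=0.01))
        weights.append(round(1.0 - sum(weights), 2))  # remainder to the last model
        blend = oof @ np.array(weights)
        return np.sqrt(np.mean((y_true - blend) ** 2))  # RMSE, assumed here

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)
    best = list(study.best_params.values())
    return best + [round(1.0 - sum(best), 2)]


# Usage: two fake model predictions for 1000 rows.
rng = np.random.default_rng(0)
y = rng.normal(size=1000)
oof_demo = np.stack([y + rng.normal(scale=0.5, size=1000),
                     y + rng.normal(scale=0.8, size=1000)], axis=1)
print(search_blend_weights(oof_demo, y))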
# pretrained: False
num_classes = train_params.model_params.n_classes

# Fixed typo: `model_wight` -> `model_weight`.
model_weight, oof_list, best_score, train_loss_list, val_loss_list, val_score_list = train_model(
    x_trn, x_val, train_params, num_classes, weights, device)

np.save(f'../logs/{run_name}/oof_gr.npy', oof_list[0])
np.save(f'../logs/{run_name}/oof_vo.npy', oof_list[1])
np.save(f'../logs/{run_name}/oof_co.npy', oof_list[2])
torch.save(model_weight, f'../logs/{run_name}/weight_best.pt')

save_png(run_name, train_params, train_loss_list, val_loss_list, val_score_list)

logging.disable(logging.FATAL)
logger_path.rename(f'../logs/{run_name}_{best_score:.3f}')
process_minutes = t.get_processing_time(type='hour')

with t.timer('notify'):
    message = f'''{model_name}\ncv: {best_score:.3f}\ntime: {process_minutes:.2f}[h]'''
    send_line(notify_params.line.token, message)
    send_notion(token_v2=notify_params.notion.token_v2,
                url=notify_params.notion.url,
                name=run_name,
                created=now,
                model=train_params.model_params.model_name,
                local_cv=round(best_score, 4),
                time_=process_minutes,
                comment=comment)
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df, train_df[['target']])
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ], axis=0, sort=False, ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x, target_df=train_y, fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_x = dh.load('../data/team/X_tra_wo_lec_20M.feather')
        val_x = dh.load('../data/team/X_val_wo_lec.feather')

        train_x['is_val'] = 0
        val_x['is_val'] = 1

        train_x = pd.concat([train_x, val_x], axis=0, sort=False, ignore_index=True)
        train_y = train_x[const.TARGET_COLS]

        use_row_id = train_x['row_id'].values
        val_idx = train_x[train_x['is_val'] == 1].index
        drop_cols = set(train_x.columns) - set(features)
        train_x = train_x.drop(drop_cols, axis=1)

    with t.timer('load additional features'):
        add_df = pd.DataFrame(index=train_x.index)
        additional_cols = set(features) - set(train_x.columns)
        for col in additional_cols:
            feat_df = pd.read_feather(f'../features/{col}_train.feather')
            add_df[col] = feat_df.loc[use_row_id, col].values
        add_df = reduce_mem_usage(add_df)
        train_x = pd.concat([train_x, add_df], axis=1)

    with t.timer('preprocessing'):
        pass

    with t.timer('make folds'):
        fold_df = pd.DataFrame(index=range(len(train_x)))
        fold_df['fold_0'] = 0
        fold_df.loc[val_idx, 'fold_0'] += 1

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x, target_df=train_y, fold_df=fold_df)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name=cfg.model.name,
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_x = factory.get_features(features, cfg.data.loader.train)
        train_y = factory.get_target(cfg.data.target)

    # with t.timer('add oof'):
    #     if cfg.data.features.oof.name is not None:
    #         oof, preds = factory.get_result(cfg.data.features.oof.name, cfg)
    #         for i in range(oof.shape[1]):
    #             oof_col_name = f'oof_{const.TARGET_COLS[i]}'
    #             train_x[oof_col_name] = oof[:, i]
    #             features.append(oof_col_name)

    with t.timer('make folds'):
        valid_idx = np.load('../data/processed/cv1_valid.npy')
        # valid_idx = np.load('../data/processed/cv1_valid_dropped.npy')
        fold_df = pd.DataFrame(index=range(len(train_x)))
        fold_df['fold_0'] = 0
        fold_df.loc[valid_idx, 'fold_0'] += 1

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

        if cfg.data.sampling:
            drop_rows = np.random.choice(
                fold_df[fold_df['fold_0'] == 0].index.values, 20_000_000)
            train_x = train_x.drop(drop_rows, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_rows, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_rows, axis=0).reset_index(drop=True)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x, target_df=train_y, fold_df=fold_df)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name=cfg.model.name,
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True)

with t.timer('save features importances'):
    save_importances(RUN_NAME, models, FEATURES)

with t.timer('make submission'):
    output_path = LOGGER_PATH / f'{METER_TYPE}.csv'
    make_submission(y_pred=np.mean(preds, axis=1),
                    target_name=TARGET_NAME,
                    sample_path=SAMPLE_SUB_PATH,
                    output_path=str(output_path),
                    comp=True)

LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}')
process_minutes = t.get_processing_time()

with t.timer('notify'):
    message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]'''
    send_line(NOTIFY_PARAMS['line']['token'], message)
    send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'],
                url=NOTIFY_PARAMS['notion']['url'],
                name=RUN_NAME,
                created=NOW,
                model=MODEL_NAME.split('_')[0],
                local_cv=round(np.mean(scores), 4),
                time_=process_minutes,
                comment=COMMENT)
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x,
                                       train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x,
                                       train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df],
                            axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        # Known train/test duplicates with leaked labels, overwritten directly.
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = '../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        root = Path(cfg.common.input_root)
        train_df = dh.load(root / cfg.common.img_file)

    with t.timer('create target dataframe'):
        # Encode the 6-level ISUP grade as cumulative binary targets
        # (ordinal regression): target_i = 1 if grade >= i else 0.
        ordinal_target = np.zeros((len(train_df), 6))
        for idx in train_df.index:
            target = train_df.loc[idx, 'isup_grade']
            ordinal_target[idx, :] = [1 if target >= i else 0 for i in range(6)]
        target_df = pd.DataFrame(ordinal_target,
                                 columns=[f'target_{i}' for i in range(6)])

    with t.timer('drop several rows'):
        if cfg.common.drop.name is not None:
            drop_idx = dh.load(f'../pickle/{cfg.common.drop.name}.npy')
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            target_df = target_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('make folds'):
        train_x_all = train_df.drop('isup_grade', axis=1)
        train_y_all = train_df['isup_grade']
        if cfg.model.n_classes == 1:
            train_y_all = train_y_all.astype(float)

        trn_x, val_x, trn_y, val_y = train_test_split(
            train_x_all, target_df,
            test_size=0.2,
            shuffle=True,
            random_state=cfg.common.seed,
            stratify=train_df['isup_grade'])

    with t.timer('train model'):
        result = train_ordinal_reg(run_name, trn_x, val_x, trn_y, val_y, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
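# A quick standalone check of the cumulative ("ordinal regression") encoding
# built above, using plain numpy instead of the row-by-row loop. For K=6
# grades, grade g maps to [1(g>=0), ..., 1(g>=5)], so target_0 is always 1
# and the row sum recovers g+1.
import numpy as np

grades = np.array([0, 2, 5])
encoded = (grades[:, None] >= np.arange(6)[None, :]).astype(int)
print(encoded)
# [[1 0 0 0 0 0]
#  [1 1 1 0 0 0]
#  [1 1 1 1 1 1]]
assert (encoded.sum(axis=1) - 1 == grades).all()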
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        if cfg.common.debug:
            train_df = pd.read_csv(const.INPUT_DATA_DIR / 'train.csv',
                                   dtype=const.DTYPE, nrows=5_000_000)
        else:
            train_df = pd.read_csv(const.INPUT_DATA_DIR / 'train.csv',
                                   dtype=const.DTYPE)

    with t.timer('preprocess'):
        questions_df = pd.read_csv(const.INPUT_DATA_DIR / 'questions.csv')
        q2p = dict(questions_df[['question_id', 'part']].values)
        train_df['part'] = train_df['content_id'].map(q2p)
        train_df['prior_question_had_explanation'] = \
            train_df['prior_question_had_explanation'].astype(float)

        te_content_df = pd.read_feather(
            '../features/te_content_id_by_answered_correctly_train.feather')
        avg_u_target_df = pd.read_feather(
            '../features/answered_correctly_avg_u_train.feather')

        if cfg.common.debug:
            te_content_df = te_content_df.iloc[:5_000_000]
            avg_u_target_df = avg_u_target_df.iloc[:5_000_000]

        train_df['te_content_id_by_answered_correctly'] = \
            te_content_df['te_content_id_by_answered_correctly']
        train_df['answered_correctly_avg_u'] = \
            avg_u_target_df['answered_correctly_avg_u']

    with t.timer('make folds'):
        valid_idx = np.load('../data/processed/cv1_valid_v2.npy')
        if cfg.common.debug:
            valid_idx = valid_idx[np.where(valid_idx < len(train_df))]
        fold_df = pd.DataFrame(index=range(len(train_df)))
        fold_df['fold_0'] = 0
        fold_df.loc[valid_idx, 'fold_0'] += 1

    with t.timer('drop index'):
        if cfg.common.drop:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            if cfg.common.debug:
                drop_idx = drop_idx[np.where(drop_idx < len(train_df))]
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

        # Bucket each user's interactions into fixed-size steps.
        train_df['step'] = train_df.groupby('user_id').cumcount() // cfg.data.train.step_size
        train_df['user_step_id'] = train_df['user_id'].astype(str) + '__' + train_df['step'].astype(str)

    with t.timer('train model'):
        trainer = NNTrainer(run_name, fold_df, cfg)
        cv = trainer.train(train_df, target_df=train_df[const.TARGET_COLS[0]])
        trainer.save()

        run_name_cv = f'{run_name}_{cv:.4f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name=cfg.model.backbone,
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
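# A small illustration of the 'step' bucketing above: groupby(...).cumcount()
# numbers each user's rows 0, 1, 2, ..., and integer division by step_size
# groups consecutive interactions into fixed-size chunks. step_size=2 is
# assumed here for brevity.
import pandas as pd

df = pd.DataFrame({'user_id': [1, 1, 1, 1, 2, 2, 2]})
df['step'] = df.groupby('user_id').cumcount() // 2
df['user_step_id'] = df['user_id'].astype(str) + '__' + df['step'].astype(str)
print(df)
#    user_id  step user_step_id
# 0        1     0         1__0
# 1        1     0         1__0
# 2        1     1         1__1
# 3        1     1         1__1
# 4        2     0         2__0
# 5        2     0         2__0
# 6        2     1         2__1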
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/team/X_tra_wo_lec_20M.feather')
        val_df = dh.load('../data/team/X_val_wo_lec.feather')

        train_df['is_val'] = 0
        val_df['is_val'] = 1

        train_df = pd.concat([train_df, val_df], axis=0, sort=False, ignore_index=True)
        val_idx = train_df[train_df['is_val'] == 1].index

        del train_df
        gc.collect()

    with t.timer('drop index'):
        drop_idx = np.array([])
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            val_df = val_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('load oof and preds'):
        oof_list = []
        # preds_list = []
        for i, log_name in enumerate(sorted(cfg.models)):
            log_dir = Path(f'../logs/{log_name}')
            model_oof = factory.get_result(log_dir, cfg, data_type='train')
            if len(drop_idx) > 0:
                model_oof = np.delete(model_oof, drop_idx, axis=0)
            if cfg.preprocess.rank:
                # Rank transform: double argsort maps scores to ranks in [0, 1).
                model_oof = np.argsort(np.argsort(model_oof)) / len(model_oof)
            oof_list.append(model_oof[val_idx])

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        best_weight_array = np.zeros(len(oof_list))
        for target_idx, target in enumerate(const.TARGET_COLS):
            best_weight = opt_ensemble_weight(cfg, val_df[target], oof_list, metric)
        # Store the optimized weights; the original saved the zero-initialized
        # array, which was presumably unintended.
        best_weight_array = np.array(best_weight)

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(val_df))
        cv_list = []
        for model_idx, weight in enumerate(best_weight):
            ensemble_oof += oof_list[model_idx] * weight
        cv = metric(val_df[const.TARGET_COLS[0]], ensemble_oof)
        cv_list.append(cv)

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        # dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)
        dh.save(f'../logs/{run_name}/best_weight.npy', best_weight_array)

        cv = np.mean(cv_list)
        run_name_cv = f'{run_name}_{cv:.6f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.6f}')
        print('\n===================================\n\n')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name='ensemble',
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
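# The rank transform used above (cfg.preprocess.rank), shown standalone:
# double argsort converts scores to ranks, and dividing by the length scales
# them to [0, 1). This puts differently calibrated models on a common scale
# before blending, at the cost of discarding the raw probability values.
import numpy as np

scores = np.array([0.9, 0.1, 0.5, 0.7])
ranks = np.argsort(np.argsort(scores)) / len(scores)
print(ranks)  # [0.75 0.   0.25 0.5 ]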
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        root = Path(cfg.common.input_root)
        train_df = dh.load(root / cfg.common.img_file)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx_list = []
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idx_list.append(drop_idx)
            all_drop_idx = np.unique(np.concatenate(drop_idx_list))
            train_df = train_df.drop(all_drop_idx, axis=0).reset_index(drop=True)

    with t.timer('make folds'):
        train_x_all = train_df.drop('isup_grade', axis=1)
        train_y_all = train_df['isup_grade']
        if cfg.model.n_classes == 1:
            train_y_all = train_y_all.astype(float)

        trn_x, val_x, trn_y, val_y = train_test_split(
            train_x_all, train_y_all,
            test_size=0.2,
            shuffle=True,
            random_state=cfg.common.seed,
            stratify=train_df['isup_grade'])

    with t.timer('train model'):
        result = train_cnn(run_name, trn_x, val_x, trn_y, val_y, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })