Ejemplo n.º 1
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

        oof = np.zeros((len(train_df), len(cfg.models)))
        preds = np.zeros((len(test_df), len(cfg.models)))

        for i, m in enumerate(cfg.models):
            name = getattr(cfg.models, m).name

            log_dir = Path(f'../logs/{name}')
            model_oof = dh.load(log_dir / 'oof.npy')
            model_cfg = dh.load(log_dir / 'config.yml')
            if model_cfg.common.drop:
                drop_idxs = np.array([])
                for drop_name in model_cfg.common.drop:
                    drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                    drop_idxs = np.append(drop_idxs, drop_idx)
                model_oof = factory.fill_dropped(model_oof, drop_idx)

            model_preds = dh.load(f'../logs/{name}/raw_preds.npy')

            oof[:, i] = model_oof[:len(train_df)]
            preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idxs = np.array([])
            for drop_name in model_cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            train_df = train_df.drop(drop_idxs, axis=0).reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
            p_list = [0 for i in range(len(cfg.models))]
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i in range(oof.shape[1]):
                y_pred += oof[:, i] * p_list[i]

            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)
        best_params = list(study.best_params.values())
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i in range(len(best_weight)):
            ensemble_oof += oof[:, i] * best_weight[i]
            ensemble_preds += preds[:, i] * best_weight[i]

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

        cv = metric(y_true, ensemble_oof)
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.4f}')
        print(f'BEST WEIGHT: {best_weight}')
        print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
Ejemplo n.º 2
0
    # pretrained: False
    num_classes = train_params.model_params.n_classes
    model_wight, oof_list, best_score, train_loss_list, val_loss_list, val_score_list = train_model(
        x_trn, x_val, train_params, num_classes, weights, device)
    np.save(f'../logs/{run_name}/oof_gr.npy', oof_list[0])
    np.save(f'../logs/{run_name}/oof_vo.npy', oof_list[1])
    np.save(f'../logs/{run_name}/oof_co.npy', oof_list[2])

    torch.save(model_wight, f'../logs/{run_name}/weight_best.pt')
    save_png(run_name, train_params, train_loss_list, val_loss_list,
             val_score_list)

logging.disable(logging.FATAL)
logger_path.rename(f'../logs/{run_name}_{best_score:.3f}')

process_minutes = t.get_processing_time(type='hour')

with t.timer('notify'):
    message = f'''{model_name}\ncv: {best_score:.3f}\ntime: {process_minutes:.2f}[h]'''
    send_line(notify_params.line.token, message)

    send_notion(token_v2=notify_params.notion.token_v2,
                url=notify_params.notion.url,
                name=run_name,
                created=now,
                model=train_params.model_params.model_name,
                local_cv=round(best_score, 4),
                time_=process_minutes,
                comment=comment)
Ejemplo n.º 3
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ],
                            axis=0,
                            sort=False,
                            ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
Ejemplo n.º 4
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_x = dh.load('../data/team/X_tra_wo_lec_20M.feather')
        val_x = dh.load('../data/team/X_val_wo_lec.feather')

        train_x['is_val'] = 0
        val_x['is_val'] = 1

        train_x = pd.concat([train_x, val_x],
                            axis=0,
                            sort=False,
                            ignore_index=True)
        train_y = train_x[const.TARGET_COLS]

        use_row_id = train_x['row_id'].values
        val_idx = train_x[train_x['is_val'] == 1].index
        drop_cols = set(train_x.columns) - set(features)
        train_x = train_x.drop(drop_cols, axis=1)

    with t.timer('load additional features'):
        add_df = pd.DataFrame(index=train_x.index)

        additional_cols = set(features) - set(train_x.columns)
        for col in additional_cols:
            feat_df = pd.read_feather(f'../features/{col}_train.feather')
            add_df[col] = feat_df.loc[use_row_id, col].values

        add_df = reduce_mem_usage(add_df)

        train_x = pd.concat([train_x, add_df], axis=1)

    with t.timer('preprocessing'):
        pass

    with t.timer('make folds'):
        fold_df = pd.DataFrame(index=range(len(train_x)))
        fold_df['fold_0'] = 0
        fold_df.loc[val_idx, 'fold_0'] += 1

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name=cfg.model.name,
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
Ejemplo n.º 5
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_x = factory.get_features(features, cfg.data.loader.train)
        train_y = factory.get_target(cfg.data.target)

    # with t.timer('add oof'):
    #     if cfg.data.features.oof.name is not None:
    #         oof, preds = factory.get_result(cfg.data.features.oof.name, cfg)

    #         for i in range(oof.shape[1]):
    #             oof_col_name = f'oof_{const.TARGET_COLS[i]}'
    #             train_x[oof_col_name] = oof[:, i]
    #             features.append(oof_col_name)

    with t.timer('make folds'):
        valid_idx = np.load('../data/processed/cv1_valid.npy')
        # valid_idx = np.load('../data/processed/cv1_valid_dropped.npy')

        fold_df = pd.DataFrame(index=range(len(train_x)))
        fold_df['fold_0'] = 0
        fold_df.loc[valid_idx, 'fold_0'] += 1

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

        if cfg.data.sampling:
            drop_rows = np.random.choice(
                fold_df[fold_df['fold_0'] == 0].index.values, 20_000_000)
            train_x = train_x.drop(drop_rows, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_rows, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_rows, axis=0).reset_index(drop=True)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name=cfg.model.name,
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
Ejemplo n.º 6
0
        save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True)

with t.timer('save features importances'):
    save_importances(RUN_NAME, models, FEATURES)

with t.timer('make submission'):
    output_path = LOGGER_PATH / f'{METER_TYPE}.csv'
    make_submission(y_pred=np.mean(preds, axis=1),
                    target_name=TARGET_NAME,
                    sample_path=SAMPLE_SUB_PATH,
                    output_path=str(output_path),
                    comp=True)

LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}')

process_minutes = t.get_processing_time()

with t.timer('notify'):
    message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]'''

    send_line(NOTIFY_PARAMS['line']['token'], message)

    send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'],
                url=NOTIFY_PARAMS['notion']['url'],
                name=RUN_NAME,
                created=NOW,
                model=MODEL_NAME.split('_')[0],
                local_cv=round(np.mean(scores), 4),
                time_=process_minutes,
                comment=COMMENT)
Ejemplo n.º 7
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x, train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x, train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df], axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)
    
    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,  
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
Ejemplo n.º 8
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        root = Path(cfg.common.input_root)
        train_df = dh.load(root / cfg.common.img_file)

    with t.timer('create target dataframe'):
        ordinal_target = np.zeros((len(train_df), 6))

        for idx in train_df.index:
            target = train_df.loc[idx, 'isup_grade']
            ordinal_target[idx, :] = [
                1 if target >= i else 0 for i in range(6)
            ]

        target_df = pd.DataFrame(ordinal_target,
                                 columns=[f'target_{i}' for i in range(6)])

    with t.timer('drop several rows'):
        if cfg.common.drop.name is not None:
            drop_idx = dh.load(f'../pickle/{cfg.common.drop.name}.npy')
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            target_df = target_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('make folds'):
        train_x_all = train_df.drop('isup_grade', axis=1)
        train_y_all = train_df['isup_grade']
        if cfg.model.n_classes == 1:
            train_y_all = train_y_all.astype(float)
        trn_x, val_x, trn_y, val_y = train_test_split(
            train_x_all,
            target_df,
            test_size=0.2,
            shuffle=True,
            random_state=cfg.common.seed,
            stratify=train_df['isup_grade'])

    with t.timer('train model'):
        result = train_ordinal_reg(run_name, trn_x, val_x, trn_y, val_y, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
Ejemplo n.º 9
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        if cfg.common.debug:
            train_df = pd.read_csv(const.INPUT_DATA_DIR / 'train.csv',
                                   dtype=const.DTYPE,
                                   nrows=5_000_000)
        else:
            train_df = pd.read_csv(const.INPUT_DATA_DIR / 'train.csv',
                                   dtype=const.DTYPE)

    with t.timer('preprocess'):
        questions_df = pd.read_csv(const.INPUT_DATA_DIR / 'questions.csv')
        q2p = dict(questions_df[['question_id', 'part']].values)
        train_df['part'] = train_df['content_id'].map(q2p)

        train_df['prior_question_had_explanation'] = train_df[
            'prior_question_had_explanation'].astype(float)

        te_content_df = pd.read_feather(
            '../features/te_content_id_by_answered_correctly_train.feather')
        avg_u_target_df = pd.read_feather(
            '../features/answered_correctly_avg_u_train.feather')

        if cfg.common.debug:
            te_content_df = te_content_df.iloc[:5_000_000]
            avg_u_target_df = avg_u_target_df.iloc[:5_000_000]

        train_df['te_content_id_by_answered_correctly'] = te_content_df[
            'te_content_id_by_answered_correctly']
        train_df['answered_correctly_avg_u'] = avg_u_target_df[
            'answered_correctly_avg_u']

    with t.timer('make folds'):
        valid_idx = np.load('../data/processed/cv1_valid_v2.npy')
        if cfg.common.debug:
            valid_idx = valid_idx[np.where(valid_idx < len(train_df))]

        fold_df = pd.DataFrame(index=range(len(train_df)))
        fold_df['fold_0'] = 0
        fold_df.loc[valid_idx, 'fold_0'] += 1

    with t.timer('drop index'):
        if cfg.common.drop:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            if cfg.common.debug:
                drop_idx = drop_idx[np.where(drop_idx < len(train_df))]
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

        train_df['step'] = train_df.groupby(
            'user_id').cumcount() // cfg.data.train.step_size
        train_df['user_step_id'] = train_df['user_id'].astype(
            str) + '__' + train_df['step'].astype(str)

    with t.timer('train model'):
        trainer = NNTrainer(run_name, fold_df, cfg)
        cv = trainer.train(train_df, target_df=train_df[const.TARGET_COLS[0]])
        trainer.save()

        run_name_cv = f'{run_name}_{cv:.4f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name=cfg.model.backbone,
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
Ejemplo n.º 10
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/team/X_tra_wo_lec_20M.feather')
        val_df = dh.load('../data/team/X_val_wo_lec.feather')

        train_df['is_val'] = 0
        val_df['is_val'] = 1

        train_df = pd.concat([train_df, val_df],
                             axis=0,
                             sort=False,
                             ignore_index=True)
        val_idx = train_df[train_df['is_val'] == 1].index

        del train_df
        gc.collect()

    with t.timer('drop index'):
        drop_idx = np.array([])
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            val_df = val_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('load oof and preds'):
        oof_list = []
        # preds_list = []

        for i, log_name in enumerate(sorted(cfg.models)):
            log_dir = Path(f'../logs/{log_name}')
            model_oof = factory.get_result(log_dir, cfg, data_type='train')

            if len(drop_idx) > 0:
                model_oof = np.delete(model_oof, drop_idx, axis=0)

            if cfg.preprocess.rank:
                model_oof = np.argsort(np.argsort(model_oof)) / len(model_oof)

            oof_list.append(model_oof[val_idx])

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)

        best_weight_array = np.zeros(len(oof_list))
        for target_idx, target in enumerate(const.TARGET_COLS):
            best_weight = opt_ensemble_weight(cfg, val_df[target], oof_list,
                                              metric)

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(val_df))

        cv_list = []
        for model_idx, weight in enumerate(best_weight):
            ensemble_oof += oof_list[model_idx] * weight

        cv = metric(val_df[const.TARGET_COLS[0]], ensemble_oof)
        cv_list.append(cv)

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        # dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)
        dh.save(f'../logs/{run_name}/best_weight.npy', best_weight_array)

        cv = np.mean(cv_list)
        run_name_cv = f'{run_name}_{cv:.6f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.6f}')
        print('\n===================================\n\n')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        notificator = Notificator(run_name=run_name_cv,
                                  model_name='ensemble',
                                  cv=round(cv, 4),
                                  process_time=round(process_minutes, 2),
                                  comment=comment,
                                  params=notify_params)
        notificator.send_line()
        notificator.send_notion()
        # notificator.send_slack()

    with t.timer('git'):
        git = Git(run_name=run_name_cv)
        git.push()
        git.save_hash()
Ejemplo n.º 11
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        root = Path(cfg.common.input_root)
        train_df = dh.load(root / cfg.common.img_file)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx_list = []
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idx_list.append(drop_idx)
            all_drop_idx = np.unique(np.concatenate(drop_idx_list))
            train_df = train_df.drop(all_drop_idx,
                                     axis=0).reset_index(drop=True)

    with t.timer('make folds'):
        train_x_all = train_df.drop('isup_grade', axis=1)
        train_y_all = train_df['isup_grade']
        if cfg.model.n_classes == 1:
            train_y_all = train_y_all.astype(float)
        trn_x, val_x, trn_y, val_y = train_test_split(
            train_x_all,
            train_y_all,
            test_size=0.2,
            shuffle=True,
            random_state=cfg.common.seed,
            stratify=train_df['isup_grade'])

    with t.timer('train model'):
        result = train_cnn(run_name, trn_x, val_x, trn_y, val_y, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })