def mk_submit():

    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # =============================================================================
    # remove old users
    # =============================================================================
    X_train['SK_ID_CURR'] = SK_ID_CURR

    y_train = y_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    X_train = X_train[~X_train.SK_ID_CURR.isin(drop_ids)]
    oof_train = X_train[['SK_ID_CURR']]
    X_train.drop('SK_ID_CURR', axis=1, inplace=True)

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(utils_cat.ALL))

    COL = X_train.columns.tolist()

    # test
    X_test = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train,
                         y_train,
                         categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean}"
        print(result)
        utils.send_line(result)

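    # AUC is rank-invariant, so rescaling the summed ranks into (0, 1] is
    # cosmetic; it just makes the OOF column look like a probability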
    y_pred /= y_pred.max()

    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # save
    oof_train['oof'] = y_pred
    oof_train.to_csv('../output/onodera-last-oof.csv', index=False)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
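    # rank-transform each model's test predictions before averaging so that
    # models with different score scales contribute equally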
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
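
# ---------------------------------------------------------------------------
# Note: `ex.eval_oob` is an external helper that never appears in these
# snippets. A minimal sketch of what it presumably does, rebuilding the
# out-of-fold predictions from the fold models returned by lgb.cv, is shown
# below; the fold-replay logic is an assumption, not the author's code.
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold

def eval_oob_sketch(X, y, models, seed, stratified=True, shuffle=True):
    # replay a fold split with the same seed and predict each validation
    # chunk with the booster trained on the complementary folds
    cv_cls = StratifiedKFold if stratified else KFold
    splitter = cv_cls(n_splits=len(models), shuffle=shuffle,
                      random_state=seed if shuffle else None)
    oof = np.zeros(len(X))
    for model, (_, valid_idx) in zip(models, splitter.split(X, y)):
        oof[valid_idx] = model.predict(X.iloc[valid_idx])
    return pd.Series(oof, index=X.index)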
# =============================================================================
# Example 2
# =============================================================================
EXE_SUBMIT = True

np.random.seed(SEED)
print('seed :', SEED)

# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_805'):
        break
    else:
        sleep(60 * 1)

utils.send_line(f'START {__file__}')
# =============================================================================
# load train
# =============================================================================

dtrain = lgb.Dataset('../data/dtrain.mt')
gc.collect()

# =============================================================================
# lightgbm
# =============================================================================

param = {
    'objective': 'binary',
    'metric': 'auc',
    'max_bin': 100,
# =============================================================================
# Example 3
# =============================================================================
                         verbose_eval=50,
                         seed=SEED)
    y_pred = ex.eval_oob(X[COL],
                         y.values,
                         models,
                         SEED,
                         stratified=True,
                         shuffle=True,
                         n_class=True)
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['wloss-mean'])
    wloss_list.append(ret['wloss-mean'][-1])

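# mean best iteration across the 2 CV loops, inflated by 1.3x, presumably as a
# margin for a later full-data retrain without early stopping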
nround_mean = int((nround_mean / 2) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)
utils.send_line(result)

for i, y_pred in enumerate(y_preds):
    y_pred = pd.DataFrame(utils_metric.softmax(y_pred.astype(float).values))
    if i == 0:
        tmp = y_pred
    else:
        tmp += y_pred
tmp /= len(y_preds)
y_preds = tmp.copy().values.astype(float)

a_score = utils_metric.akiyama_metric(y.values, y_preds)
# =============================================================================
# Example 4
# =============================================================================
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

        oof = np.zeros((len(train_df), len(cfg.models)))
        preds = np.zeros((len(test_df), len(cfg.models)))

        for i, m in enumerate(cfg.models):
            name = getattr(cfg.models, m).name

            log_dir = Path(f'../logs/{name}')
            model_oof = dh.load(log_dir / 'oof.npy')
            model_cfg = dh.load(log_dir / 'config.yml')
            if model_cfg.common.drop:
                drop_idxs = np.array([])
                for drop_name in model_cfg.common.drop:
                    drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                    drop_idxs = np.append(drop_idxs, drop_idx)
                model_oof = factory.fill_dropped(model_oof, drop_idxs)  # all accumulated indices, not just the last chunk

            model_preds = dh.load(f'../logs/{name}/raw_preds.npy')

            oof[:, i] = model_oof[:len(train_df)]
            preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idxs = np.array([])
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            train_df = train_df.drop(drop_idxs, axis=0).reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
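            # sample n-1 weights with shrinking upper bounds; the last weight
            # takes the remainder so the mixture always sums to 1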
            p_list = [0 for i in range(len(cfg.models))]
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i in range(oof.shape[1]):
                y_pred += oof[:, i] * p_list[i]

            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)
        best_params = list(study.best_params.values())
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i in range(len(best_weight)):
            ensemble_oof += oof[:, i] * best_weight[i]
            ensemble_preds += preds[:, i] * best_weight[i]

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

        cv = metric(y_true, ensemble_oof)
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.4f}')
        print(f'BEST WEIGHT: {best_weight}')
        print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
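
# ---------------------------------------------------------------------------
# The weight search above samples n-1 weights with shrinking upper bounds and
# assigns the remainder to the last model, so the mixture stays on the
# simplex. A standalone sketch of the same idea for two models; the toy
# `oof_a`/`oof_b` arrays are made up for illustration, not the author's data.
# ---------------------------------------------------------------------------
import numpy as np
import optuna
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
y = rng.integers(0, 2, 1000)                # toy binary target
oof_a = 0.6 * y + 0.4 * rng.random(1000)    # toy OOF predictions, model A
oof_b = 0.5 * y + 0.5 * rng.random(1000)    # toy OOF predictions, model B

def objective(trial):
    w = trial.suggest_float('w', 0.0, 1.0)  # weight of model A
    blend = w * oof_a + (1 - w) * oof_b     # (1 - w) keeps the weights summing to 1
    return roc_auc_score(y, blend)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(study.best_params)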
# =============================================================================
# Example 5
# =============================================================================
#         'lambda_l1': 3,
#         'lambda_l2': 3,
         
         'seed': SEED
         }

# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_801'):
        break
    else:
        sleep(60*1)

utils.send_line('START {}'.format(__file__))

# =============================================================================
# load
# =============================================================================
imp = pd.read_csv('imp_802_importance_502-2.py.csv').set_index('index')
feature_all = imp[imp.weight != 0].index.tolist()

X_train = pd.read_feather('../data/X_train_mini.f')[feature_all]
y_train = pd.read_feather('../data/y_train_mini.f').is_attributed

gc.collect()

X_valid = pd.read_feather('../data/X_valid_mini.f')[feature_all]
y_valid = pd.read_feather('../data/y_valid_mini.f').is_attributed
gc.collect()

model_all = []
nround_mean = 0
wloss_list = []
for i in range(LOOP):
    gc.collect()
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, 
                         feval=utils_metric.lgb_multi_weighted_logloss_gal,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=SEED)
    model_all += models
    nround_mean += len(ret['multi_logloss-mean'])
    wloss_list.append(ret['wloss-mean'][-1])

nround_mean = int((nround_mean/LOOP) * 1.3)

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
print(result)

utils.send_line(result)
imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']

imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

print(imp.head(200).feature.map(lambda x: x.split('_')[0]).value_counts())

COL = imp.feature.tolist()[:3000]
X_gal = X_gal[COL]

# ======== for gal ========
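# assign fold groups round-robin by row position (this assumes the rows were
# shuffled upstream, otherwise the folds are not random)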
X_gal['g'] = np.arange(X_gal.shape[0]) % NFOLD
        nround_mean += len(ret['auc-mean'])
        loss_list.append(ret['auc-mean'][-1])

    nround_mean = int((nround_mean / LOOP) * 1.3)

    imp = ex.getImp(model_all)
    imp['split'] /= imp['split'].max()
    imp['gain'] /= imp['gain'].max()
    imp['total'] = imp['split'] + imp['gain']
    imp.sort_values('total', ascending=False, inplace=True)
    imp.reset_index(drop=True, inplace=True)

    for i, y_pred in enumerate(y_preds):
        if i == 0:
            oof = y_pred
        else:
            oof += y_pred
    oof /= len(y_preds)

    imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)
    pd.DataFrame(oof, columns=['oof']).to_csv(f'../data/oof_{__file__}.csv',
                                              index=False)

    utils.savefig_imp(imp, f'LOG/imp_{__file__}.png', x='total')

    utils.send_line(f'oof AUC({var}): {round(roc_auc_score(y_train, oof), 5)}')

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
# =============================================================================
# Example 8
# =============================================================================
def mk_submit():

    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)

    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))

    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)], axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # groupKfold
    # =============================================================================
    sk_tbl = pd.read_csv('../data/user_id_v8.csv.gz')  # TODO: check
    user_tbl = sk_tbl.user_id.drop_duplicates().reset_index(
        drop=True).to_frame()

    sub_train = pd.read_csv('../input/application_train.csv.zip',
                            usecols=['SK_ID_CURR']).set_index('SK_ID_CURR')
    sub_train['y'] = y_train.values

    group_kfold = GroupKFold(n_splits=NFOLD)
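    # GroupKFold keyed on user_id keeps every application of the same user in
    # a single fold, so near-duplicate users cannot leak across splits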

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train,
                             y_train,
                             categorical_feature=CAT,
                             free_raw_data=False)

        # shuffle fold
        ids = list(range(user_tbl.shape[0]))
        np.random.shuffle(ids)
        user_tbl['g'] = np.array(ids) % NFOLD
        sk_tbl_ = pd.merge(sk_tbl, user_tbl, on='user_id',
                           how='left').set_index('SK_ID_CURR')

        sub_train['g'] = sk_tbl_.g
        folds = group_kfold.split(X_train, sub_train['y'], sub_train['g'])

        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             folds=folds,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
    param['seed'] = np.random.randint(9999)

    ret, models = lgb.cv(param,
                         dtrain,
                         NROUND,
                         nfold=NFOLD,
                         stratified=True,
                         shuffle=True,
                         feval=ex.eval_auc,
                         early_stopping_rounds=ESR,
                         verbose_eval=VERBOSE_EVAL,
                         seed=SEED + i)

    p_train = ex.eval_oob(X_train[col],
                          y_train.values,
                          models,
                          SEED + i,
                          stratified=True,
                          shuffle=True)

    model_all += models
    nround_mean += len(ret['auc-mean'])
    loss_list.append(ret['auc-mean'][-1])

    utils.send_line(
        f'oof AUC({i}): {round(roc_auc_score(y_train, p_train), 5)}')

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
# =============================================================================
# Example 10
# =============================================================================
def mk_submit():

    files_tr = ('../feature/train_' + features + '.f').tolist()
    files_te = ('../feature/test_' + features + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    X_train_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    X_train = pd.concat([X_train, X_train_], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # remove old users
    X_train = X_train[new_train_users]
    y_train = y_train[new_train_users]

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))

    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()
    X_test_ = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)], axis=1)
    X_test = pd.concat([X_test, X_test_], axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    model_all = []
    auc_mean = 0
    for i in range(LOOP):
        dtrain = lgb.Dataset(X_train,
                             y_train,
                             categorical_feature=CAT,
                             free_raw_data=False)

        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        auc_mean += ret['auc-mean'][-1]
    auc_mean /= LOOP

    result = f"CV auc-mean({COMMENT}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
    p_test = model.predict(X_test)
    #    valid_idx_uq = pd.unique(valid_idx%200300)
    for j in range(var_len):
        #        id_y.loc[valid_idx, 'pred'] = p_valid
        oof[valid_idx] = p_valid
        #        oof[valid_idx_uq, j]     = p_valid[j*l:(j+1)*l]
        p_test_all[:, j, i] = p_test[j * 100000:(j + 1) * 100000]

    models.append(model)

id_y['pred'] = oof
oof = pd.pivot_table(id_y, index='id', columns='var',
                     values='pred').head(200000).values

auc = roc_auc_score(y_train, (9 * oof / (1 - oof)).prod(axis=1))
utils.send_line(f'AUC(all var): {auc}')

l = y_train.shape[0]
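# start from the prior odds (1:9, assuming a roughly 10% positive rate) and
# multiply in a variable's odds only when its OOF AUC clears 0.5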
oof_odds = np.ones(l) * 1 / 9
for j in range(var_len):
    if roc_auc_score(y_train, oof[:, j]) >= 0.500:
        oof_odds *= (9 * oof[:, j] / (1 - oof[:, j]))

auc = roc_auc_score(y_train, oof_odds)
print(f'AUC(th0.5): {auc}')

sub_train = pd.DataFrame(zip(y_train, oof_odds), columns=['y', 'p'])
sub_train.sort_values('p', ascending=False, inplace=True)

for i in range(100, 2000, 100):
    sub_train_ = sub_train.head(i)
# =============================================================================
# Example 12
# =============================================================================
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param,
                         dtrain,
                         99999,
                         nfold=NFOLD,
                         early_stopping_rounds=100,
                         verbose_eval=50,
                         seed=SEED + i)
    model_all += models
    nround_mean += len(ret['multi_logloss-mean'])
    wloss_list.append(ret['multi_logloss-mean'][-1])

nround_mean = int((nround_mean / LOOP) * 1.3)

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
utils.send_line(result)

imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']

imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

print(imp.head(100).feature.map(lambda x: x.split('_')[0]).value_counts())

imp.to_csv(f'LOG/imp_{__file__}-1.csv', index=False)
"""

__file__ = '816_cv_mlogloss.py'
                             free_raw_data=False)
        model = lgb.train(params,
                          train_set=dtrain,
                          num_boost_round=NROUND,
                          verbose_eval=100)
        l = valid_idx.shape[0]

        p_valid = model.predict(X_valid)
        p_test = model.predict(X_test)
        for j in range(var_len):
            oof[valid_idx, j] = p_valid[j * l:(j + 1) * l]
            p_test_all[:, j, i] = p_test[j * 100000:(j + 1) * 100000]

        models.append(model)

    auc = roc_auc_score(y_train, (9 * oof / (1 - oof)).prod(axis=1))
    utils.send_line(f'{ayasii} AUC(all var): {auc}, {AUC_bench1 - auc}')

    l = y_train.shape[0]
    oof_odds = np.ones(l) * 1 / 9
    for j in range(var_len):
        if roc_auc_score(y_train, oof[:, j]) >= 0.500:
            oof_odds *= (9 * oof[:, j] / (1 - oof[:, j]))

    auc = roc_auc_score(y_train, oof_odds)
    utils.send_line(f'{ayasii} AUC(th0.5): {auc}, {AUC_bench2 - auc}')

#==============================================================================
utils.end(__file__)
utils.stop_instance()
# =============================================================================
dtrain = lgb.Dataset(
    X, y, categorical_feature=list(set(X.columns) & set(categorical_feature)))
gc.collect()

ret = lgb.cv(param,
             dtrain,
             9999,
             nfold=5,
             early_stopping_rounds=50,
             verbose_eval=10,
             seed=SEED)
print(f"CV auc-mean {ret['auc-mean'][-1]}")

best_score = ret['auc-mean'][-1]
utils.send_line(f'all features best_score: {best_score}')

# =============================================================================
#
# =============================================================================
dtrain = lgb.Dataset(
    X, y, categorical_feature=list(set(X.columns) & set(categorical_feature)))
model = lgb.train(param, dtrain, len(ret['auc-mean']))

imp = ex.getImp(model)

imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)

#==============================================================================
utils.end(__file__)
utils.stop_instance()
# =============================================================================
# Example 15
# =============================================================================
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x, train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x, train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df], axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)
    
    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,  
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
# =============================================================================
# Example 16
# =============================================================================
        for i, y_pred in enumerate(y_preds):
            if i == 0:
                oof = y_pred
            else:
                oof += y_pred
        oof /= len(y_preds)

        auc = round(roc_auc_score(y_train, oof), 5)

        if auc_best < auc:
            auc_best = auc
            round_best = round_
            oof_best = oof

        utils.send_line(f'oof AUC({var}, r{round_}): {auc}')
        result.append(auc)

    result_all.append(result)

    oof_best = pd.DataFrame(oof_best, columns=['oof'])
    oof_best.to_pickle(f'../data/806/oof_{__file__}_{var}_r{round_best}.pkl')

#    imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)
#    utils.savefig_imp(imp, f'LOG/imp_{__file__}.png', x='total')

result_all = pd.DataFrame(result_all,
                          columns=['r4', 'r3', 'r2', 'r1', 'r0'],
                          index=var_names)
result_all.to_csv(f'LOG/auc_{__file__}.csv')
# =============================================================================
# Example 17
# =============================================================================
        save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True)

with t.timer('save features importances'):
    save_importances(RUN_NAME, models, FEATURES)

with t.timer('make submission'):
    output_path = LOGGER_PATH / f'{METER_TYPE}.csv'
    make_submission(y_pred=np.mean(preds, axis=1),
                    target_name=TARGET_NAME,
                    sample_path=SAMPLE_SUB_PATH,
                    output_path=str(output_path),
                    comp=True)

LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}')

process_minutes = t.get_processing_time()

with t.timer('notify'):
    message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]'''

    send_line(NOTIFY_PARAMS['line']['token'], message)

    send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'],
                url=NOTIFY_PARAMS['notion']['url'],
                name=RUN_NAME,
                created=NOW,
                model=MODEL_NAME.split('_')[0],
                local_cv=round(np.mean(scores), 4),
                time_=process_minutes,
                comment=COMMENT)
# =============================================================================
# Example 18
# =============================================================================
    y_pred /= len(model_all)

    sub[f'class_{i}'] = y_pred

sub.to_csv('../output/sub.csv.gz', index=False, compression='gzip')

# Compute preds_99 as the proba of class not being any of the others
# preds_99 = 0.1 gives 1.769
preds_99 = np.ones(sub.shape[0])
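# product of the complements over the known-class columns
# (column 0 is presumably the object_id, hence the i + 1 offset)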
for i in range(sub.shape[1] - 1):
    preds_99 *= (1 - sub.iloc[:, i + 1])
sub['class_99'] = preds_99

sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

png = f'LOG/sub_{__file__}.png'
utils.savefig_sub(sub, png)
utils.send_line('DONE!', png)

# =============================================================================
# submission
# =============================================================================
if EXE_SUBMIT:
    print('submit')
    utils.submit(SUBMIT_FILE_PATH, COMMENT)

#==============================================================================
utils.end(__file__)
utils.stop_instance()
# =============================================================================
# Example 19
# =============================================================================
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ],
                            axis=0,
                            sort=False,
                            ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)

        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
# =============================================================================
# Example 20
# =============================================================================
                         verbose_eval=50,
                         seed=SEED)
    y_pred = ex.eval_oob(X[feature_set[i]],
                         y.values,
                         models,
                         SEED,
                         stratified=True,
                         shuffle=True,
                         n_class=True)
    y_preds.append(y_pred)
    model_all += models
    nround_mean += len(ret['multi_logloss-mean'])
    loss_list.append(ret['multi_logloss-mean'][-1])

nround_mean = int((nround_mean / MOD_N) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV multi_logloss: {np.mean(loss_list)} + {np.std(loss_list)}"
utils.send_line(result)

for i, y_pred in enumerate(y_preds):
    if i == 0:
        oof = y_pred
    else:
        oof += y_pred
oof /= len(y_preds)
oof.to_pickle(f'../data/oof_{__file__}.pkl')

oid_gal = pd.read_pickle('../data/tr_oid_gal.pkl')['object_id'].tolist()
oid_exgal = pd.read_pickle('../data/tr_oid_exgal.pkl')['object_id'].tolist()
# =============================================================================
# Example 21
# =============================================================================
        raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X.shape {X.shape}')

    gc.collect()

    CAT = list(set(X.columns) & set(utils_cat.ALL))

    # =============================================================================
    # cv
    # =============================================================================
    dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
    gc.collect()

    ret = lgb.cv(param,
                 dtrain,
                 9999,
                 nfold=7,
                 early_stopping_rounds=100,
                 verbose_eval=50,
                 seed=SEED)

    result = f"CV auc-mean({HEAD}): {ret['auc-mean'][-1]}"
    print(result)

    utils.send_line(result)

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
def mk_submit(HEAD):

    SUBMIT_FILE_PATH_ = SUBMIT_FILE_PATH.replace('feature', str(HEAD))
    files_tr = ('../feature/train_' + imp.head(HEAD).feature + '.f').tolist()
    files_te = ('../feature/test_' + imp.head(HEAD).feature + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH_.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(utils_cat.ALL))

    COL = X_train.columns.tolist()

    # test
    X_test = pd.concat(
        [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)],
        axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train,
                         y_train,
                         categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()
    y_pred /= y_pred.max()

    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean(feature {HEAD}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH_, index=False, compression='gzip')
# =============================================================================
# Example 23
# =============================================================================
print(result)

imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']

imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)


imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)

png = f'LOG/imp_{__file__}.png'
utils.savefig_imp(imp, png, x='total', title=f'{__file__}')
utils.send_line(result, png)

for i, y_pred in enumerate(y_preds):
    y_pred = pd.DataFrame(utils_metric.softmax(y_pred.astype(float).values))
    if i == 0:
        tmp = y_pred
    else:
        tmp += y_pred
tmp /= len(y_preds)
y_preds = tmp.copy().values.astype(float)

a_score = utils_metric.akiyama_metric(y.values, y_preds)
print('akiyama_metric:', a_score)


# =============================================================================
# =============================================================================
# Example 24
# =============================================================================
def mk_submit():

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = loader.train()
    col = [c for c in X_train.columns if c.startswith('f702_')]
    X_train.drop(col, axis=1, inplace=True)

    y_train = utils.read_pickles('../data/label').TARGET

    X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'),
                          index=False,
                          compression='gzip')

    if X_train.columns.duplicated().sum() > 0:
        raise Exception(
            f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')

    gc.collect()

    CAT = list(set(X_train.columns) & set(loader.category()))
    print('CAT :', CAT)

    COL = X_train.columns.tolist()

    # test
    X_test = loader.test()[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train,
                         y_train,
                         categorical_feature=CAT,
                         free_raw_data=False)

    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param,
                             dtrain,
                             9999,
                             nfold=NFOLD,
                             early_stopping_rounds=100,
                             verbose_eval=50,
                             seed=i)
        model_all += models
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

        auc_mean = roc_auc_score(y_train, y_pred)
        result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}"
        print(result)
        utils.send_line(result)

    y_pred /= y_pred.max()

    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean: {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')

    gc.collect()

    label_name = 'TARGET'

    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip')

    # =============================================================================
    # submission
    # =============================================================================
    if EXE_SUBMIT:
        print('submit')
        utils.submit(SUBMIT_FILE_PATH, COMMENT)
EXE_SUBMIT = True

np.random.seed(SEED)
print('seed :', SEED)

# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_802'):
        break
    else:
        sleep(60 * 1)

utils.send_line('{} start'.format(__file__))
# =============================================================================
# load train
# =============================================================================

dtrain = lgb.Dataset('../data/dtrain.mt')
gc.collect()

# =============================================================================
# lightgbm
# =============================================================================

param = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.1,
# =============================================================================
# Example 26
# =============================================================================
        print(f'add {c}')
    
    dtrain = lgb.Dataset(X[features_new], y,
                         categorical_feature=list(set(features_new) & set(categorical_feature)))
    ret = lgb.cv(param, dtrain, 9999, nfold=5,
                 early_stopping_rounds=50, verbose_eval=None,
                 seed=SEED)
    score = ret['auc-mean'][-1]
    print(f"auc-mean {score}")
    
    if best_score < score:
        print(f'UPDATE!    SCORE:{score:+.5f}    DIFF:{score-best_score:+.5f}')
        print(f'features: {features_new}')
        best_score = score
        features_curr = features_new
        utils.send_line(f'{c}: {best_score}')
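
# ---------------------------------------------------------------------------
# The loop above is a greedy forward selection: a candidate column is kept
# only when it improves the CV score. A compact sketch of the pattern, with
# a hypothetical `evaluate` callable standing in for the lgb.cv scoring:
# ---------------------------------------------------------------------------
def forward_select(candidates, evaluate, base=None):
    # grow the feature list greedily, keeping a candidate only on improvement
    selected = list(base or [])
    best = evaluate(selected) if selected else float('-inf')
    for c in candidates:
        score = evaluate(selected + [c])
        if score > best:
            best, selected = score, selected + [c]
    return selected, best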


if False:
    # =============================================================================
    # best
    # =============================================================================
    features = ['app_001_AMT_ANNUITY',
                 'app_001_AMT_CREDIT',
                 'app_001_AMT_GOODS_PRICE',
                 'app_001_APARTMENTS_AVG',
                 'app_001_CODE_GENDER',
                 'app_001_COMMONAREA_AVG',
                 'app_001_DAYS_BIRTH',
                 'app_001_DAYS_EMPLOYED',
                 'app_001_DAYS_EMPLOYED-m-DAYS_BIRTH',
# =============================================================================
# Example 27
# =============================================================================
    # pretrained: False
    num_classes = train_params.model_params.n_classes
    model_weight, oof_list, best_score, train_loss_list, val_loss_list, val_score_list = train_model(
        x_trn, x_val, train_params, num_classes, weights, device)
    np.save(f'../logs/{run_name}/oof_gr.npy', oof_list[0])
    np.save(f'../logs/{run_name}/oof_vo.npy', oof_list[1])
    np.save(f'../logs/{run_name}/oof_co.npy', oof_list[2])

    torch.save(model_weight, f'../logs/{run_name}/weight_best.pt')
    save_png(run_name, train_params, train_loss_list, val_loss_list,
             val_score_list)

logging.disable(logging.FATAL)
logger_path.rename(f'../logs/{run_name}_{best_score:.3f}')

process_hours = t.get_processing_time(type='hour')

with t.timer('notify'):
    message = f'''{model_name}\ncv: {best_score:.3f}\ntime: {process_hours:.2f}[h]'''
    send_line(notify_params.line.token, message)

    send_notion(token_v2=notify_params.notion.token_v2,
                url=notify_params.notion.url,
                name=run_name,
                created=now,
                model=train_params.model_params.model_name,
                local_cv=round(best_score, 4),
                time_=process_hours,
                comment=comment)
# =============================================================================
# Example 28
# =============================================================================
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        root = Path(cfg.common.input_root)
        train_df = dh.load(root / cfg.common.img_file)

    with t.timer('create target dataframe'):
        ordinal_target = np.zeros((len(train_df), 6))
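        # ordinal-regression encoding: grade k becomes k+1 leading ones,
        # e.g. isup_grade 3 -> [1, 1, 1, 1, 0, 0]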

        for idx in train_df.index:
            target = train_df.loc[idx, 'isup_grade']
            ordinal_target[idx, :] = [
                1 if target >= i else 0 for i in range(6)
            ]

        target_df = pd.DataFrame(ordinal_target,
                                 columns=[f'target_{i}' for i in range(6)])

    with t.timer('drop several rows'):
        if cfg.common.drop.name is not None:
            drop_idx = dh.load(f'../pickle/{cfg.common.drop.name}.npy')
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            target_df = target_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('make folds'):
        train_x_all = train_df.drop('isup_grade', axis=1)
        train_y_all = train_df['isup_grade']
        if cfg.model.n_classes == 1:
            train_y_all = train_y_all.astype(float)
        trn_x, val_x, trn_y, val_y = train_test_split(
            train_x_all,
            target_df,
            test_size=0.2,
            shuffle=True,
            random_state=cfg.common.seed,
            stratify=train_df['isup_grade'])

    with t.timer('train model'):
        result = train_ordinal_reg(run_name, trn_x, val_x, trn_y, val_y, cfg)

    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.data:
            kaggle.create_dataset()
        if cfg.common.kaggle.notebook:
            kaggle.push_notebook()

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
                      verbose_eval=100)
    l = valid_idx.shape[0]

    p_valid = model.predict(X_valid)
    p_test = model.predict(X_test)
    for j in range(var_len):
        oof[valid_idx] = p_valid
        p_test_all[:, j, i] = p_test[j * 100000:(j + 1) * 100000]

    models.append(model)

id_y['pred'] = oof
oof = pd.pivot_table(id_y, index='id', columns='var', values='pred').values

auc = f'seed{SEED} AUC(all var): {roc_auc_score(y_train, (9 * oof / (1 - oof)).prod(axis=1))}'
utils.send_line(auc)

l = y_train.shape[0]
oof_odds = np.ones(l) * 1 / 9
for j in range(var_len):
    if roc_auc_score(y_train, oof[:, j]) >= 0.500:
        oof_odds *= (9 * oof[:, j] / (1 - oof[:, j]))

auc = f'seed{SEED} AUC(th0.5): {roc_auc_score(y_train, oof_odds)}'
utils.send_line(auc)

# save raw pred
np.save(f'../data/{__file__}_oof', oof)
np.save(f'../data/{__file__}_p_test_all', p_test_all)

# =============================================================================
# =============================================================================
# Example 30
# =============================================================================
                         seed=SEED)
    y_pred = ex.eval_oob(
        X_52_90,
        y_52_90.values,
        models,
        SEED,
        stratified=True,
        shuffle=True,
    )
    oofs.append(y_pred)
    model_all += models
    nround_mean += len(ret['auc-mean'])
    wloss_list.append(ret['auc-mean'][-1])

nround_mean = int((nround_mean / 2) * 1.3)
utils.send_line(f'nround_mean: {nround_mean}')

result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}"
utils.send_line(result)

for i, y_pred in enumerate(oofs):
    if i == 0:
        tmp = y_pred
    else:
        tmp += y_pred
tmp /= len(oofs)
oof = tmp.copy().values.astype(float)

X_oid_52_90['52vs90'] = oof

# =============================================================================