# NOTE(review): collapsed fragment that begins mid-statement — the leading
# `free_raw_data=False)` is the tail of an lgb.Dataset(...) call opened before
# this view, so the code is kept byte-identical. It runs lgb.cv with the custom
# weighted-loss objective/metric (utils_metric.wloss_*), collects OOF
# predictions, models and the final per-run wloss, then scales the mean round
# count by 1.3 (the `/1` matches the single-iteration `range(1)` loop) and
# normalizes feature importances to [0, 1] per column.
free_raw_data=False) gc.collect() model_all = [] nround_mean = 0 wloss_list = [] y_preds = [] for i in range(1): gc.collect() param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, fobj=utils_metric.wloss_objective, feval=utils_metric.wloss_metric, early_stopping_rounds=100, verbose_eval=50, seed=SEED) y_pred = ex.eval_oob(X, y, models, SEED, stratified=True, shuffle=True, n_class=y.unique().shape[0]) y_preds.append(y_pred) model_all += models nround_mean += len(ret['wloss-mean']) wloss_list.append( ret['wloss-mean'][-1] ) nround_mean = int((nround_mean/1) * 1.3) result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}" print(result) imp = ex.getImp(model_all) imp['split'] /= imp['split'].max() imp['gain'] /= imp['gain'].max() imp['total'] = imp['split'] + imp['gain']
# =============================================================================
# best(gal)
# =============================================================================
N = best_N
#N = 250

dtrain = lgb.Dataset(X_gal[COL[:N]], y_gal,
                     #categorical_feature=CAT,
                     free_raw_data=False)

# Grouped CV on the galactic subset with the custom weighted-loss
# objective/metric; group_kfold keeps objects of the same group in one fold.
ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD,
                     fobj=utils_metric.wloss_objective_gal,
                     feval=utils_metric.wloss_metric_gal,
                     folds=group_kfold.split(X_gal, y_gal, group_gal),
                     early_stopping_rounds=100, verbose_eval=50,
                     seed=SEED)
score = ret['wloss-mean'][-1]

# BUG FIX: the OOF class count must come from the galactic target (y_gal),
# not the extragalactic one — `y_exgal` here was a copy-paste from the
# cv(exgal) section below and would request the wrong number of classes.
y_pred_gal = ex.eval_oob(X_gal[COL[:N]], y_gal, models, SEED,
                         stratified=True, shuffle=True,
                         n_class=y_gal.unique().shape[0])

# =============================================================================
# cv(exgal)
# =============================================================================
print('==== EXGAL ====')

param['num_class'] = 9
param['learning_rate'] = 0.1

dtrain = lgb.Dataset(X_exgal, y_exgal,
                     #categorical_feature=CAT,
                     free_raw_data=False)
gc.collect()

model_all = []
nround_mean = 0
wloss_list = []
# NOTE(review): collapsed fragment that begins inside a CV loop — the `for`
# header (which defines `i`, `LOOP`, `model_all`, `loss_list`, `y_preds`) lies
# before this view, so the code is kept byte-identical. Each iteration runs
# stratified lgb.cv with ex.eval_auc, stores the OOF prediction and models,
# and accumulates the round count; after the loop the mean rounds are scaled
# by 1.3 and feature importances normalized and sorted.
# NOTE(review): param['seed'] is randomized but lgb.cv is seeded with
# SEED + i — presumably intentional (model seed vs fold seed); confirm.
param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, NROUND, nfold=NFOLD, stratified=True, shuffle=True, feval=ex.eval_auc, early_stopping_rounds=ESR, verbose_eval=VERBOSE_EVAL, seed=SEED + i) y_pred = ex.eval_oob(X_train, y_train.values, models, SEED + i, stratified=True, shuffle=True) y_preds.append(y_pred) model_all += models nround_mean += len(ret['auc-mean']) loss_list.append(ret['auc-mean'][-1]) nround_mean = int((nround_mean / LOOP) * 1.3) imp = ex.getImp(model_all) imp['split'] /= imp['split'].max() imp['gain'] /= imp['gain'].max() imp['total'] = imp['split'] + imp['gain'] imp.sort_values('total', ascending=False, inplace=True)
def mk_submit(HEAD):
    """Train on the top-HEAD features and write a gzip submission file.

    Loads the HEAD best features (per global `imp` ranking) as train/test
    frames, runs `LOOP` seeded LightGBM CVs, rank-averages the OOF
    predictions to report a CV AUC, then rank-averages every fold model's
    test predictions into sub['TARGET'] and writes the submission.
    """
    SUBMIT_FILE_PATH_ = SUBMIT_FILE_PATH.replace('feature', str(HEAD))
    files_tr = ('../feature/train_' + imp.head(HEAD).feature + '.f').tolist()
    files_te = ('../feature/test_' + imp.head(HEAD).feature + '.f').tolist()

    # =============================================================================
    # load
    # =============================================================================
    # train
    X_train = pd.concat([pd.read_feather(f)
                         for f in tqdm(files_tr, mininterval=60)],
                        axis=1)
    y_train = utils.read_pickles('../data/label').TARGET

    # keep a small sample of the training matrix next to the submission
    X_train.head().to_csv(SUBMIT_FILE_PATH_.replace('.csv', '_X.csv'),
                          index=False, compression='gzip')

    # duplicated feature columns would silently corrupt training — fail fast
    if X_train.columns.duplicated().sum() > 0:
        raise Exception(f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }')
    print('no dup :) ')
    print(f'X_train.shape {X_train.shape}')
    gc.collect()

    CAT = list(set(X_train.columns) & set(utils_cat.ALL))
    COL = X_train.columns.tolist()

    # test (aligned to the train column order)
    X_test = pd.concat([pd.read_feather(f)
                        for f in tqdm(files_te, mininterval=60)],
                       axis=1)[COL]

    # =============================================================================
    # training with cv
    # =============================================================================
    dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT,
                         free_raw_data=False)
    model_all = []
    y_pred = pd.Series(0, index=y_train.index)
    for i in range(LOOP):
        gc.collect()
        param['seed'] = i
        ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                             early_stopping_rounds=100, verbose_eval=50,
                             seed=i)
        model_all += models
        # rank-average OOF predictions across seeds (AUC is rank-invariant)
        y_pred += ex.eval_oob(X_train, y_train, models, i).rank()

    y_pred /= y_pred.max()
    auc_mean = roc_auc_score(y_train, y_pred)
    result = f"CV auc-mean(feature {HEAD}): {auc_mean}"
    print(result)
    utils.send_line(result)

    # =============================================================================
    # predict
    # =============================================================================
    sub = pd.read_pickle('../data/sub.p')
    gc.collect()

    label_name = 'TARGET'
    sub[label_name] = 0
    for model in model_all:
        y_pred = model.predict(X_test)
        sub[label_name] += pd.Series(y_pred).rank()
    sub[label_name] /= len(model_all)
    sub[label_name] /= sub[label_name].max()
    sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int)

    sub.to_csv(SUBMIT_FILE_PATH_, index=False, compression='gzip')
# mk_submit(): end-to-end submission pipeline — loads train via loader()
# (dropping all f702_* columns), dedup-checks columns, runs LOOP seeded
# LightGBM CVs reporting a rank-averaged OOF AUC per loop and overall, then
# rank-averages every fold model's test prediction into sub['TARGET'] and
# writes SUBMIT_FILE_PATH (gzip), optionally submitting when EXE_SUBMIT.
# NOTE(review): left byte-identical — the source is collapsed onto single
# lines and the indentation of the trailing `if EXE_SUBMIT:` block (inside
# the function vs module level after it) cannot be determined from this view.
def mk_submit(): # ============================================================================= # load # ============================================================================= # train X_train = loader.train() col = [c for c in X_train.columns if c.startswith('f702_')] X_train.drop(col, axis=1, inplace=True) y_train = utils.read_pickles('../data/label').TARGET X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'), index=False, compression='gzip') if X_train.columns.duplicated().sum() > 0: raise Exception( f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }') print('no dup :) ') print(f'X_train.shape {X_train.shape}') gc.collect() CAT = list(set(X_train.columns) & set(loader.category())) print('CAT :', CAT) COL = X_train.columns.tolist() # test X_test = loader.test()[COL] # ============================================================================= # training with cv # ============================================================================= dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT, free_raw_data=False) model_all = [] y_pred = pd.Series(0, index=y_train.index) for i in range(LOOP): gc.collect() param['seed'] = i ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD, early_stopping_rounds=100, verbose_eval=50, seed=i) model_all += models y_pred += ex.eval_oob(X_train, y_train, models, i).rank() auc_mean = roc_auc_score(y_train, y_pred) result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}" print(result) utils.send_line(result) y_pred /= y_pred.max() auc_mean = roc_auc_score(y_train, y_pred) result = f"CV auc-mean: {auc_mean}" print(result) utils.send_line(result) # ============================================================================= # predict # ============================================================================= sub = pd.read_pickle('../data/sub.p') gc.collect() label_name = 'TARGET' sub[label_name] = 0 for model in model_all: y_pred = model.predict(X_test) sub[label_name] += 
pd.Series(y_pred).rank() sub[label_name] /= len(model_all) sub[label_name] /= sub[label_name].max() sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int) sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip') # ============================================================================= # submission # ============================================================================= if EXE_SUBMIT: print('submit') utils.submit(SUBMIT_FILE_PATH, COMMENT)
# NOTE(review): collapsed fragment kept byte-identical — it ends on a bare
# `for i, y_pred in enumerate(oofs):` whose body lies past this view, and
# `model_all`, `nround_mean`, `wloss_list`, `dtrain` are defined above it.
# The loop runs lgb.cv LOOP times on the 52/90 subset, collects OOF
# predictions and models, then scales the mean round count by 1.3 (the `/2`
# presumably matches LOOP == 2 — confirm) and reports mean/std of the final
# scores. NOTE(review): the metric key is 'auc-mean' but the report says
# "CV wloss" — looks like a copy-paste label; verify against the objective.
oofs = [] for i in range(LOOP): gc.collect() param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, early_stopping_rounds=100, verbose_eval=50, seed=SEED) y_pred = ex.eval_oob( X_52_90, y_52_90.values, models, SEED, stratified=True, shuffle=True, ) oofs.append(y_pred) model_all += models nround_mean += len(ret['auc-mean']) wloss_list.append(ret['auc-mean'][-1]) nround_mean = int((nround_mean / 2) * 1.3) utils.send_line(f'nround_mean: {nround_mean}') result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}" utils.send_line(result) for i, y_pred in enumerate(oofs):
# =============================================================================
# cv
# =============================================================================
# Single grouped-folds LightGBM CV; report the final AUC, then rank features
# by a combined (normalized split + gain) importance and persist the ranking.
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
gc.collect()

ret, models = lgb.cv(param, dtrain, 9999, folds=folds,
                     early_stopping_rounds=100, verbose_eval=50,
                     seed=SEED)

# out-of-fold predictions from the fold models
y_pred = ex.eval_oob(X, y, models, SEED)

result = f"CV auc-mean({COMMENT}): {ret['auc-mean'][-1]} + {ret['auc-stdv'][-1]}"
print(result)
utils.send_line(result)

# feature importance: normalize split and gain to [0, 1] and sum them
imp = ex.getImp(models)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

imp.to_csv('LOG/imp_f021.csv', index=False)
# NOTE(review): collapsed fragment that begins mid-argument-list — the
# `lgb.cv(` opening (and the enclosing loop defining `i`) lie before this
# view, so the code is kept byte-identical. It passes group_kfold folds
# alongside stratified/shuffle (folds takes precedence in lgb.cv — the
# stratified/shuffle arguments are presumably ignored; confirm), evaluates
# OOF with the same grouped splitter, and after the loop scales the mean
# round count by 1.3.
# NOTE(review): group_kfold.split(...) is a generator, re-created for
# eval_oob — verify both calls see identical splits.
param, dtrain, NROUND, # nfold=NFOLD, folds=group_kfold.split(X_train_, y_train_, group), stratified=True, shuffle=True, feval=ex.eval_auc, early_stopping_rounds=ESR, verbose_eval=VERBOSE_EVAL, seed=SEED + i) y_pred = ex.eval_oob(X_train_, y_train_.values, models, SEED + i, folds=group_kfold.split(X_train_, y_train_, group), stratified=True, shuffle=True) y_preds.append(y_pred) model_all += models nround_mean += len(ret['auc-mean']) loss_list.append(ret['auc-mean'][-1]) nround_mean = int((nround_mean / LOOP) * 1.3) # ============================================================================= # test # =============================================================================
# NOTE(review): collapsed fragment that begins inside a loop — the header
# defining `i` and `col` lies before this view, so the code is kept
# byte-identical. Each iteration runs stratified lgb.cv on the `col` feature
# subset, evaluates OOF AUC on the full training target and reports it via
# LINE; the fragment ends with the script epilogue (utils.end).
param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, NROUND, nfold=NFOLD, stratified=True, shuffle=True, feval=ex.eval_auc, early_stopping_rounds=ESR, verbose_eval=VERBOSE_EVAL, seed=SEED + i) p_train = ex.eval_oob(X_train[col], y_train.values, models, SEED + i, stratified=True, shuffle=True) model_all += models nround_mean += len(ret['auc-mean']) loss_list.append(ret['auc-mean'][-1]) utils.send_line( f'oof AUC({i}): {round(roc_auc_score(y_train, p_train), 5)}') #============================================================================== utils.end(__file__) #utils.stop_instance()
# =============================================================================
# cv
# =============================================================================
# One 7-fold LightGBM CV (fixed seed 111); report final AUC and pull raw
# feature importances from the fold models.
dtrain = lgb.Dataset(X, y)
gc.collect()

ret, models = lgb.cv(param, dtrain, 99999, nfold=7,
                     early_stopping_rounds=100, verbose_eval=50,
                     seed=111)
y_pred = ex.eval_oob(X, y, models, 111)

result = f"CV auc-mean: {ret['auc-mean'][-1]} + {ret['auc-stdv'][-1]}"
print(result)
utils.send_line(result)

imp = ex.getImp(models)

# =============================================================================
# cv loop
# =============================================================================
from sklearn.metrics import roc_auc_score

# free_raw_data=False so the same Dataset can be reused across seeded runs
dtrain = lgb.Dataset(X, y, free_raw_data=False)
gc.collect()
# =============================================================================
# cv loop
# =============================================================================
# Average rank-transformed OOF predictions over five seeds and report the
# resulting CV AUC (rank-averaging is safe: AUC is rank-invariant).
dtrain = lgb.Dataset(X_new, y, categorical_feature=CAT,
                     free_raw_data=False)
gc.collect()

y_pred = pd.Series(0, index=y.index)
for i in range(5):
    ret, models = lgb.cv(param, dtrain, 99999, nfold=7,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=i)
    y_pred += ex.eval_oob(X_new, y, models, i).rank()

y_pred /= y_pred.max()
auc_mean = roc_auc_score(y, y_pred)
result = f"CV auc-mean(ext imp): {auc_mean}"
print(result)
utils.send_line(result)

#==============================================================================
utils.end(__file__)
#utils.stop_instance()
# mk_submit(): submission pipeline variant that first removes "old users"
# (rows whose SK_ID_CURR is in drop_ids) from the training set, keeps their
# ids in oof_train, then runs LOOP seeded LightGBM CVs with rank-averaged
# OOF AUC, saves the OOF predictions to ../output/onodera-last-oof.csv,
# rank-averages fold-model test predictions into sub['TARGET'] and writes
# SUBMIT_FILE_PATH (gzip), optionally submitting when EXE_SUBMIT.
# NOTE(review): left byte-identical — the source is collapsed onto single
# lines and the indentation of the trailing `if EXE_SUBMIT:` block (inside
# the function vs module level after it) cannot be determined from this view.
def mk_submit(): files_tr = ('../feature/train_' + features + '.f').tolist() files_te = ('../feature/test_' + features + '.f').tolist() # ============================================================================= # load # ============================================================================= # train X_train = pd.concat( [pd.read_feather(f) for f in tqdm(files_tr, mininterval=60)], axis=1) y_train = utils.read_pickles('../data/label').TARGET # ============================================================================= # remove old users # ============================================================================= X_train['SK_ID_CURR'] = SK_ID_CURR y_train = y_train[~X_train.SK_ID_CURR.isin(drop_ids)] X_train = X_train[~X_train.SK_ID_CURR.isin(drop_ids)] oof_train = X_train[['SK_ID_CURR']] X_train.drop('SK_ID_CURR', axis=1, inplace=True) X_train.head().to_csv(SUBMIT_FILE_PATH.replace('.csv', '_X.csv'), index=False, compression='gzip') if X_train.columns.duplicated().sum() > 0: raise Exception( f'duplicated!: { X_train.columns[X_train.columns.duplicated()] }') print('no dup :) ') print(f'X_train.shape {X_train.shape}') gc.collect() CAT = list(set(X_train.columns) & set(utils_cat.ALL)) COL = X_train.columns.tolist() # test X_test = pd.concat( [pd.read_feather(f) for f in tqdm(files_te, mininterval=60)], axis=1)[COL] # ============================================================================= # training with cv # ============================================================================= dtrain = lgb.Dataset(X_train, y_train, categorical_feature=CAT, free_raw_data=False) model_all = [] y_pred = pd.Series(0, index=y_train.index) for i in range(LOOP): gc.collect() param['seed'] = i ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD, early_stopping_rounds=100, verbose_eval=50, seed=i) model_all += models y_pred += ex.eval_oob(X_train, y_train, models, i).rank() auc_mean = roc_auc_score(y_train, y_pred) result = f"CV auc-mean(loop {i}): {auc_mean}" 
print(result) utils.send_line(result) y_pred /= y_pred.max() auc_mean = roc_auc_score(y_train, y_pred) result = f"CV auc-mean: {auc_mean}" print(result) utils.send_line(result) # save oof_train['oof'] = y_pred oof_train.to_csv('../output/onodera-last-oof.csv', index=False) # ============================================================================= # predict # ============================================================================= sub = pd.read_pickle('../data/sub.p') gc.collect() label_name = 'TARGET' sub[label_name] = 0 for model in model_all: y_pred = model.predict(X_test) sub[label_name] += pd.Series(y_pred).rank() sub[label_name] /= len(model_all) sub[label_name] /= sub[label_name].max() sub['SK_ID_CURR'] = sub['SK_ID_CURR'].map(int) sub.to_csv(SUBMIT_FILE_PATH, index=False, compression='gzip') # ============================================================================= # submission # ============================================================================= if EXE_SUBMIT: print('submit') utils.submit(SUBMIT_FILE_PATH, COMMENT)
# =============================================================================
# cv
# =============================================================================
# Single grouped-folds LightGBM CV; report final AUC and build a combined
# (normalized split + gain) feature-importance ranking.
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
gc.collect()

ret, models = lgb.cv(param, dtrain, 9999, folds=folds,
                     early_stopping_rounds=100, verbose_eval=50,
                     seed=SEED)

# out-of-fold predictions from the fold models
y_pred = ex.eval_oob(X, y, models, SEED)

result = f"CV auc-mean({COMMENT}): {ret['auc-mean'][-1]} + {ret['auc-stdv'][-1]}"
print(result)
utils.send_line(result)

# normalize split and gain to [0, 1], then rank by their sum
imp = ex.getImp(models)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)

# =============================================================================
# NOTE(review): collapsed fragment that begins inside a loop — the header
# (defining the iteration and `oofs_gal`, `nround_mean`, `wloss_list`) lies
# before this view, so the code is kept byte-identical. Each iteration runs
# grouped lgb.cv on the galactic subset with the weighted-loss objective and
# stores the OOF prediction; `model_all += models` is commented out. The `/2`
# presumably matches a 2-iteration loop — confirm.
# NOTE(review): `n_class=True` passes a bool where the sibling fragments pass
# a class count (e.g. y.unique().shape[0]) — verify ex.eval_oob's contract.
gc.collect() param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, fobj=utils_metric.wloss_objective_gal, feval=utils_metric.wloss_metric_gal, early_stopping_rounds=100, verbose_eval=50, folds=group_kfold.split(X_gal, y_gal, group_gal), seed=SEED) oof = ex.eval_oob(X_gal, y_gal.values, models, SEED, stratified=True, shuffle=True, n_class=True) oofs_gal.append(oof) # model_all += models nround_mean += len(ret['wloss-mean']) wloss_list.append(ret['wloss-mean'][-1]) nround_mean = int((nround_mean / 2) * 1.3) utils.send_line(f'nround_mean: {nround_mean}') result = f"CV GAL wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}" utils.send_line(result) # =============================================================================
# NOTE(review): collapsed fragment that begins inside a loop — it indexes
# `feature_set[i]` with an `i` defined by a header before this view, so the
# code is kept byte-identical. Each iteration runs lgb.cv with the weighted
# loss on one feature subset, stores the OOF prediction and models; after the
# loop the mean round count is scaled by 1.3 (divided by MOD_N, the apparent
# number of feature subsets — confirm).
# NOTE(review): `n_class=True` passes a bool where sibling fragments pass a
# class count — verify ex.eval_oob's contract.
gc.collect() param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, fobj=utils_metric.wloss_objective, feval=utils_metric.wloss_metric, early_stopping_rounds=100, verbose_eval=50, seed=SEED) y_pred = ex.eval_oob(X[feature_set[i]], y.values, models, SEED, stratified=True, shuffle=True, n_class=True) y_preds.append(y_pred) model_all += models nround_mean += len(ret['wloss-mean']) wloss_list.append(ret['wloss-mean'][-1]) nround_mean = int((nround_mean / MOD_N) * 1.3) utils.send_line(f'nround_mean: {nround_mean}') result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}" print(result) utils.send_line(result)
# Seed-averaged OOF predictions (adversarial-validation feature f190_adv):
# run LOOP seeded CVs, sum raw OOF probabilities, report per-loop and final
# AUC, then average and expose the result as a one-column DataFrame.
dtrain = lgb.Dataset(X, y, categorical_feature=CAT,
                     free_raw_data=False)
model_all = []
y_pred = pd.Series(0, index=y.index)
for i in range(LOOP):
    gc.collect()
    param['seed'] = i
    ret, models = lgb.cv(param, dtrain, 9999, nfold=NFOLD,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=i)
    model_all += models
    y_pred += ex.eval_oob(X, y, models, i)
    # running AUC on the cumulative sum (rank-invariant, so valid per loop)
    auc_mean = roc_auc_score(y, y_pred)
    result = f"CV auc-mean(loop {i}): {auc_mean} {ret['auc-mean'][-1]}"
    print(result)
    utils.send_line(result)

y_pred /= LOOP
auc_mean = roc_auc_score(y, y_pred)
result = f"CV auc-mean: {auc_mean}"
print(result)
utils.send_line(result)

y_pred.name = 'f190_adv'
y_pred = y_pred.to_frame()
# NOTE(review): collapsed fragment that begins inside a loop — the header
# (defining the iteration and `y_preds`, `model_all`, `nround_mean`,
# `wloss_list`, `COL`) lies before this view, so the code is kept
# byte-identical. Each iteration runs lgb.cv with the weighted-loss
# objective on the COL feature subset and stores the OOF prediction and
# models; after the loop the mean round count is scaled by 1.3 (the `/2`
# presumably matches a 2-iteration loop — confirm) and mean/std of the
# final wloss values are reported.
# NOTE(review): `n_class=True` passes a bool where sibling fragments pass a
# class count — verify ex.eval_oob's contract.
gc.collect() param['seed'] = np.random.randint(9999) ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD, fobj=utils_metric.wloss_objective, feval=utils_metric.wloss_metric, early_stopping_rounds=100, verbose_eval=50, seed=SEED) y_pred = ex.eval_oob(X[COL], y.values, models, SEED, stratified=True, shuffle=True, n_class=True) y_preds.append(y_pred) model_all += models nround_mean += len(ret['wloss-mean']) wloss_list.append(ret['wloss-mean'][-1]) nround_mean = int((nround_mean / 2) * 1.3) utils.send_line(f'nround_mean: {nround_mean}') result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}" print(result) utils.send_line(result)
# Seed-randomized CV loop: run lgb.cv LOOP times, collect all fold models,
# accumulate the round counts and the final AUC of each run, then summarize.
gc.collect()
model_all = []
nround_mean = 0
auc_list = []
for i in range(LOOP):
    gc.collect()
    param['seed'] = np.random.randint(9999)
    ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD,
                         early_stopping_rounds=100, verbose_eval=50,
                         seed=SEED)
    # OOF prediction is computed for its side effects in ex.eval_oob
    # (the return value is not used below)
    y_pred = ex.eval_oob(X, y, models, SEED, stratified=True, shuffle=True)
    model_all += models
    nround_mean += len(ret['auc-mean'])
    auc_list.append(ret['auc-mean'][-1])
nround_mean = int((nround_mean / LOOP) * 1.3)

# BUG FIX: the summary previously took np.mean/np.std of the scalar
# `nround_mean` (std was always 0, the collected scores were ignored) and
# mislabeled the AUC metric as "wloss"; report the per-run AUCs instead.
result = f"CV auc: {np.mean(auc_list)} + {np.std(auc_list)}"
print(result)
utils.send_line(result)

# normalize split and gain importances to [0, 1] and combine them
imp = ex.getImp(model_all)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']