def get_imp(shuffle=True, loop=1):
    """Run LightGBM CV `loop` times and return pooled, normalized feature importances.

    Parameters
    ----------
    shuffle : bool
        If True, train against a permuted copy of the target (a
        "null importance" run); if False, use the true target.
    loop : int
        Number of CV repetitions; fold models from every repetition
        are pooled before importances are computed.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'feature', with 'split' and 'gain' each scaled so the
        maximum is 1.0, plus their sum 'total'; sorted by 'total' descending.

    Notes
    -----
    Relies on module-level globals: ``X``, ``y``, ``param``, ``NFOLD``,
    ``SEED``, ``lgb``, ``gc``, ``np``, ``ex``, ``utils_metric``.
    Mutates the shared ``param`` dict (its 'seed' key).
    """
    # A shuffled target yields "null" importances to compare real ones against.
    label = y.sample(frac=1).values if shuffle else y.values
    dtrain = lgb.Dataset(X, label, free_raw_data=False)
    gc.collect()

    model_all = []
    for i in range(loop):
        gc.collect()
        # NOTE(review): mutates the module-level `param` dict; lgb.cv is also
        # given its own `seed=SEED + i`, so this extra randomization may be
        # redundant — kept for parity with the original behavior.
        param['seed'] = np.random.randint(9999)
        ret, models = lgb.cv(param, dtrain, 99999, nfold=NFOLD,
                             fobj=utils_metric.wloss_objective,
                             feval=utils_metric.wloss_metric,
                             early_stopping_rounds=100, verbose_eval=200,
                             seed=SEED + i)
        model_all += models

    imp = ex.getImp(model_all)
    # Normalize both importance measures to [0, 1] so they can be summed
    # into a single combined ranking.
    imp['split'] /= imp['split'].max()
    imp['gain'] /= imp['gain'].max()
    imp['total'] = imp['split'] + imp['gain']
    imp.sort_values('total', ascending=False, inplace=True)
    imp.reset_index(drop=True, inplace=True)
    return imp.set_index('feature')
# Keep only the known categorical columns that actually appear in X.
CAT = list(set(X.columns) & set(utils_cat.ALL))

# Guard: duplicated feature names would silently corrupt training.
if X.columns.duplicated().sum() > 0:
    raise Exception(f'duplicated!: { X.columns[X.columns.duplicated()] }')
print('no dup :) ')
print(f'X.shape {X.shape}')

gc.collect()

# =============================================================================
# imp
# =============================================================================
# Single quick fit (1000 rounds) used only to rank features by importance.
dtrain = lgb.Dataset(X, y, categorical_feature=CAT)
model = lgb.train(params, dtrain, 1000)

imp = ex.getImp(model).sort_values(['gain', 'feature'], ascending=[False, True])

# Candidate pool: every feature used in at least one split.
# The stepwise search is seeded with the top-20 by gain.
features_search = imp[imp['split'] > 0].feature.tolist()
features_curr = features_search[:20]

# =============================================================================
# stepwise
# =============================================================================
# Greedy forward feature selection driven by AUC; progress is pushed via LINE.
ex.stepwise(params, X, y, features_search, features_curr, best_score=0,
            send_line=utils.send_line,
            shuffle=True, feval=ex.eval_auc, early_stopping_rounds=ESR,
            verbose_eval=VERBOSE_EVAL, categorical_feature=CAT, seed=SEED)

# NOTE(review): `models` and `ret` are not defined anywhere in this chunk —
# presumably they come from a CV/training step elsewhere in the file (the
# commented-out loader below hints at that). Verify before running standalone.
for i, model in enumerate(models):
    model.save_model(f'../data/lgb{i}.model')

# models = []
# for i in range(LOOP):
#     model = lgb.Booster(model_file=f'../data/lgb{i}.model')
#     models.append(model)

# Pool importances over all fold models; normalize both measures to [0, 1]
# so they can be summed into one combined ranking, then persist CSV + plot.
imp = ex.getImp(models)
imp['split'] /= imp['split'].max()
imp['gain'] /= imp['gain'].max()
imp['total'] = imp['split'] + imp['gain']
imp.sort_values('total', ascending=False, inplace=True)
imp.reset_index(drop=True, inplace=True)
imp.to_csv(f'LOG/imp_{__file__}.csv', index=False)
utils.savefig_imp(imp, f'LOG/imp_{__file__}.png', x='total')

# Record run metadata for the experiment log.
# NOTE(review): `param` here vs `params` used for training above — these look
# like two different dicts; confirm which one actually drove this run.
RESULT_DICT['nfold'] = NFOLD
RESULT_DICT['seed'] = SEED
RESULT_DICT['eta'] = param['learning_rate']
RESULT_DICT['best NROUND'] = len(ret['auc-mean'])
RESULT_DICT['train AUC'] = ret['auc-mean'][-1]
feval=utils_metric.wloss_metric, early_stopping_rounds=100, verbose_eval=50, seed=SEED) y_pred = ex.eval_oob(X, y, models, SEED, stratified=True, shuffle=True, n_class=y.unique().shape[0]) y_preds.append(y_pred) model_all += models nround_mean += len(ret['wloss-mean']) wloss_list.append( ret['wloss-mean'][-1] ) nround_mean = int((nround_mean/1) * 1.3) result = f"CV wloss: {np.mean(wloss_list)} + {np.std(wloss_list)}" print(result) imp = ex.getImp(model_all) imp['split'] /= imp['split'].max() imp['gain'] /= imp['gain'].max() imp['total'] = imp['split'] + imp['gain'] imp.sort_values('total', ascending=False, inplace=True) imp.reset_index(drop=True, inplace=True) imp.to_csv(f'LOG/imp_{__file__}.csv', index=False) png = f'LOG/imp_{__file__}.png' utils.savefig_imp(imp, png, x='total', title=f'{__file__}') utils.send_line(result, png) for i,y_pred in enumerate(y_preds):