# NOTE(review): collapsed tail of an (apparently older) revision of the
# result-output stage: averages the per-fold CV scores, builds and sends a
# notification message, then writes CV predictions, the submission, and the
# feature-importance table/plot into a timestamped RESULT_DIR.
# - `val_df` is assigned the return value of DataFrame.to_csv(), which is
#   None — the name is never usable afterwards; the call is kept for its
#   file-writing side effect only.
# - Relies on names defined elsewhere in the file (cv_results, feats,
#   lgb_params, fit_params, y_train, val_series, test_df, feat_df, OUTPUT,
#   NAME, timer, timestamp, generate_submit, send_line_notification, plt) —
#   the enclosing scope is outside this view.
valid_score = np.mean(cv_results) message = f"""cv: {valid_score:.5f} feats: {feats} model_params: {lgb_params} fit_params: {fit_params}""" send_line_notification(message) print('=' * 60) print(message) print('=' * 60) with timer('output results'): RESULT_DIR = OUTPUT / (timestamp() + '_' + NAME) RESULT_DIR.mkdir() val_df = pd.DataFrame({ 'TARGET': y_train, 'p': val_series }).to_csv(RESULT_DIR / f'{NAME}_cv_pred.csv', index=False) pred = test_df.mean(axis=1).ravel() generate_submit(pred, f'{NAME}_{valid_score:.5f}', RESULT_DIR) print('output feature importances') feat_df.mean(axis=1).sort_values(ascending=False).to_csv(RESULT_DIR / 'feats.csv') imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50] imp[::-1].plot.barh(figsize=(20, 15)) plt.savefig(str(RESULT_DIR / 'feature_importances.pdf'), bbox_inches='tight')
# NOTE(review): collapsed fragment of yet another revision; it begins
# MID-LOOP with `feat_df[i] = model.feature_importances_` — the enclosing
# fold loop (defining `i` and `model`) is outside this view, so this line
# cannot be restructured safely in isolation.
# Differences from the newer revision below:
# - optional rank-averaging of test predictions via `rankdata` (presumably
#   scipy.stats.rankdata — confirm against the file's imports) gated by a
#   `rank_average` flag defined elsewhere;
# - the format spec `{valid_score: .5f}` contains a space, so the message
#   renders with a leading space before the score (likely a typo, but it is
#   runtime text and is left untouched here);
# - outputs go to fixed OUTPUT / DROPBOX paths rather than a per-run
#   timestamped RESULT_DIR;
# - `val_df` is again assigned the None return of DataFrame.to_csv().
feat_df[i] = model.feature_importances_ val_df = pd.DataFrame({ 'TARGET': y_train, 'p': val_series }).to_csv(OUTPUT / f'{NAME}_cv_pred.csv', index=False) valid_score = np.mean(cv_results) message = f"""cv: {valid_score: .5f} feats: {feats} model_params: {lgb_params} fit_params: {fit_params}""" send_line_notification(message) print('=' * 60) print(message) print('=' * 60) if rank_average: pred = test_df.apply(lambda x: rankdata(x) / len(x)).mean(axis=1).ravel() else: pred = test_df.mean(axis=1).ravel() generate_submit(pred, f'{NAME}_{valid_score:.5f}') print('output feature importances') feat_df.mean(axis=1).sort_values(ascending=False).to_csv(OUTPUT / f'{NAME}_feat.csv') imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50] imp[::-1].plot.barh(figsize=(20, 15)) plt.savefig(str(DROPBOX / f'{NAME}_feature_importances.pdf'), bbox_inches='tight')
def run(name, feats, params, fit_params, fill=-9999):
    """Train a LightGBM classifier with cross-validation and write results.

    Loads the feature set, drops ID and mostly-NaN columns, imputes missing
    values, runs fold-wise target encoding + LightGBM training, then writes
    CV predictions, a submission file, and feature importances into a
    timestamped result directory.

    Parameters
    ----------
    name : str
        Experiment name; used for the logger, the result directory and
        output file names.
    feats : feature-set spec passed through to ``load_dataset``.
    params : dict
        ``lgb.LGBMClassifier`` constructor parameters. A value of
        ``'auto'`` for ``colsample_bytree`` is replaced by
        ``sqrt(n_features) / n_features``. NOTE: mutated in place.
    fit_params : dict
        Extra keyword arguments forwarded to ``model.fit``.
    fill : scalar or 'mean', default -9999
        Missing-value imputation: ``'mean'`` uses per-column TRAIN means
        (also applied to test, to avoid leakage); any other value is used
        as a constant fill.
    """
    # One stream handler at DEBUG level. (The original created a second,
    # unused StreamHandler `ch` and called setLevel twice — dead code.)
    logger = getLogger(name)
    logger.setLevel(DEBUG)
    handler = StreamHandler()
    handler.setLevel(DEBUG)
    logger.addHandler(handler)

    # Raw train table; only its SK_ID_CURR column is used later for the
    # CV-prediction CSV.
    train = pd.read_feather(str(TRAIN))

    with timer('load datasets'):
        X_train, y_train, X_test, cv = load_dataset(feats)
        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('clean datasets'):
        # drop id cols
        id_cols = X_train.filter(regex='(SK_ID_CURR|SK_ID_PREV)').columns
        print('drop id:', id_cols.tolist())
        X_train.drop(id_cols, axis=1, inplace=True)
        X_test.drop(id_cols, axis=1, inplace=True)
        # drop columns that are >95% NaN in either train or test
        ref_train = X_train.isnull().mean() > 0.95
        ref_test = X_test.isnull().mean() > 0.95
        nan_cols = X_train.columns[ref_train | ref_test]
        print('drop many nan:', nan_cols.tolist())
        X_train.drop(nan_cols, axis=1, inplace=True)
        X_test.drop(nan_cols, axis=1, inplace=True)
        print('train:', X_train.shape)
        print('test :', X_test.shape)

    with timer('impute missing'):
        if fill == 'mean':
            # Compute TRAIN means once and reuse for both frames (the
            # original recomputed X_train.mean() for the test fill).
            # Test is imputed with train statistics to avoid leakage.
            means = X_train.mean()
            assert means.isnull().sum() == 0
            print('fill nan with mean')
            X_train.fillna(means, inplace=True)
            X_test.fillna(means, inplace=True)
        else:
            print(f'fill nan with {fill}')
            X_train.fillna(fill, inplace=True)
            X_test.fillna(fill, inplace=True)
        assert X_train.isnull().sum().sum() == 0
        assert X_test.isnull().sum().sum() == 0

    if 'colsample_bytree' in params and params['colsample_bytree'] == 'auto':
        # Sample sqrt(n_features) features per tree, expressed as a
        # fraction. (Was misnamed `n_samples`: shape[1] is the feature
        # count, not the sample count.)
        n_features = X_train.shape[1]
        params['colsample_bytree'] = np.sqrt(n_features) / n_features
        print(f'set colsample_bytree = {params["colsample_bytree"]}')

    with timer('training'):
        cv_results = []
        cv_df = pd.DataFrame(index=range(len(y_train)),
                             columns=range(cv.get_n_splits()))
        test_df = pd.DataFrame()
        feat_df = None
        for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
            X_trn = X_train.iloc[trn_idx].copy()
            y_trn = y_train[trn_idx]
            X_val = X_train.iloc[val_idx].copy()
            y_val = y_train[val_idx]
            X_tst = X_test.copy()
            print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30)

            with timer('target encoding'):
                # Fit the encoder on the training fold only; apply it to
                # val/test. (Original transformed X_test instead of the
                # fold copy X_tst — same values, but inconsistent.)
                cat_cols = X_trn.select_dtypes(['object']).columns.tolist()
                te = TargetEncoder(cols=cat_cols)
                X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols], y_trn)
                X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols])
                X_tst.loc[:, cat_cols] = te.transform(X_tst.loc[:, cat_cols])

            with timer('fit'):
                model = lgb.LGBMClassifier(**params)
                model.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], **fit_params)

            # Out-of-fold prediction + AUC for this fold.
            p = model.predict_proba(X_val)[:, 1]
            cv_df.loc[val_idx, i] = p
            cv_results.append(roc_auc_score(y_val, p))
            test_df[i] = model.predict_proba(X_tst)[:, 1]
            if feat_df is None:
                feat_df = pd.DataFrame(index=X_trn.columns)
            feat_df[i] = model.feature_importances_

    valid_score = np.mean(cv_results)
    message = f"""cv: {valid_score:.5f}
scores: {[round(c, 4) for c in cv_results]}
feats: {feats}
model_params: {params}
fit_params: {fit_params}"""
    send_line_notification(message)

    with timer('output results'):
        RESULT_DIR = OUTPUT / (timestamp() + '_' + name)
        RESULT_DIR.mkdir()
        # output cv prediction
        tmp = pd.DataFrame({
            'SK_ID_CURR': train['SK_ID_CURR'],
            'TARGET': cv_df.mean(axis=1)
        })
        # index=False (was index=None, which pandas merely treats as falsy)
        tmp.to_csv(RESULT_DIR / f'{name}_cv.csv', index=False)
        # output test prediction
        pred = test_df.mean(axis=1).ravel()
        generate_submit(pred, f'{name}_{valid_score:.5f}', RESULT_DIR, compression=False)
        # output feature importances, scaled so each fold's mean is 100
        feat_df = (feat_df / feat_df.mean(axis=0)) * 100
        feat_df.mean(axis=1).sort_values(ascending=False).to_csv(RESULT_DIR / 'feats.csv')
        imp = feat_df.mean(axis=1).sort_values(ascending=False)[:50]
        imp[::-1].plot.barh(figsize=(20, 15))
        plt.savefig(str(RESULT_DIR / 'feature_importances.pdf'), bbox_inches='tight')

    print('=' * 60)
    print(message)
    print('=' * 60)