def hold_out_lgb_validation(X, y, params, eval_metric='mae', columns=None,
                            plot_feature_importance=False, verbose=10000,
                            early_stopping_rounds=200, n_estimators=50000):
    columns = X.columns if columns is None else columns

    # set up scoring parameters
    metrics_dict = {
        'mae': {'lgb_metric_name': 'mae',
                'catboost_metric_name': 'MAE',
                'sklearn_scoring_function': metrics.mean_absolute_error},
        'group_mae': {'lgb_metric_name': 'mae',
                      'catboost_metric_name': 'MAE',
                      'scoring_function': group_mean_log_mae},
        'mse': {'lgb_metric_name': 'mse',
                'catboost_metric_name': 'MSE',
                'sklearn_scoring_function': metrics.mean_squared_error},
    }

    result_dict = {}
    X_train, X_valid, y_train, y_valid = train_test_split(
        X[columns], y, test_size=0.1, random_state=42)

    eval_result = {}
    callbacks = [lgb.record_evaluation(eval_result)]
    model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1)
    # note: in LightGBM >= 4.0 the verbose/early_stopping_rounds fit kwargs
    # were removed in favour of callbacks; this uses the pre-4.0 API
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
              verbose=verbose,
              early_stopping_rounds=early_stopping_rounds,
              callbacks=callbacks)
    y_pred_valid = model.predict(X_valid)

    if eval_metric != 'group_mae':
        score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
    else:
        score = metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type'])

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
    else:
        feature_importance = None

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO} HOLD_OUT score: {score:.4f}.'
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["model"] = model
    result_dict['y_pred_valid'] = pd.DataFrame(y_pred_valid, index=X_valid.index,
                                               columns=["scalar_coupling_constant"])
    result_dict['score'] = score
    result_dict["importance"] = feature_importance
    result_dict["eval_result"] = eval_result
    return result_dict
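# The metric helper group_mean_log_mae is referenced above but not defined in
# this excerpt. For reference, a minimal sketch matching the CHAMPS competition
# metric (mean over coupling types of log(per-type MAE), floored to avoid
# log(0)) looks like this; it assumes `types` is index-aligned with `y_true`:
import numpy as np


def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    """Mean over groups (coupling types) of log(MAE within the group)."""
    maes = (y_true - y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()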
# Classifier variant; renamed from hold_out_lgb_validation so it no longer
# shadows the regression version defined above (which train_main calls).
def hold_out_lgb_validation_clf(X_train, X_valid, y_train, y_valid, params,
                                columns=None, plot_feature_importance=False,
                                verbose=10000, early_stopping_rounds=200,
                                n_estimators=50000):
    # the original referenced an undefined X here; use the training frame
    columns = X_train.columns if columns is None else columns
    result_dict = {}

    model = lgb.LGBMClassifier(**params, n_estimators=n_estimators, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric="auc",
              verbose=verbose,
              early_stopping_rounds=early_stopping_rounds)
    # score AUC on class probabilities rather than hard labels
    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, y_pred_valid)

    if plot_feature_importance:
        # feature importance
        feature_importance = pd.DataFrame()
        feature_importance["feature"] = columns
        feature_importance["importance"] = model.feature_importances_
    else:
        feature_importance = None

    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO} HOLD_OUT score: {score:.4f}.'
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["model"] = model
    # column name inherited from the regression variant
    result_dict['y_pred_valid'] = pd.DataFrame(y_pred_valid, index=X_valid.index,
                                               columns=["scalar_coupling_constant"])
    result_dict['score'] = score
    result_dict["importance"] = feature_importance
    return result_dict
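# A hedged usage sketch for the two hold-out helpers above. This is not part
# of the original pipeline; `lgb_params` and `y_binary` are hypothetical names
# for a parameter dict and a binary target, and X is assumed to carry a 'type'
# column (needed by the 'group_mae' metric).
def _example_holdout_runs(X, y, y_binary, lgb_params):
    # regression: the 90/10 split happens inside the helper
    res_reg = hold_out_lgb_validation(X, y, params=lgb_params,
                                      eval_metric='group_mae',
                                      plot_feature_importance=True)
    # classification: the caller supplies the split
    X_tr, X_va, y_tr, y_va = train_test_split(X, y_binary, test_size=0.1,
                                              random_state=42)
    res_clf = hold_out_lgb_validation_clf(X_tr, X_va, y_tr, y_va,
                                          params=lgb_params)
    return res_reg['score'], res_clf['score']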
def train_model_regression(X, X_test, y, params, folds, model_type='lgb',
                           eval_metric='mae', columns=None,
                           plot_feature_importance=False, model=None,
                           verbose=10000, early_stopping_rounds=200,
                           n_estimators=50000, mol_type=-1, fold_group=None):
    """
    A function to train a variety of regression models.
    Returns a dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # set up scoring parameters
    metrics_dict = {
        'mae': {'lgb_metric_name': 'mae',
                'catboost_metric_name': 'MAE',
                'sklearn_scoring_function': metrics.mean_absolute_error},
        'group_mae': {'lgb_metric_name': 'mae',
                      'catboost_metric_name': 'MAE',
                      'scoring_function': group_mean_log_mae},
        'mse': {'lgb_metric_name': 'mse',
                'catboost_metric_name': 'MSE',
                'sklearn_scoring_function': metrics.mean_squared_error},
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    # averaged predictions on test data
    prediction = np.zeros(len(X_test))
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups=fold_group)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=n_estimators,
                                      n_jobs=-1, importance_type='gain')
            print(model)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            # "reg:linear" is a deprecated alias of "reg:squarederror" in newer XGBoost
            params["objective"] = "reg:linear"
            params["eval_metric"] = metrics_dict[eval_metric]['lgb_metric_name']
            model = xgb.train(dtrain=train_data, num_boost_round=20000,
                              evals=watchlist, early_stopping_rounds=200,
                              verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                      cat_features=[], use_best_model=True, verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

        model_list += [model]

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits

    try:
        cv_score_msg = (f'{DATA_VERSION}_{TRIAL_NO}'
                        + ' CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    # if model_type == 'lgb':
    #     if plot_feature_importance:
    #         feature_importance["importance"] /= folds.n_splits
    #         cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
    #             by="importance", ascending=False)[:50].index
    #         best_features = feature_importance.loc[feature_importance.feature.isin(cols)]
    #         plt.figure(figsize=(16, 12))
    #         sns.barplot(x="importance", y="feature",
    #                     data=best_features.sort_values(by="importance", ascending=False))
    #         plt.title('LGB Features (avg over folds)')
    #         feature_importance.to_csv(log_path / f"importance_{mol_type}.csv")

    return result_dict
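# A hedged usage sketch for train_model_regression with molecule-level
# GroupKFold, mirroring how fold_group is passed elsewhere in this script.
# Not part of the original pipeline; `lgb_params` is a hypothetical name and
# `mol_name` is assumed to be a Series of molecule identifiers aligned with X.
def _example_cv_run(X, X_test, y, mol_name, lgb_params):
    from sklearn.model_selection import GroupKFold
    # group by molecule so atoms of one molecule never straddle train/valid
    folds = GroupKFold(n_splits=5)
    result = train_model_regression(X=X, X_test=X_test, y=y, params=lgb_params,
                                    folds=folds, model_type='lgb',
                                    eval_metric='group_mae',
                                    plot_feature_importance=True,
                                    verbose=1000, early_stopping_rounds=200,
                                    n_estimators=10000,
                                    fold_group=mol_name.values)
    return result['oof'], result['prediction'], result['scores']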
importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv'
result_dict["importance"].to_csv(importance_path, index=True)

for t, s in zip(X['type'].unique(), score_list):
    print(f"type {t}, score: {s:0.5f}")

if TRAIN_ALL_DATA or CV_FOLD:
    #########################################################################
    # create oof & submission file.
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['scalar_coupling_constant'] = X_short_test['prediction']
    sub.to_csv(submit_path / f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
               index=False)
    print(sub.head())
    send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

if CV_FOLD:
    oof_log_mae = group_mean_log_mae(X_short['target'], X_short['oof'],
                                     X_short['type'], floor=1e-9)
    print(f"oof_log_mae: {oof_log_mae}")
    df_oof = pd.DataFrame(index=train.id)
    df_oof["scalar_coupling_constant"] = X_short['oof']
    df_oof.to_csv(submit_path / f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                  index=True)
    send_message(
        f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}")
        verbose=500, early_stopping_rounds=200, n_estimators=15000, mol_type=t)

    X_short.loc[X_short['type'] == t, 'oof'] = result_dict_lgb3['oof']
    X_short_test.loc[X_short_test['type'] == t, 'prediction'] = result_dict_lgb3['prediction']
    X_short.to_csv(submit_path / f"tmp_oof_{t}.csv")
    X_short_test.to_csv(submit_path / f"tmp_sub_{t}.csv")

to_pickle(model_path / f"second_model_list_{DATA_VERSION}_{TRIAL_NO}.pkl",
          result_dict_lgb3["models"])

sub['scalar_coupling_constant'] = X_short_test['prediction']
sub.to_csv(submit_path / f'submission_t_{DATA_VERSION}_{TRIAL_NO}.csv', index=False)
print(sub.head())

oof_log_mae = group_mean_log_mae(X_short['target'], X_short['oof'],
                                 X_short['type'], floor=1e-9)
print(f"oof_log_mae: {oof_log_mae}")
print(f"finished. : {current_time()}")

df_oof = pd.DataFrame(index=train.id)
df_oof["scalar_coupling_constant"] = X_short['oof']
df_oof.to_csv(submit_path / f'oof_{DATA_VERSION}_{TRIAL_NO}.csv', index=True)
send_message(f"finish train_{DATA_VERSION}_{TRIAL_NO}, oof_log_mae: {oof_log_mae}")
def train_main(seed, type_):
    print(f"==================== seed: {seed} ====================")
    params = {
        # 'num_leaves': 128,
        'min_child_samples': 79,
        'objective': 'regression',
        'max_depth': -1,  # 9,
        'learning_rate': 0.2,
        "boosting_type": "gbdt",
        "subsample_freq": 1,
        "subsample": 0.9,
        "metric": 'mae',
        "verbosity": -1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.3,
        'colsample_bytree': 1.0,
        'num_threads': -1,
    }
    params["seed"] = seed
    params["bagging_seed"] = seed + 1
    params["feature_fraction_seed"] = seed + 2

    n_estimators = 10000
    params["num_leaves"] = 256
    if DEBUG:
        n_estimators = 5

    X_short = pd.DataFrame({
        'ind': list(X.index),
        'type': X['type'].values,
        'oof': [0] * len(X),
        'target': y.values,
        'fc': y_fc.values,
    })
    X_short_test = pd.DataFrame({
        'ind': list(X_test.index),
        'type': X_test['type'].values,
        'prediction': [0] * len(X_test),
    })

    print(f'{current_time()} Training of type {type_} / {X["type"].unique()}')
    X_t = X.loc[X['type'] == type_]
    X_test_t = X_test.loc[X_test['type'] == type_]
    y_fc_t = X_short.loc[X_short['type'] == type_, 'fc']
    y_t = X_short.loc[X_short['type'] == type_, 'target']
    mol_name_t = mol_name.loc[X['type'] == type_][X_t.index] if GROUP_K_FOLD else None
    print(f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}")

    #########################################################################
    # 1st layer model: fc (Fermi contact contribution)
    print("=" * 30 + " fc " + "=" * 30)
    result_dict_lgb1 = train_model_regression(
        X=X_t, X_test=X_test_t, y=y_fc_t, params=params, folds=folds,
        model_type='lgb', eval_metric='group_mae',
        plot_feature_importance=False, verbose=1000,
        early_stopping_rounds=200, n_estimators=n_estimators,
        # use the per-type groups; the full mol_name array would not match
        # X_t's length under GroupKFold
        fold_group=mol_name_t)
    # write back only this type's rows (the oof arrays have len(X_t))
    X.loc[X['type'] == type_, 'oof_fc'] = result_dict_lgb1['oof']
    X_test.loc[X_test['type'] == type_, 'oof_fc'] = result_dict_lgb1['prediction']

    to_pickle(submit_path / f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
              X['oof_fc'])
    to_pickle(submit_path / f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
              X_test['oof_fc'])
    to_pickle(model_path / f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
              result_dict_lgb1["models"])

    #########################################################################
    # 2nd layer model
    params["seed"] = seed + 3
    params["bagging_seed"] = seed + 4
    params["feature_fraction_seed"] = seed + 5
    params["num_leaves"] = 256  # num_leaves_dict[t]
    start_time = current_time()
    bairitsu = 256 / params["num_leaves"]  # "bairitsu" (倍率) = multiplier
    n_estimators = int(15000 * bairitsu)
    if DEBUG:
        n_estimators = 5

    if TRAIN_ALL_DATA:
        print("============= 2nd layer TRAIN ALL DATA ================")
        result_dict = train_lgb_regression_alldata(
            X=X_t, X_test=X_test_t, y=y_t, params=params,
            eval_metric='group_mae', plot_feature_importance=True,
            verbose=5000, n_estimators=int(n_estimators * 1.6),
            mol_type=type_)
        X_short_test.loc[X_short_test['type'] == type_, 'prediction'] = result_dict['prediction']
        X_short_test.to_csv(submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")
    elif CV_FOLD:
        print("============= 2nd layer CV ================")
        result_dict = train_model_regression(
            X_t, X_test_t, y_t, params, folds, model_type='lgb',
            eval_metric='mae', columns=None, plot_feature_importance=True,
            model=None, verbose=1000, early_stopping_rounds=200,
            n_estimators=n_estimators, mol_type=-1, fold_group=mol_name_t)
        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]
        X_short.loc[X_short['type'] == type_, 'oof'] = result_dict['oof']
        X_short.to_csv(submit_path / f"oof_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")
        X_short_test.loc[X_short_test['type'] == type_, 'prediction'] = result_dict['prediction']
        X_short_test.to_csv(submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")
    else:
        print("============= 2nd layer hold out ================")
        result_dict = hold_out_lgb_validation(
            X=X_t, y=y_t, params=params, eval_metric='mae',
            plot_feature_importance=True, verbose=5000,
            early_stopping_rounds=200, n_estimators=n_estimators)
        result_dict["start_time"] = start_time
        result_dict["n_estimator"] = n_estimators
        result_dict["X_t_len"] = X_t.shape[0]
        result_dict["type"] = type_
        result_dict["type_name"] = type_name[type_]

        eval_result: list = result_dict["eval_result"]["valid_1"]["l1"]
        training_log_df = pd.DataFrame(eval_result,
                                       index=np.arange(len(eval_result)) + 1)
        training_log_df.columns = ["l1"]
        training_log_df.index.name = "iter"
        training_log_df.to_csv(log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{type_}.csv")
        to_pickle(model_path / f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
                  result_dict["model"])

    # to_pickle(log_path / f"result_dict_{type_}_{seed}.pkl", result_dict)
    # importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv'
    # result_dict["importance"].to_csv(importance_path, index=True)
    #
    # for type_, s in zip(X['type'].unique(), score_list):
    #     print(f"type {type_}, score: {s:0.5f}")

    if TRAIN_ALL_DATA or CV_FOLD:
        #####################################################################
        # create oof & submission file.
        sub = pd.read_csv('../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path / f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

    if CV_FOLD:
        oof_log_mae = group_mean_log_mae(X_short['target'], X_short['oof'],
                                         X_short['type'], floor=1e-9)
        print(f"oof_log_mae: {oof_log_mae}")
        df_oof = pd.DataFrame(index=train.id)
        df_oof["scalar_coupling_constant"] = X_short['oof']
        df_oof.to_csv(submit_path / f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                      index=True)
        send_message(
            f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}")
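# A hedged driver sketch for train_main: one run per (seed, coupling type).
# Not part of the original pipeline; the seed list is an assumption, and the
# globals X, X_test, y, y_fc, mol_name and folds must already be prepared.
def _example_train_all_types(seeds=(42,)):
    for seed in seeds:
        for type_ in sorted(X['type'].unique()):
            train_main(seed, type_)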
for t, s in zip(X_rgs['type'].unique(), score_list):
    print(f"type {t}, score: {s:0.5f}")

sub = pd.read_csv('../input/sample_submission.csv')
sub['scalar_coupling_constant'] = X_short_test['prediction']
sub.to_csv(
    submit_path / f'submission_nn_{DATA_VERSION}_{TRIAL_NO}_{MODEL_TYPE}_{seed}.csv',
    index=False)
print(sub.head())

oof_log_mae = group_mean_log_mae(X_short['target'], X_short['oof'],
                                 X_short['type'], floor=1e-9)
print(f"oof_log_mae: {oof_log_mae}")

train_ids = pd.read_csv('../input/train.csv')["id"].values
df_oof = pd.DataFrame(index=train_ids)
df_oof["scalar_coupling_constant"] = X_short['oof']
df_oof.to_csv(submit_path / f'oof_nn_{DATA_VERSION}_{TRIAL_NO}_{MODEL_TYPE}_{seed}.csv',
              index=True)

if not DEBUG:
    send_message(
        f"{MODEL_TYPE}: finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}")

print(f"finished. : {current_time()}")
training_log_df = pd.DataFrame(eval_result, index=np.arange(len(eval_result)) + 1)
training_log_df.columns = ["l1"]
training_log_df.index.name = "iter"
training_log_df.to_csv(log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{t}.csv")

to_pickle(model_path / f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.pkl",
          result_dict["model"])
result_dict['y_pred_valid'].to_csv(
    submit_path / f'holdout_pred_{DATA_VERSION}_{TRIAL_NO}_{t}_{seed}.csv',
    index=True)
to_pickle(log_path / f"result_dict_{t}_{params['num_leaves']}_{seed}.pkl",
          result_dict)

if TRAIN_ALL_DATA:
    #########################################################################
    # create oof & submission file.
    sub = pd.read_csv('../input/sample_submission.csv')
    sub['scalar_coupling_constant'] = X_short_test['prediction']
    sub.to_csv(submit_path / f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
               index=False)
    print(sub.head())
    send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

print(f"finished. : {current_time()}")
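# The I/O and notification helpers used throughout (to_pickle, current_time,
# send_message) are not defined in this excerpt. Minimal stand-ins, assuming
# plain pickle files and a print-only notifier in place of whatever webhook
# the author used, could look like this:
import pickle
from datetime import datetime
from pathlib import Path


def to_pickle(path: Path, obj) -> None:
    # serialize any Python object to the given path
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def current_time() -> str:
    # timestamp string used in the progress prints
    return datetime.now().strftime("%Y-%m-%d %H:%M:%S")


def send_message(msg: str) -> None:
    # stand-in for the author's notifier (e.g. a chat webhook); just print
    print(msg)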