srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=SEED) mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED) tsne = TSNE(n_components=3, random_state=SEED) ss = StandardScaler() df_ss = pd.DataFrame(ss.fit_transform(df.iloc[:, 2:]), columns=df.columns[2:]) decomp_cols = [] comp_results = [] comp_names = ["fa", "pca", "tsvd", "ica", "grp", "srp", "mbkm"] #, "tsne"] # removing tsne for name, transform in zip(comp_names, [fa, pca, tsvd, ica, grp, srp, mbkm, tsne]): print(current_time(), "{} converting...".format(name), flush=True) n_components = N_COMP if name == 'mbkm': n_components = num_clusters2 elif name == "tsne": n_components = 2 df_results = pd.DataFrame(transform.fit_transform(df_ss)) decomp_col = ["{0}_{1:02d}".format(name, i) for i in range(n_components)] df_results.columns = decomp_col decomp_cols.extend(decomp_col) df_results.reset_index(inplace=True) del df_results['index'] comp_results.append(df_results) comp_results_df = pd.concat(comp_results, axis=1) comp_results_df = pd.concat([
print(f"X['type'].unique(): {X['type'].unique()}") for t in X['type'].unique(): #if seed==current_seed and t in [0, 3, 1, 4] : continue # [0, 3, 1, 4, 2, 6] print(f'{current_time()} Training of type {t} / {X["type"].unique()}') X_t = X.loc[X['type'] == t] X_test_t = X_test.loc[X_test['type'] == t] y_t = X_short.loc[X_short['type'] == t, 'target'] mol_name_t = mol_name.loc[X['type'] == t][ X_t.index] if GROUP_K_FOLD else None print( f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}" ) params["num_leaves"] = 256 # num_leaves_dict[t] start_time = current_time() bairitsu = 256 / params["num_leaves"] n_estimators = int(15000 * bairitsu) if DEBUG: n_estimators = 5 if TRAIN_ALL_DATA: print("============= 2nd layer TRIAN ALL DATA ================") if t == 0: print("if t==0, then not using mullkan feat.") X_t = X_t.drop(["oof_mullkan_0", "oof_mullkan_1"], axis=1) X_test_t = X_test_t.drop(["oof_mullkan_0", "oof_mullkan_1"], axis=1) result_dict = train_lgb_regression_alldata(
def _build_lgb_params(seed):
    """Return the LightGBM parameter dict for the first stage, seeded from ``seed``."""
    return {
        # num_leaves is set explicitly at the end (an earlier inline note used 128).
        'min_child_samples': 79,
        'objective': 'regression',
        'max_depth': -1,  # earlier inline note: 9
        'learning_rate': 0.2,
        "boosting_type": "gbdt",
        "subsample_freq": 1,
        "subsample": 0.9,
        "metric": 'mae',
        "verbosity": -1,
        'reg_alpha': 0.1,
        'reg_lambda': 0.3,
        'colsample_bytree': 1.0,
        'num_threads': -1,
        "seed": seed,
        "bagging_seed": seed + 1,
        "feature_fraction_seed": seed + 2,
        "num_leaves": 256,
    }


def _record_run_metadata(result_dict, start_time, n_estimators, n_rows, type_):
    """Attach bookkeeping fields describing one second-stage run to ``result_dict``."""
    result_dict["start_time"] = start_time
    result_dict["n_estimator"] = n_estimators
    result_dict["X_t_len"] = n_rows
    result_dict["type"] = type_
    result_dict["type_name"] = type_name[type_]


def train_main(seed, type_):
    """Run the two-stage LightGBM training for the rows of one ``type`` value.

    Stage 1 fits ``train_model_regression`` on the ``fc`` target (``y_fc``),
    writes its out-of-fold / test predictions into ``X['oof_fc']`` and
    ``X_test['oof_fc']`` and pickles them together with the fitted models.

    Stage 2 fits the final target (``y``) in one of three modes selected by
    module-level flags:

    * ``TRAIN_ALL_DATA``: ``train_lgb_regression_alldata``; writes the test
      predictions CSV.
    * ``CV_FOLD``: ``train_model_regression``; writes OOF and test
      prediction CSVs.
    * neither: ``hold_out_lgb_validation``; writes the training log CSV and
      pickles the model.

    When ``TRAIN_ALL_DATA`` or ``CV_FOLD`` is set a submission CSV is written,
    and when ``CV_FOLD`` is set the OOF score is computed with
    ``group_mean_log_mae`` and the OOF CSV is written.

    The function relies on module-level state that is not passed in: the
    frames/series ``X``, ``X_test``, ``y``, ``y_fc``, ``mol_name``, ``train``,
    the ``folds`` splitter, ``type_name``, the output directories
    (``submit_path``, ``model_path``, ``log_path``), the identifiers
    ``DATA_VERSION`` / ``TRIAL_NO`` and the flags ``DEBUG``,
    ``GROUP_K_FOLD``, ``TRAIN_ALL_DATA`` and ``CV_FOLD``.

    Args:
        seed: Base random seed. Stage 1 uses ``seed``, ``seed + 1`` and
            ``seed + 2``; stage 2 uses ``seed + 3`` to ``seed + 5``.
        type_: Value of the ``type`` column selecting the rows to train on;
            also used as a key into ``type_name``.

    Side effects:
        Adds/overwrites the ``oof_fc`` column on the module-level ``X`` and
        ``X_test``, writes CSV/pickle files, prints progress and calls
        ``send_message``.
    """
    print(f"==================== seed: {seed} ====================")
    params = _build_lgb_params(seed)

    # NOTE: 5 rounds is what the code currently runs; the inline note next to
    # the original assignment gave 10000 as the alternative value.
    n_estimators = 5
    if DEBUG:
        n_estimators = 5

    # Compact frames holding the targets and the slots for predictions.
    X_short = pd.DataFrame({
        'ind': list(X.index),
        'type': X['type'].values,
        'oof': [0] * len(X),
        'target': y.values,
        'fc': y_fc.values,
    })
    X_short_test = pd.DataFrame({
        'ind': list(X_test.index),
        'type': X_test['type'].values,
        'prediction': [0] * len(X_test),
    })

    print(f'{current_time()} Training of type {type_} / {X["type"].unique()}')
    X_t = X.loc[X['type'] == type_]
    X_test_t = X_test.loc[X_test['type'] == type_]
    y_fc_t = X_short.loc[X_short['type'] == type_, 'fc']
    y_t = X_short.loc[X_short['type'] == type_, 'target']
    mol_name_t = mol_name.loc[X['type'] == type_][
        X_t.index] if GROUP_K_FOLD else None
    print(
        f"X_t.shape: {X_t.shape}, X_test_t.shape: {X_test_t.shape}, y_t.shape: {y_t.shape}"
    )

    ####################################################################
    # Stage 1: fc
    print("=" * 30 + " fc " + "=" * 30)
    result_dict_lgb1 = train_model_regression(
        X=X_t,
        X_test=X_test_t,
        y=y_fc_t,
        params=params,
        folds=folds,
        model_type='lgb',
        eval_metric='group_mae',
        plot_feature_importance=False,
        verbose=1000,
        early_stopping_rounds=200,
        n_estimators=n_estimators,
        fold_group=mol_name.values,
    )
    # NOTE(review): X_t / X_test_t were sliced before these columns are
    # written, so the stage-2 fit below only sees an 'oof_fc' feature if X
    # already carried that column when this function was called. Confirm
    # that this is intended.
    X['oof_fc'] = result_dict_lgb1['oof']
    X_test['oof_fc'] = result_dict_lgb1['prediction']
    to_pickle(
        submit_path /
        f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X['oof_fc'])
    to_pickle(
        submit_path /
        f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        X_test['oof_fc'])
    to_pickle(
        model_path /
        f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
        result_dict_lgb1["models"])

    ####################################################################
    # Stage 2: 2nd layer model (same params dict, re-seeded in place)
    params["seed"] = seed + 3
    params["bagging_seed"] = seed + 4
    params["feature_fraction_seed"] = seed + 5
    params["num_leaves"] = 256  # num_leaves_dict[t]
    start_time = current_time()
    # NOTE: the inline note next to the original assignment gave
    # int(15000 * (256 / num_leaves)) as the alternative value.
    n_estimators = 5
    if DEBUG:
        n_estimators = 5

    # Shared by the ALL_DATA and CV branches.
    test_rows = X_short_test['type'] == type_
    sub_csv = submit_path / f"sub_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv"

    if TRAIN_ALL_DATA:
        print("============= 2nd layer TRIAN ALL DATA ================")
        result_dict = train_lgb_regression_alldata(
            X=X_t,
            X_test=X_test_t,
            y=y_t,
            params=params,
            eval_metric='group_mae',
            plot_feature_importance=True,
            verbose=5000,
            n_estimators=int(n_estimators * 1.6),
            mol_type=type_,
        )
        X_short_test.loc[test_rows, 'prediction'] = result_dict['prediction']
        X_short_test.to_csv(sub_csv)
    elif CV_FOLD:
        print("============= 2nd layer CV ================")
        result_dict = train_model_regression(
            X_t,
            X_test_t,
            y_t,
            params,
            folds,
            model_type='lgb',
            eval_metric='mae',
            columns=None,
            plot_feature_importance=True,
            model=None,
            verbose=1000,
            early_stopping_rounds=200,
            n_estimators=n_estimators,
            mol_type=-1,
            fold_group=mol_name_t,
        )
        _record_run_metadata(result_dict, start_time, n_estimators,
                             X_t.shape[0], type_)
        X_short.loc[X_short['type'] == type_, 'oof'] = result_dict['oof']
        X_short.to_csv(
            submit_path / f"oof_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv")
        X_short_test.loc[test_rows, 'prediction'] = result_dict['prediction']
        X_short_test.to_csv(sub_csv)
    else:
        print("============= 2nd layer hold out ================")
        result_dict = hold_out_lgb_validation(
            X=X_t,
            y=y_t,
            params=params,
            eval_metric='mae',
            plot_feature_importance=True,
            verbose=5000,
            early_stopping_rounds=200,
            n_estimators=n_estimators,
        )
        _record_run_metadata(result_dict, start_time, n_estimators,
                             X_t.shape[0], type_)

        # Per-iteration validation L1, indexed from 1.
        eval_result: list = result_dict["eval_result"]["valid_1"]["l1"]
        training_log_df: pd.DataFrame = pd.DataFrame(
            eval_result, index=np.arange(len(eval_result)) + 1)
        training_log_df.columns = ["l1"]
        training_log_df.index.name = "iter"
        # NOTE(review): unlike the other artifacts, this file name has no
        # seed component, so runs with different seeds write the same path.
        training_log_df.to_csv(
            log_path / f"train_log_{DATA_VERSION}_{TRIAL_NO}_{type_}.csv")
        to_pickle(
            model_path /
            f"hold_out_model_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.pkl",
            result_dict["model"])

    # Disabled diagnostics kept from the original:
    # to_pickle(log_path / f"result_dict_{type_}_{seed}.pkl", result_dict)
    # importance_path = log_path / f'importance_{DATA_VERSION}_{TRIAL_NO}_{type_}_{seed}.csv'
    # result_dict["importance"].to_csv(importance_path, index=True)
    #
    # for type_, s in zip(X['type'].unique(), score_list):
    #     print(f"type {type_}, score: {s:0.5f}")

    if TRAIN_ALL_DATA or CV_FOLD:
        ################################################################
        # create oof & submission file.
        sub = pd.read_csv('../input/sample_submission.csv')
        sub['scalar_coupling_constant'] = X_short_test['prediction']
        sub.to_csv(submit_path /
                   f'submission_t_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                   index=False)
        print(sub.head())
        send_message(f"finish all_data train_{DATA_VERSION}_{TRIAL_NO}_{seed}")

    if CV_FOLD:
        oof_log_mae = group_mean_log_mae(X_short['target'],
                                         X_short['oof'],
                                         X_short['type'],
                                         floor=1e-9)
        print(f"oof_log_mae: {oof_log_mae}")
        df_oof = pd.DataFrame(index=train.id)
        df_oof["scalar_coupling_constant"] = X_short['oof']
        df_oof.to_csv(submit_path /
                      f'oof_{DATA_VERSION}_{TRIAL_NO}_{seed}.csv',
                      index=True)
        send_message(
            f"finish train_{DATA_VERSION}_{TRIAL_NO}_{seed}, oof_log_mae: {oof_log_mae}"
        )