def make_relative_position():
    with utils.timer("read data"):
        train_df = compe_data.read_train()
        test_df = compe_data.read_test()
        st_df = compe_data.read_structures()

    # merge structure to train and test
    with utils.timer("merge atom_st"):
        train_df = map_atom_info(train_df, st_df, 0)
        train_df = map_atom_info(train_df, st_df, 1)
        test_df = map_atom_info(test_df, st_df, 0)
        test_df = map_atom_info(test_df, st_df, 1)

    # make structure dict
    #with utils.timer("make st_dict"):
    #    st_dict = make_st_dict(st_df)
    #    del st_df
    #    gc.collect()

    # train / test: each makes a relative position matrix
    train_df = each_make(train_df, st_df)
    test_df = each_make(test_df, st_df)
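
# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original pipeline): map_atom_info is called
# above but defined elsewhere. In the common CHAMPS public-kernel pattern it
# merges the structures frame onto a pair-level frame via
# (molecule_name, atom_index_{idx}). The column names and suffixing used here
# are assumptions if compe_data uses different ones.
# ---------------------------------------------------------------------------
def map_atom_info_sketch(df, structures, atom_idx):
    # join the coordinates of atom_index_{atom_idx} onto each pair row
    df = df.merge(structures,
                  how="left",
                  left_on=["molecule_name", f"atom_index_{atom_idx}"],
                  right_on=["molecule_name", "atom_index"])
    df = df.drop("atom_index", axis=1)
    # suffix the merged columns with the endpoint index (0 or 1)
    df = df.rename(columns={"atom": f"atom_{atom_idx}",
                            "x": f"x_{atom_idx}",
                            "y": f"y_{atom_idx}",
                            "z": f"z_{atom_idx}"})
    return df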
def train(train_df=None, y=None, use_feats=None, train_type=None,
          permutation=False, cat_cols=[], single_fold=False, log_dir="./",
          target_mode="target", meta_col=None):
    cat_cols = [c for c in cat_cols if c in use_feats]
    full_train = train_df[use_feats]
    train_df_for_score = train_df[["type", "scalar_coupling_constant"]]

    logger = logging.getLogger(__name__)
    log_callback = log_evaluation(logger, period=n_log_period)

    #if target_mode == "target":
    #    lgb_param = lgb_param_target
    #else:
    #    lgb_param = lgb_param_meta
    lgb_param = get_param(train_type, meta_col)

    clfs = []
    importances = pd.DataFrame()
    oof_preds = np.zeros(len(compe_data.read_train()))  #np.zeros(len(full_train))
    oof_preds[:] = np.nan

    # save use feature
    utils.save_list(os.path.join(log_dir, "train_feats.csv"),
                    full_train.columns.tolist())

    # get validation split index list
    split_idx_list = validation.get_split_list()[train_type]

    # fold training
    logging.info("------ {} fold learning --------".format(len(split_idx_list)))
    for fold_, (trn_, val_) in enumerate(split_idx_list):
        logging.info("-----------------")
        logging.info("{}th Fold Start".format(fold_ + 1))

        if 0:  #target_mode == "target":
            train_x, train_y = index_augment(full_train.loc[trn_], y.loc[trn_])
        else:
            train_x = full_train.loc[trn_]
            train_y = y.loc[trn_]

        trn_data = lgb.Dataset(train_x,
                               label=train_y,
                               categorical_feature=cat_cols)
        val_data = lgb.Dataset(full_train.loc[val_],
                               label=y.loc[val_],
                               categorical_feature=cat_cols)

        logging.info("train shape:({0},{1})".format(full_train.shape[1], len(trn_)))

        clf = lgb.train(
            lgb_param,
            trn_data,
            n_round,
            valid_sets=[trn_data, val_data],
            verbose_eval=n_verbose,
            early_stopping_rounds=100,
            #feval = eval_f1,
            callbacks=[log_callback],
        )

        # oof prediction
        #mini_idx_srs = full_train.reset_index().iloc[full_train.loc[val_]]
        oof_preds[val_] = \
            clf.predict(full_train.loc[val_], num_iteration=clf.best_iteration)

        # permutation importance
        if permutation:
            fold_cv_score = competition_metric(train_df_for_score.loc[val_],
                                               oof_preds[val_])
            feature_util.permutation_importance(model=clf,
                                                val_x=full_train.loc[val_],
                                                val_y=train_df_for_score.loc[val_],
                                                cv_score=fold_cv_score,
                                                cols=full_train.columns,
                                                metric_func=competition_metric,
                                                suffix=train_type,
                                                output=True,
                                                out_dir=log_dir)

        # feature importance
        imp_df = pd.DataFrame()
        imp_df['feature'] = full_train.columns
        imp_df['gain'] = clf.feature_importance(importance_type="gain")
        imp_df['fold'] = fold_ + 1
        importances = pd.concat([importances, imp_df], axis=0, sort=False)

        clfs.append(clf)

        if permutation:
            # tentatively fill oof_preds at the train indices with 0 as well
            oof_preds[trn_] = 0
            break
        if single_fold:
            break

    # oof_preds is allocated at the full train size (before the per-type split),
    # so drop the NaN entries to get back to this type's size
    oof_preds = oof_preds[~np.isnan(oof_preds)]

    # total CV value
    if permutation:
        cv_score = 0
        logging.info("This is permutation importance so cv score skip!!")
    elif target_mode == "target":
        cv_score = competition_metric(train_df_for_score, oof_preds)
        logging.info("CV Score {}".format(cv_score))
    else:
        cv_score = mean_absolute_error(y.values, oof_preds)
        logging.info("CV Score {}".format(cv_score))

    # CV prediction target = the TimeValid train period
    return clfs, importances, cv_score, oof_preds
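
# ---------------------------------------------------------------------------
# Hedged sketch of the competition_metric referenced above, which is defined
# elsewhere. The CHAMPS evaluation is the mean over coupling types of
# log(MAE within that type); the 1e-9 floor guards log(0). The expected
# columns ("type", "scalar_coupling_constant") follow train_df_for_score above;
# the exact helper signature in this repo may differ.
# ---------------------------------------------------------------------------
def competition_metric_sketch(df, preds, floor=1e-9):
    # absolute errors, positionally aligned with the prediction array
    abs_err = np.abs(df["scalar_coupling_constant"].to_numpy() - np.asarray(preds))
    # per-type MAE, then average the log MAEs across types
    maes = pd.Series(abs_err).groupby(df["type"].to_numpy()).mean()
    return float(np.log(maes.clip(lower=floor)).mean())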
def learn():
    with utils.timer("load train and test data"):
        train_pair = compe_data.read_train()
        test_pair = compe_data.read_test()
        st_df = compe_data.read_structures()

    with utils.timer("make atomic data frame"):
        train_df, test_df = make_atomic_data(train_pair, test_pair, st_df)

    # set validation fold
    validation.set_fold(
        fold_type="GroupKFold",
        fold_num=settings.fold_num,
        random_state=settings.fold_seed,
        shuffle_flg=True,
    )
    # the validation split is aligned to the index of the original target
    validation.make_splits(train_pair,
                           train_pair[features.TARGET_COL],
                           group_col=features.GROUP_COL)
    validation.make_atom_splits(train_df, train_pair)

    # make feature
    with utils.timer("feature make"):
        train_df, test_df, cat_cols = atom_features.make(train_df, test_df,
                                                         train_pair, test_pair)
    pd.set_option("max_columns", 100)

    ## for debug
    train_df.head(10).to_csv(os.path.join(log_dir, "check_atom_train_df.csv"),
                             index=False)

    # predict only train feature
    # training
    with utils.timer("atomic_meta feature training"):
        meta_train_df = pd.DataFrame()
        meta_test_df = pd.DataFrame()
        meta_features = compe_data.read_mulliken_charges()  # provisional; may change
        for meta_col in ["mulliken_charge"]:
            meta_feat = meta_features[meta_col]
            logging.info("-------- learning {} ---------".format(meta_col))
            use_feats = [
                x for x in train_df.columns
                if not x in atom_features.EXCEPT_FEATURES
            ]
            train_y = meta_feat

            # training per type
            test_preds = np.zeros(len(test_df))
            clfs, importances, val_score, oof_preds = \
                atom_training.train(
                    train_df,
                    train_y,
                    use_feats=use_feats,
                    permutation=False,
                    cat_cols=cat_cols,
                    log_dir=log_dir,
                    target_mode="meta"
                )
            utils.save_importances(importances_=importances,
                                   save_dir=log_dir,
                                   prefix="meta_col_")
            for clf in clfs:
                test_preds += clf.predict(test_df.loc[:, use_feats],
                                          num_iteration=clf.best_iteration) / len(clfs)

            meta_train_df[meta_col] = oof_preds
            meta_test_df[meta_col] = test_preds

        meta_train_df["molecule_name"] = train_df["molecule_name"]
        meta_train_df["atom_index"] = train_df["atom_index"]
        meta_test_df["molecule_name"] = test_df["molecule_name"]
        meta_test_df["atom_index"] = test_df["atom_index"]

        # merge train and test_df
        meta_train_df.to_pickle("../pickle/atomic_meta_train.pkl")
        meta_test_df.to_pickle("../pickle/atomic_meta_test.pkl")
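
# ---------------------------------------------------------------------------
# Hedged usage sketch (illustrative only): how the atomic meta pickles saved
# above could be merged back onto the pair-level frames, once per endpoint
# atom. The merge keys and the suffixed output columns (mulliken_charge_0/_1)
# are assumptions; the real merge lives elsewhere in this repo.
# ---------------------------------------------------------------------------
def merge_atomic_meta_sketch(pair_df, atomic_meta_df):
    for atom_idx in (0, 1):
        # rename the per-atom columns to match this endpoint, then left-join
        pair_df = pair_df.merge(
            atomic_meta_df.rename(columns={
                "atom_index": f"atom_index_{atom_idx}",
                "mulliken_charge": f"mulliken_charge_{atom_idx}"}),
            how="left",
            on=["molecule_name", f"atom_index_{atom_idx}"])
    return pair_df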
def execute():
    with utils.timer("load train and test data"):
        train_df = compe_data.read_train()
        test_df = compe_data.read_test()
        st_df = compe_data.read_structures()

    # set validation fold
    validation.set_fold(
        fold_type="GroupKFold",
        fold_num=2,  #settings.fold_num,
        random_state=2222,
        shuffle_flg=True,
    )
    validation.make_splits(train_df,
                           train_df[features.TARGET_COL],
                           group_col=features.GROUP_COL)

    # make feature
    with utils.timer("feature make"):
        train_df, test_df, cat_cols = features.make(train_df, test_df, st_df)
        del test_df
        gc.collect()

    # predict only train feature
    # training
    #with utils.timer("reduce memory"):
    #    train_df = utils.reduce_memory(train_df)
    #    #test_df = utils.reduce_memory(test_df)

    flg_use_scc_meta = True  #False
    if flg_use_scc_meta:
        with utils.timer("meta feature training"):
            flg_meta_learn = True
            feature_selection = True
            if flg_meta_learn:
                meta_train_df = pd.DataFrame()
                meta_features = compe_data.read_scalar_coupling_contributions()
                for meta_col in ["fc", "sd", "pso", "dso"]:
                    meta_feat = meta_features[meta_col]
                    logging.info("******************************")
                    logging.info("******** learning {} *********".format(meta_col))
                    logging.info("******************************")
                    use_feats = [
                        x for x in train_df.columns
                        if not x in features.EXCEPT_FEATURES
                    ]
                    train_y = meta_feat  #train_df[features.TARGET_COL]

                    # training per type
                    types = train_df["type"].unique()
                    oof = np.zeros(len(train_df))
                    types_clfs = {}
                    types_scores = {}
                    for type_name in types:
                        logging.info("----- training type == {} -----".format(type_name))
                        type_idx = train_df.type == type_name
                        if feature_selection:
                            type_use_feats = feature_util.feature_select(
                                use_feats,
                                importance_df=pd.read_csv(
                                    f"./importance/permu_importance{type_name}.csv",
                                    names=["feature", "importance"]),
                                threshold=-0.05,
                                reverse=True,
                            )
                        else:
                            type_use_feats = use_feats.copy()
                        select_feats = features.select_type_feats(use_feats, type_name)

                        param_tuning(
                            train_df=train_df.loc[type_idx],
                            y=train_y.loc[type_idx],
                            use_feats=select_feats,
                            objective_metric=mean_absolute_error,  #competition_metric,
                            train_type=type_name,
                            type_name=meta_col + "_" + type_name,
                            cat_cols=cat_cols,
                        )
            else:
                meta_train_df = pd.read_pickle("../pickle/meta_train.pkl")

            #train_df = utils.fast_concat(train_df, meta_train_df)
            #test_df = utils.fast_concat(test_df, meta_test_df)
    """
def learn():
    with utils.timer("load train and test data"):
        train_df = compe_data.read_train()
        test_df = compe_data.read_test()
        st_df = compe_data.read_structures()

    # set validation fold
    validation.set_fold(
        fold_type="GroupKFold",
        fold_num=settings.fold_num,
        random_state=settings.fold_seed,
        shuffle_flg=True,
    )
    validation.make_splits(train_df,
                           train_df[features.TARGET_COL],
                           group_col=features.GROUP_COL)

    # make feature
    with utils.timer("feature make"):
        train_df, test_df, cat_cols = features.make(train_df, test_df, st_df)

    ## for debug
    train_df.head(30).to_csv(os.path.join(log_dir, "check_train_df.csv"),
                             index=False)

    # predict only train feature
    # training
    flg_use_scc_meta = True  #False
    if flg_use_scc_meta:
        with utils.timer("meta feature training"):
            flg_meta_learn = False  #True
            feature_selection = True
            if flg_meta_learn:
                meta_train_df = pd.DataFrame()
                meta_test_df = pd.DataFrame()
                meta_features = compe_data.read_scalar_coupling_contributions()
                for meta_col in ["fc", "sd", "pso", "dso"]:
                    meta_feat = meta_features[meta_col]
                    logging.info("******************************")
                    logging.info("******** learning {} *********".format(meta_col))
                    logging.info("******************************")
                    use_feats = [
                        x for x in train_df.columns
                        if not x in features.EXCEPT_FEATURES
                    ]
                    train_y = meta_feat  #train_df[features.TARGET_COL]

                    # training per type
                    types = train_df["type"].unique()
                    oof = np.zeros(len(train_df))
                    test_preds = np.zeros(len(test_df))
                    types_clfs = {}
                    types_scores = {}
                    for type_name in types:
                        logging.info("----- training type == {} -----".format(type_name))
                        type_idx = train_df.type == type_name
                        test_type_idx = test_df.type == type_name
                        if feature_selection:
                            type_use_feats = feature_util.feature_select(
                                use_feats,
                                importance_df=pd.read_csv(
                                    f"./importance/permu_importance{type_name}.csv",
                                    names=["feature", "importance"]),
                                threshold=-0.05,
                                reverse=True,
                            )
                        else:
                            type_use_feats = use_feats.copy()
                        select_feats = features.select_type_feats(use_feats, type_name)

                        clfs, importances, val_score, oof_preds = \
                            training.train(
                                train_df.loc[type_idx],
                                train_y.loc[type_idx],
                                use_feats=select_feats,
                                train_type=type_name,
                                permutation=False,
                                cat_cols=cat_cols,
                                log_dir=log_dir,
                                target_mode="meta",
                                meta_col=meta_col
                            )
                        types_clfs[type_name] = clfs
                        types_scores[type_name] = val_score
                        utils.save_importances(importances_=importances,
                                               save_dir=log_dir,
                                               prefix=f"{meta_col}_{type_name}")
                        oof[type_idx] = oof_preds
                        for clf in clfs:
                            test_preds[test_type_idx] += \
                                clf.predict(test_df.loc[test_type_idx, use_feats],
                                            num_iteration=clf.best_iteration) / len(clfs)

                    #total_cv = training.metric(train_df, oof)
                    #print("TotalCV = {:.5f}".format(total_cv))
                    meta_train_df[meta_col] = oof
                    meta_test_df[meta_col] = test_preds

                    logging.info("---------- meta {} types val score ----------".format(meta_col))
                    for type_name, score in types_scores.items():
                        logging.info("{0} : {1}".format(type_name, score))

                # merge train and test_df
                meta_train_df.to_pickle("../pickle/meta_train.pkl")
                meta_test_df.to_pickle("../pickle/meta_test.pkl")
            else:
                meta_train_df = pd.read_pickle("../pickle/gnn_meta_train.pkl")
                meta_test_df = pd.read_pickle("../pickle/gnn_meta_test.pkl")

            train_df = utils.fast_concat(train_df, meta_train_df)
            test_df = utils.fast_concat(test_df, meta_test_df)

    #with utils.timer("reduce memory"):
    #    train_df = utils.reduce_memory(train_df)
    #    test_df = utils.reduce_memory(test_df)

    with utils.timer("training"):
        feature_selection = True
        use_feats = [
            x for x in train_df.columns
            if not x in features.EXCEPT_FEATURES
        ]
        train_y = train_df[features.TARGET_COL]
        # training per type
        types = train_df["type"].unique()
        #types = train_df["new_type"].unique()  # hinokki-style type split
        oof = np.zeros(len(train_df))
        types_clfs = {}
        types_scores = {}
        use_feats_dict = {}
        for type_name in types:
            logging.info("----- training type == {} -----".format(type_name))
            type_idx = train_df.type == type_name
            if feature_selection:
                type_use_feats = feature_util.feature_select(
                    use_feats,
                    importance_df=pd.read_csv(
                        f"./importance/permu_importance{type_name}.csv",
                        names=["feature", "importance"]),
                    #importance_path=f"./importance/{type_name}feature_importance_summary.csv",
                    threshold=-0.05,
                    reverse=True,
                )
            else:
                type_use_feats = use_feats.copy()
            select_use_feats = features.select_type_feats(type_use_feats, type_name)
            use_feats_dict[type_name] = select_use_feats

            clfs, importances, val_score, oof_preds = \
                training.train(
                    train_df.loc[type_idx],
                    train_y.loc[type_idx],
                    use_feats=select_use_feats,
                    train_type=type_name,
                    #permutation=True,
                    permutation=False,
                    cat_cols=cat_cols,
                    log_dir=log_dir,
                )
            types_clfs[type_name] = clfs
            types_scores[type_name] = val_score
            utils.save_importances(importances_=importances,
                                   save_dir=log_dir,
                                   prefix=type_name)
            oof[type_idx] = oof_preds

            oof_df = pd.DataFrame({
                "id": train_df.loc[type_idx, "id"],
                "oof_preds": oof_preds
            })
            oof_df.to_csv(os.path.join(log_dir, "oof_{}.csv".format(type_name)),
                          index=False)

        for type_name, score in types_scores.items():
            logging.info("{0} : {1}".format(type_name, score))
        total_cv = competition_metric(train_df, oof)
        logging.info("TotalCV = {:.5f}".format(total_cv))

    del train_df
    gc.collect()

    # prediction
    with utils.timer("prediction"):
        prediction.predict(types_clfs,
                           test_df,
                           use_feats_dict=use_feats_dict,
                           val_score=total_cv,
                           log_dir=log_dir)

    return total_cv
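
# ---------------------------------------------------------------------------
# Hedged sketch of the per-type prediction step that prediction.predict
# performs above (its real signature and submission handling are not shown
# here): each type's fold models predict only that type's test rows, the fold
# predictions are averaged, and the results fill one submission-sized array.
# ---------------------------------------------------------------------------
def predict_per_type_sketch(types_clfs, test_df, use_feats_dict):
    preds = np.zeros(len(test_df))
    for type_name, clfs in types_clfs.items():
        # boolean mask of the test rows belonging to this coupling type
        mask = (test_df["type"] == type_name).to_numpy()
        feats = use_feats_dict[type_name]
        for clf in clfs:
            # average the fold models' predictions for this type's rows
            preds[mask] += clf.predict(test_df.loc[mask, feats],
                                       num_iteration=clf.best_iteration) / len(clfs)
    return preds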