def get_prev_train_test_data(prev_data_version=None, prev_trial_no=None):
    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')
    sample_loaded = False
    prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"

    if DEBUG:  # v003_033
        train_path = Path(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl"
        )
        test_path = Path(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl"
        )
        if train_path.exists() and test_path.exists():
            print("sample loading")
            train = unpickle(train_path)
            test = unpickle(test_path)
            sample_loaded = True
            print("sample load finish")

    if not sample_loaded:
        print("loading previous dataset")
        print("train loading")
        train: pd.DataFrame = unpickle(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J.pkl",
        )
        assert "scalar_coupling_constant" in train.columns
        print("test loading")
        test: pd.DataFrame = unpickle(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J.pkl",
        )
        print("loading finished")

    if DEBUG and not sample_loaded:
        n_sample = 5000
        print(f"sampling {n_sample} rows.")
        train = train.sample(n=n_sample)
        test = test.sample(n=n_sample)
        Path(
            f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
        ).mkdir(parents=True, exist_ok=True)
        to_pickle(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl",
            train)
        to_pickle(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl",
            test)
        print("saved.")

    ###################################################################################################
    # add additional feature for trying
    # Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
    # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
    # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
    return train, test
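# Usage sketch (not executed): load the cached feature tables from a previous
# trial. DEBUG, unpickle and to_pickle are assumed to be defined at module
# level, as they are in the surrounding scripts.
if False:
    train, test = get_prev_train_test_data(prev_data_version="v003",
                                           prev_trial_no="033")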
def func(data):
    graph_list = data["graph_list"]
    node_list = []
    for j in range(len(graph_list)):
        graph_name = graph_list[j]
        graph_name = graph_name.split("/")[-1].replace(".pickle", "")
        g = unpickle(graph_list[j])
        node_df = pd.concat([
            structure[structure.molecule_name == graph_name][
                ["molecule_name", "atom_index"]].reset_index(drop=True),
            pd.DataFrame(np.concatenate(g.node, -1),
                         columns=[f"node_{i}" for i in range(13)])
        ], axis=1)
        node_list += [node_df]
    return node_list
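# Usage sketch (not executed): feed func a chunk of pickled-graph paths and
# concatenate the per-molecule node feature frames. It assumes the
# module-level `structure` frame is already loaded; `graph_paths` and the
# glob pattern are illustrative assumptions, not part of the original script.
if False:
    import glob
    graph_paths = sorted(glob.glob("../processed/graphs/*.pickle"))
    node_dfs = func({"graph_list": graph_paths})
    node_feat_df = pd.concat(node_dfs, axis=0).reset_index(drop=True)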
####################################################################################################
# path setting
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
pca_feat_df_002 = unpickle(save_path / "pca_feat_df_002.pkl")

train, test = get_train_test_data(use_prev=True,
                                  prev_data_version="v003",
                                  prev_trial_no="033")
train = train.merge(pca_feat_df_002, on="molecule_name", how="left")
test = test.merge(pca_feat_df_002, on="molecule_name", how="left")

use_cols_revised = [
    c for c in use_cols.good_columns if c not in use_cols.remove_cols
]
use_cols_revised = [c for c in use_cols_revised if c in train.columns]
# high_importance_dict = get_high_importance_cols(data_version="v003", trial_version="045", verbose=False)

mol_name = train.molecule_name
AUGMENT = False
if AUGMENT:
    mol_name += train_.molecule_name.values.tolist()
mol_name = np.array(mol_name)

USE_PREVIOUS_DATA = False
if not USE_PREVIOUS_DATA:
    train = pd.read_csv(f'{file_folder}/train.csv')
    test = pd.read_csv(f'{file_folder}/test.csv')
    # sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]
    test_cos = unpickle("../processed/v001/test_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]
    # train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
    # test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )
    train_add = unpickle("../processed/v003/train_006.df.pkl", )
    test_add = unpickle("../processed/v003/test_006.df.pkl", )

    babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
    babel_train = pd.read_csv("../processed/v003/babel_train.csv",
                              usecols=babel_cols)
    babel_test = pd.read_csv("../processed/v003/babel_test.csv",
                             usecols=babel_cols)
# Check that all augmented input artifacts exist before loading
assert Path(f'{file_folder}/rotated_structures_71.csv').exists()
assert Path("../processed/v001/train_003.df.pkl").exists()
assert Path("../processed/v004/train_augmented_006.df.pkl").exists()
assert Path("../processed/v003/babel_aubmented_train.csv").exists()
assert Path("../processed/v003/rdkit_augmented_train.csv").exists()
assert Path("../processed/v003/ob_charges_augmented.csv").exists()

train = pd.read_csv(f'{file_folder}/train.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
# structures = pd.read_csv(f'{file_folder}/structures.csv')
structures = pd.read_csv(f'{file_folder}/rotated_structures_71.csv')
scalar_coupling_contributions = pd.read_csv(
    f'{file_folder}/scalar_coupling_contributions.csv')

train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
    "id", "f003:cos_0_1", "f003:cos_1"
]]
train_add = unpickle("../processed/v004/train_augmented_006.df.pkl", )

babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
babel_train = pd.read_csv("../processed/v003/babel_aubmented_train.csv",
                          usecols=babel_cols)

rdkit_cols = [
    'id', 'a1_degree', 'a1_hybridization', 'a1_inring', 'a1_inring3',
    'a1_inring4', 'a1_inring5', 'a1_inring6', 'a1_inring7', 'a1_inring8',
    'a1_nb_h', 'a1_nb_o', 'a1_nb_c', 'a1_nb_n', 'a1_nb_na', 'a0_nb_degree',
    'a0_nb_hybridization', 'a0_nb_inring', 'a0_nb_inring3', 'a0_nb_inring4',
    'a0_nb_inring5', 'a0_nb_inring6', 'a0_nb_inring7', 'a0_nb_inring8',
    'a0_nb_nb_h', 'a0_nb_nb_o', 'a0_nb_nb_c', 'a0_nb_nb_n', 'a0_nb_nb_na',
    'x_a0_nb', 'y_a0_nb', 'z_a0_nb', 'a1_nb_degree', 'a1_nb_hybridization',
    'a1_nb_inring', 'a1_nb_inring3', 'a1_nb_inring4', 'a1_nb_inring5',
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
train = pd.read_csv(f'{file_folder}/train.csv')
mol_name = train.molecule_name.values

if False:
    test = pd.read_csv(f'{file_folder}/test.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle(
        save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle(
        save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    train_add = unpickle(save_path / "train_006.df.pkl", )
    test_add = unpickle(save_path / "test_006.df.pkl", )

    babel_train = pd.read_csv(save_path / "babel_train.csv",
                              usecols=use_cols.babel_cols)
    babel_test = pd.read_csv(save_path / "babel_test.csv",
                             usecols=use_cols.babel_cols)

    use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
    rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                              usecols=use_cols.rdkit_cols)
    rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                             usecols=use_cols.rdkit_cols)
# In[3]:

DATA_VERSION = "v001"
TRIAL_NO = "001"

save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)

print("start loading...")
train = unpickle(save_path / "train_002.df.pkl", )
print("train loaded.")
test = unpickle(save_path / "test_002.df.pkl", )
print("test loaded.")

y = train["scalar_coupling_constant"]
train.drop("scalar_coupling_constant", axis=1, inplace=True)
train.set_index("id", inplace=True)
test.set_index("id", inplace=True)
print(f"train: {train.shape}, test: {test.shape}")

# number of rows per encoded molecule label, ordered by label
groups = unpickle(save_path / "lbl_molecule_name.pkl", )
groups = pd.Series(groups).value_counts().sort_index().values
print(f"groups: {groups.shape}")

print(train.shape, test.shape, y.shape)
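# Sketch (not executed), assuming lbl_molecule_name.pkl holds the per-row
# encoded molecule labels the counts above are derived from: the same
# molecule-wise split expressed directly with sklearn's GroupKFold.
# Illustrative, not original code.
if False:
    from sklearn.model_selection import GroupKFold
    lbl = unpickle(save_path / "lbl_molecule_name.pkl", )
    for tr_idx, val_idx in GroupKFold(n_splits=5).split(train, groups=lbl):
        X_tr, X_val = train.iloc[tr_idx], train.iloc[val_idx]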
####################################################################################################
# Data Loading
train, test = get_train_test_data(use_prev=True,
                                  prev_data_version="v004",
                                  prev_trial_no="999")

use_cols_revised = [
    c for c in use_cols.good_columns if c not in use_cols.remove_cols
]
use_cols_revised = [c for c in use_cols_revised if c in train.columns]
# high_importance_dict = get_high_importance_cols(data_version="v003", trial_version="045", verbose=False)

mol_name = train.molecule_name

if MULLKEN_OOF:
    mullken_df = unpickle("../processed/v003/mullken_0_1_train.pkl")
    mullkan_0 = mullken_df["mulliken_0"]
    mullkan_1 = mullken_df["mulliken_1"]
    mullkan_0 = mullkan_0.loc[train["id"]]
    mullkan_1 = mullkan_1.loc[train["id"]]

###################################################################################################
# final data preparation for train
X: pd.DataFrame = train[use_cols_revised].copy()
y: pd.Series = train['scalar_coupling_constant']
y_fc: pd.Series = train['fc']
X_test: pd.DataFrame = test[use_cols_revised].copy()
print(f"X.shape: {X.shape}, X_test.shape: {X_test.shape}")
# X.to_csv("../info/X_sampled.csv")  # export colnames
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'

if False:
    train = pd.read_csv(f'{file_folder}/train.csv')
    test = pd.read_csv(f'{file_folder}/test.csv')
    sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]
    test_cos = unpickle("../processed/v001/test_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]
    # train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
    # test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )
    train_add = unpickle("../processed/v003/train_006.df.pkl", )
    test_add = unpickle("../processed/v003/test_006.df.pkl", )

    babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
    babel_train = pd.read_csv("../processed/v003/babel_train.csv",
                              usecols=babel_cols)
    babel_test = pd.read_csv("../processed/v003/babel_test.csv",
                             usecols=babel_cols)
####################################################################################################
# path setting
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
# train, test = get_train_test_data(use_prev=True, prev_data_version="v003", prev_trial_no="078")
train = unpickle(
    "../processed/v003/v003_078/train_compact_v003_078_yiemon_123J_HnJ_H123J.pkl")
test = unpickle(
    "../processed/v003/v003_078/test_compact_v003_078_yiemon_123J_HnJ_H123J.pkl")

# restore the raw atom indices, which the compact pickles do not carry
train_ = pd.read_csv("../input/train.csv")
test_ = pd.read_csv("../input/test.csv")
train["atom_index_0"] = train_["atom_index_0"].values
train["atom_index_1"] = train_["atom_index_1"].values
test["atom_index_0"] = test_["atom_index_0"].values
test["atom_index_1"] = test_["atom_index_1"].values
del train_
del test_
gc.collect()

print(f"test.shape: {test.shape}")
print(f"train.shape: {train.shape}")
print("train & test data loaded")
####################################################################################################
# Data Loading
prev_data_version = "v003"
prev_trial_no = "033"
prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"

train_data_path = f"{prev_folder}/X_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"
test_data_path = f"{prev_folder}/X_test_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"
y_data_path = f"{prev_folder}/y_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"
mol_name_data_path = f"{prev_folder}/mol_name_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"

if Path(train_data_path).exists() and Path(test_data_path).exists() and \
        Path(y_data_path).exists() and Path(mol_name_data_path).exists():
    print("loading existing files")
    X_rgs = unpickle(train_data_path)
    X_test_rgs = unpickle(test_data_path)
    y = unpickle(y_data_path)
    mol_name = unpickle(mol_name_data_path)
    print("loaded existing files")
else:
    print("gathering files for model train.")
    train, test = get_prev_train_test_data(prev_data_version=prev_data_version,
                                           prev_trial_no=prev_trial_no)
    use_cols_revised = [
        c for c in use_cols.good_columns if c not in use_cols.remove_cols
    ]
    use_cols_revised = [c for c in use_cols_revised if c in train.columns]
    # high_importance_dict = get_high_importance_cols(data_version="v003", trial_version="045", verbose=False)
    mol_name = train.molecule_name
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
train_path = save_path / "train_concat_xxx"
test_path = save_path / "test_concat_xxx"

if train_path.exists() and test_path.exists():
    train = unpickle(train_path)
    test = unpickle(test_path)
else:
    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')
    test = pd.read_csv(f'{file_folder}/test.csv')
    sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle("../processed/v001/train_003.df.pkl",
                         )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle("../processed/v001/test_003.df.pkl",
                        )[["id", "f003:cos_0_1", "f003:cos_1"]]
    # train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
    # test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )
        return self._model.best_iteration


# In[3]:

DATA_VERSION = "v001"
TRIAL_NO = "001"

save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)

print("start loading...")
train = unpickle(save_path / "train_002.df.pkl", )
print("train loaded.")
test = unpickle(save_path / "test_002.df.pkl", )
print("test loaded.")

y = train["scalar_coupling_constant"]
train.drop("scalar_coupling_constant", axis=1, inplace=True)
train.set_index("id", inplace=True)
test.set_index("id", inplace=True)
print(train.shape, test.shape, y.shape)

categorical = [
    'atom_index_0', 'atom_index_1', 'atom_1', 'atom_0', 'type_0', 'type'
]

lgbm_params = {
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)

test_id = np.load("../input/test_id.npy")
print(f"test_id.shape: {test_id.shape}")

print("start loading...")
train1 = unpickle(save_path / "train_002.df.pkl", )
train2 = unpickle(save_path / "train_003.df.pkl", )
train3 = unpickle(save_path / "train_004.df.pkl", )
train = train1.merge(train2, on="id", how="left").merge(train3, on="id", how="left")
assert train.shape[0] == train1.shape[0], f"{train.shape[0]}, {train1.shape[0]}"
print(f"train.shape: {train.shape}")
del train1, train2
gc.collect()
print("train loaded.")

test1 = unpickle(save_path / "test_002.df.pkl", )
test2 = unpickle(save_path / "test_003.df.pkl", )
test3 = unpickle(save_path / "test_004.df.pkl", )
test = test1.merge(test2, on="id", how="left").merge(test3, on="id", how="left")
assert test.shape[0] == test1.shape[0], f"{test.shape[0]}, {test1.shape[0]}"
print(f"test.shape: {test.shape}")
                                          n_estimators=n_estimators,
                                          fold_group=None)
    X['oof_fc'] = result_dict_lgb1['oof']
    X_test['oof_fc'] = result_dict_lgb1['prediction']
    to_pickle(
        submit_path / f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
        X['oof_fc'])
    to_pickle(
        submit_path / f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
        X_test['oof_fc'])
    to_pickle(
        model_path / f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
        result_dict_lgb1["models"])
else:
    X['oof_fc'] = unpickle("../submit/v003_036/train_oof_fc_v003_036_21.pkl")
    X_test['oof_fc'] = unpickle("../submit/v003_036/test_oof_fc_v003_036_21.pkl")
    # X['oof_fc'] = unpickle(f"../submit/{DATA_VERSION}_019/train_oof_fc_{DATA_VERSION}_019.pkl", )
    # X_test['oof_fc'] = unpickle(f"../submit/{DATA_VERSION}_019/test_oof_fc_{DATA_VERSION}_019.pkl", )

#########################################################################################################
# 2nd layer model
X_short = pd.DataFrame({
    'ind': list(X.index),
    'type': X['type'].values,
    'oof': [0] * len(X),
    'target': y.values
})
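# Sketch (not executed) of the per-type second stage that X_short typically
# feeds: fit one model per coupling type on the rows of that type, then write
# the type-wise oof/test predictions back into X_short. The loop body is an
# illustrative assumption, not the original code.
if False:
    for t in X_short['type'].unique():
        idx = (X_short['type'] == t).values
        # X_t, y_t = X[idx], X_short.loc[idx, 'target']
        # ... fit a type-specific model here, then:
        # X_short.loc[idx, 'oof'] = oof_t
        pass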
import numpy as np
import pandas as pd
import os
import sys
sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle

# sklearn
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, LatentDirichletAllocation, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

df = unpickle("../processed/v003/acsf_feat.pkl")

SEED = 71
N_COMP = 5
num_clusters2 = 5

fa = FactorAnalysis(n_components=N_COMP, )
pca = PCA(n_components=N_COMP, random_state=SEED)
tsvd = TruncatedSVD(n_components=N_COMP, random_state=SEED)
ica = FastICA(n_components=N_COMP, random_state=SEED)
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=SEED)
srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=SEED)
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED)
tsne = TSNE(n_components=3, random_state=SEED)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)
mid_path = Path(f"../mid/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
mid_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
print("start data loading")
# train = unpickle("../processed/v003/v003_098/train_compact_v003_098.pkl")
# test = unpickle("../processed/v003/v003_098/test_compact_v003_098.pkl")
train = unpickle(
    "../processed/v003/v003_104/train_compact_v003_104_compact.pkl")
test = unpickle("../processed/v003/v003_104/test_compact_v003_104_compact.pkl")

train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv(
    '../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

# feat_train = unpickle("../processed/v003/atom_3J_substituents1_train_na.pkl")
# feat_test = unpickle("../processed/v003/atom_3J_substituents1_test_na.pkl")
# train = pd.concat([train, feat_train], axis=1)
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)
mid_path = Path(f"../mid/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
mid_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
# train, test = get_train_test_data(use_prev=True, prev_data_version="v003", prev_trial_no="078")
# train = unpickle("../processed/v003/v003_091/train_compact_v003_091.pkl")
# test = unpickle("../processed/v003/v003_091/test_compact_v003_091.pkl")
train = unpickle("../processed/v003/v003_098/train_compact_v003_098.pkl")
test = unpickle("../processed/v003/v003_098/test_compact_v003_098.pkl")

train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv(
    '../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

# seg_submolecule_fp_maccs_train = unpickle("../processed/v003/seg_submolecule_fp_maccs_train.pkl")
# seg_submolecule_fp_maccs_test = unpickle("../processed/v003/seg_submolecule_fp_maccs_test.pkl")
#
def train_model_regression(X, X_test, y, params, folds, model_type='lgb',
                           eval_metric='mae', columns=None,
                           plot_feature_importance=False, model=None,
                           verbose=10000, early_stopping_rounds=200,
                           n_estimators=50000, mol_type=-1, fold_group=None,
                           skip_folds=None, phase_mark="", skipped_mark=[]):
    """
    A function to train a variety of regression models.
    Returns a dictionary with oof predictions, test predictions, scores and,
    if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    """
    assert isinstance(skip_folds, list) or skip_folds is None
    print(f"skip_folds: {skip_folds}")

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))
    # averaged predictions on test data
    prediction = np.zeros(len(X_test))
    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups=fold_group)):
        if skip_folds is not None and fold_n in skip_folds and phase_mark in skipped_mark:
            # resume from cached per-fold artifacts instead of retraining
            print(f'Fold {fold_n + 1} is skipped!!! at {time.ctime()}')
            oof = unpickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", )
            y_pred = unpickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", )
            model = unpickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", )
            fold_importance = unpickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", )
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
            prediction += y_pred
            model_list += [model]
            continue

        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=n_estimators,
                                      n_jobs=-1, importance_type='gain')
            print(model)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)
            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            params["objective"] = "reg:linear"
            params["eval_metric"] = metrics_dict[eval_metric]['lgb_metric_name']
            model = xgb.train(dtrain=train_data, num_boost_round=20000,
                              evals=watchlist, early_stopping_rounds=200,
                              verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)
            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                      cat_features=[], use_best_model=True, verbose=False)
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            try:
                fold_importance.to_csv(mid_path / f"importance_cv_{fold_n}.csv")
            except Exception as e:
                print("failed to save importance...")
                print(e)
            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)

        model_list += [model]

        try:
            to_pickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", oof)
            to_pickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", y_pred)
            to_pickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", model)
            to_pickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", fold_importance)
        except Exception as e:
            print("failed to save intermediate data...")
            print(e)

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits

    try:
        cv_score_msg = (f'{DATA_VERSION}_{TRIAL_NO}' +
                        ' CV mean score: {0:.4f}, std: {1:.4f}.'.format(
                            np.mean(scores), np.std(scores)))
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores
    return result_dict
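# Usage sketch (not executed): a molecule-grouped CV run of the trainer above,
# as the surrounding scripts do. lgbm_params, X, X_test, y and mol_name come
# from those scripts; n_splits=5 is an illustrative assumption.
if False:
    from sklearn.model_selection import GroupKFold
    folds = GroupKFold(n_splits=5)
    result_dict_lgb = train_model_regression(
        X, X_test, y, lgbm_params, folds,
        model_type='lgb', eval_metric='group_mae',
        plot_feature_importance=True, fold_group=mol_name)
    oof, prediction = result_dict_lgb['oof'], result_dict_lgb['prediction']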
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
train = pd.read_csv(f'{file_folder}/train.csv')
mol_name = train.molecule_name.values

if True:
    test = pd.read_csv(f'{file_folder}/test.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle(
        save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle(
        save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    train_add = unpickle(save_path / "train_006.df.pkl", )
    test_add = unpickle(save_path / "test_006.df.pkl", )

    babel_train = pd.read_csv(save_path / "babel_train.csv",
                              usecols=use_cols.babel_cols)
    babel_test = pd.read_csv(save_path / "babel_test.csv",
                             usecols=use_cols.babel_cols)

    use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
    rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                              usecols=use_cols.rdkit_cols)
    rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                             usecols=use_cols.rdkit_cols)
####################################################################################################
# Data Loading
train, test = get_train_test_data(use_prev=True,
                                  prev_data_version="v003",
                                  prev_trial_no="078")


def map_acsf(df, acsf_df, atom_idx):
    # join the per-atom ACSF features onto one endpoint of the coupling pair
    df = pd.merge(df, acsf_df, how='left',
                  left_on=['molecule_name', f'atom_index_{atom_idx}'],
                  right_on=['molecule_name', 'atom_index'])
    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={c: f"{c}_{atom_idx}" for c in acsf_df.columns[2:]})
    return df


acsf_df = unpickle("../processed/v003/acsf_feat.pkl")
print("acsf_train 0")
train = map_acsf(train, acsf_df, 0)
print("acsf_train 1")
train = map_acsf(train, acsf_df, 1)
print("acsf_test 0")
test = map_acsf(test, acsf_df, 0)
print("acsf_test 1")
test = map_acsf(test, acsf_df, 1)
print("acsf finished")

# seg_H1J_bond_extension1_train = unpickle("../processed/v003/seg_H1J_bond_extension1_train.pkl")
# seg_H1J_bond_extension1_test = unpickle("../processed/v003/seg_H1J_bond_extension1_test.pkl")
# train = pd.concat([train, seg_H1J_bond_extension1_train], axis=1)
# test = pd.concat([test, seg_H1J_bond_extension1_test], axis=1)
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')
scalar_coupling_contributions = pd.read_csv(
    f'{file_folder}/scalar_coupling_contributions.csv')

train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
    "id", "f003:cos_0_1", "f003:cos_1"
]]
test_cos = unpickle("../processed/v001/test_003.df.pkl", )[[
    "id", "f003:cos_0_1", "f003:cos_1"
]]
# train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
# test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )
train_add = unpickle("../processed/v003/train_006.df.pkl", )
test_add = unpickle("../processed/v003/test_006.df.pkl", )

babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
babel_train = pd.read_csv("../processed/v003/babel_train.csv",
                          usecols=babel_cols)
babel_test = pd.read_csv("../processed/v003/babel_test.csv",
                         usecols=babel_cols)
####################################################################################################
# path setting
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
# train, test = get_train_test_data(use_prev=True, prev_data_version="v003", prev_trial_no="078")
train = unpickle("../processed/v003/v003_091/train_compact_v003_091.pkl")
test = unpickle("../processed/v003/v003_091/test_compact_v003_091.pkl")

train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv(
    '../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

seg_submolecule_fp_maccs_train = unpickle("../processed/v003/seg_submolecule_fp_maccs_train.pkl")
seg_submolecule_fp_maccs_test = unpickle("../processed/v003/seg_submolecule_fp_maccs_test.pkl")
train = pd.concat([train, seg_submolecule_fp_maccs_train], axis=1)
def get_train_test_data(use_prev=False, prev_data_version=None, prev_trial_no=None):
    if use_prev:
        assert prev_data_version is not None
        assert prev_trial_no is not None

    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')

    if not use_prev:
        test = pd.read_csv(f'{file_folder}/test.csv')
        structures = pd.read_csv(f'{file_folder}/structures.csv')
        scalar_coupling_contributions = pd.read_csv(
            f'{file_folder}/scalar_coupling_contributions.csv')

        # train_cos = unpickle(save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
        # test_cos = unpickle(save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
        train_add = unpickle(save_path / "train_006.df.pkl", )
        test_add = unpickle(save_path / "test_006.df.pkl", )

        babel_train = pd.read_csv(save_path / "babel_train.csv",
                                  usecols=use_cols.babel_cols)
        babel_test = pd.read_csv(save_path / "babel_test.csv",
                                 usecols=use_cols.babel_cols)

        use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
        rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                                  usecols=use_cols.rdkit_cols)
        rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                                 usecols=use_cols.rdkit_cols)

        coulomb_train = pd.read_csv(save_path / "coulomb_interaction_train.csv")
        coulomb_test = pd.read_csv(save_path / "coulomb_interaction_test.csv")
        bond_calc_train = unpickle(save_path / "bond_calc_feat_train.pkl")
        bond_calc_test = unpickle(save_path / "bond_calc_feat_test.pkl")
        ob_charges = pd.read_csv(save_path / "ob_charges.csv", index_col=0)
        tda_radius_df = pd.read_csv(save_path / "tda_radius_df.csv", index_col=0)
        tda_radius_df_03 = pd.read_csv(save_path / "tda_radius_df_v003.csv", index_col=0)
        pca_feat = unpickle(save_path / "pca_feat_df.pkl")

        ####################################################################################################
        # Feature Engineering
        train = pd.merge(
            train, scalar_coupling_contributions, how='left',
            left_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
            right_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

        train = map_atom_info(train, 0, structures)
        train = map_atom_info(train, 1, structures)
        test = map_atom_info(test, 0, structures)
        test = map_atom_info(test, 1, structures)

        train_p_0 = train[['x_0', 'y_0', 'z_0']].values
        train_p_1 = train[['x_1', 'y_1', 'z_1']].values
        test_p_0 = test[['x_0', 'y_0', 'z_0']].values
        test_p_1 = test[['x_1', 'y_1', 'z_1']].values

        train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
        test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
        train['dist_x'] = (train['x_0'] - train['x_1'])**2
        test['dist_x'] = (test['x_0'] - test['x_1'])**2
        train['dist_y'] = (train['y_0'] - train['y_1'])**2
        test['dist_y'] = (test['y_0'] - test['y_1'])**2
        train['dist_z'] = (train['z_0'] - train['z_1'])**2
        test['dist_z'] = (test['z_0'] - test['z_1'])**2

        train['type_0'] = train['type'].apply(lambda x: x[0])
        test['type_0'] = test['type'].apply(lambda x: x[0])
        train['abs_dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1, ord=1)
        test['abs_dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1, ord=1)

        dist12('dist_xy', 'x', 'y')
        dist12('dist_xz', 'x', 'z')
        dist12('dist_yz', 'y', 'z')

        atom_count = structures.groupby(['molecule_name', 'atom']).size().unstack(fill_value=0)
        train = pd.merge(train, atom_count, how='left',
                         left_on='molecule_name', right_on='molecule_name')
        test = pd.merge(test, atom_count, how='left',
                        left_on='molecule_name', right_on='molecule_name')

        train = create_features(train)
        test = create_features(test)

        angle_df_train, angle_df_test = angle_feature_conv(structures)
        train = train.merge(angle_df_train, on="id", how="left")
        test = test.merge(angle_df_test, on="id", how="left")

        train = train.merge(train_add, on="id", how="left")
        test = test.merge(test_add, on="id", how="left")
        # train = train.merge(train_cos, on="id", how="left")
        # test = test.merge(test_cos, on="id", how="left")
        train = train.merge(babel_train, on="id", how="left")
        test = test.merge(babel_test, on="id", how="left")
        train = train.merge(rdkit_train, on="id", how="left")
        test = test.merge(rdkit_test, on="id", how="left")
        train = train.merge(coulomb_train, on="id", how="left")
        test = test.merge(coulomb_test, on="id", how="left")
        train = train.merge(bond_calc_train, on="id", how="left")
        test = test.merge(bond_calc_test, on="id", how="left")
        train = train.merge(tda_radius_df, on="molecule_name", how="left")
        test = test.merge(tda_radius_df, on="molecule_name", how="left")
        train = train.merge(tda_radius_df_03, on="molecule_name", how="left")
        test = test.merge(tda_radius_df_03, on="molecule_name", how="left")
        train = train.merge(pca_feat, on="molecule_name", how="left")
        test = test.merge(pca_feat, on="molecule_name", how="left")

        train = map_ob_charges(train, ob_charges, 0)
        train = map_ob_charges(train, ob_charges, 1)
        test = map_ob_charges(test, ob_charges, 0)
        test = map_ob_charges(test, ob_charges, 1)

        train = reduce_mem_usage(train)
        test = reduce_mem_usage(test)

        for f in ['atom_1', 'type_0', 'type']:
            if f in use_cols.good_columns:
                lbl = LabelEncoder()
                lbl.fit(list(train[f].values) + list(test[f].values))
                train[f] = lbl.transform(list(train[f].values))
                test[f] = lbl.transform(list(test[f].values))

        Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
        to_pickle(
            save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            train)
        to_pickle(
            save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            test)
    else:
        sample_loaded = False
        prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"

        if DEBUG:  # v003_033
            train_path = Path(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )
            test_path = Path(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )
            if train_path.exists() and test_path.exists():
                print("sample loading")
                train = unpickle(train_path)
                test = unpickle(test_path)
                sample_loaded = True
                print("sample load finish")

        if not sample_loaded:
            print("loading previous dataset")
            print("train loading")
            train: pd.DataFrame = unpickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            assert "scalar_coupling_constant" in train.columns
            print("test loading")
            test: pd.DataFrame = unpickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            print("loading finished")

        if DEBUG and not sample_loaded:
            n_sample = 5000
            print(f"sampling {n_sample} rows.")
            train = train.sample(n=n_sample)
            test = test.sample(n=n_sample)
            Path(
                f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
            ).mkdir(parents=True, exist_ok=True)
            to_pickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                train)
            to_pickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                test)
            print("saved.")

    ###################################################################################################
    # add additional feature for trying
    # Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
    # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
    # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
    return train, test
# coding: utf-8
import pandas as pd
import sys
sys.path.append('..')
from lib.utils import current_time, unpickle, to_pickle

# sklearn
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

df = unpickle("../processed/v003/mol_vec_df.pkl").set_index("molecule_name")

SEED = 71
N_COMP = 10
num_clusters2 = 10

fa = FactorAnalysis(n_components=N_COMP, )
pca = PCA(n_components=N_COMP, random_state=SEED)
tsvd = TruncatedSVD(n_components=N_COMP, random_state=SEED)
ica = FastICA(n_components=N_COMP, random_state=SEED)
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=SEED)
srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=SEED)
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED)
tsne = TSNE(n_components=3, random_state=SEED)
ss = StandardScaler()
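# Sketch (not executed) of how these components are typically applied:
# standardize, fit-transform each projector, and collect named columns per
# molecule. The column names and fillna(0) are illustrative assumptions.
if False:
    X_std = ss.fit_transform(df.fillna(0))
    pca_feat_df = pd.DataFrame(pca.fit_transform(X_std), index=df.index,
                               columns=[f"mol_pca_{i}" for i in range(N_COMP)])
    tsvd_feat_df = pd.DataFrame(tsvd.fit_transform(X_std), index=df.index,
                                columns=[f"mol_tsvd_{i}" for i in range(N_COMP)])
    mbkm_label = mbkm.fit_predict(X_std)  # cluster id per molecule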