Example #1
def get_prev_train_test_data(prev_data_version=None, prev_trial_no=None):

    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')

    sample_loaded = False
    prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
    if DEBUG:
        # v003_033
        train_path = Path(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl"
        )
        test_path = Path(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl"
        )

        if train_path.exists() and test_path.exists():
            print("sample loading")
            train = unpickle(train_path)
            test = unpickle(test_path)
            sample_loaded = True
            print("sample load finish")

    if not sample_loaded:
        print(f"loading previous dataest")
        print("train loading")
        train: pd.DataFrame = unpickle(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J.pkl",
        )
        assert "scalar_coupling_constant" in train.columns
        print("test loading")
        test: pd.DataFrame = unpickle(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J.pkl",
        )
        print(f"loading finished")

    if DEBUG and not sample_loaded:
        n_sample = 5000
        print(f"sampling {n_sample} rows.")
        train = train.sample(n=n_sample)
        test = test.sample(n=n_sample)
        Path(prev_folder).mkdir(parents=True, exist_ok=True)
        to_pickle(
            f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl",
            train)
        to_pickle(
            f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_yiemon_123J_sampled.pkl",
            test)
        print("saved.")

        ###################################################################################################
        # add additional feature for trying

        # Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
    return train, test
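
# Hedged usage sketch for the helper above; DEBUG, unpickle and to_pickle are
# assumed to come from the surrounding project (e.g. lib.utils), and v003/033
# is the processed dataset referenced elsewhere in this listing.
train, test = get_prev_train_test_data(prev_data_version="v003",
                                       prev_trial_no="033")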
Example #2
def func(data):
    graph_list = data["graph_list"]
    node_list = []
    for j in range(len(graph_list)):
        graph_name = graph_list[j]
        graph_name = graph_name.split("/")[-1].replace(".pickle","")
        g = unpickle(graph_list[j])
        node_df = pd.concat([structure[structure.molecule_name==graph_name][["molecule_name", "atom_index"]].reset_index(drop=True), 
                   pd.DataFrame(np.concatenate(g.node, -1), columns=[f"node_{i}" for i in range(13)])], axis=1)
        node_list += [node_df]
    return node_list
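
# Minimal sketch of driving func above; `structure` is assumed to be the global
# structures.csv DataFrame, and the path below is purely illustrative: each
# pickle is expected to hold a graph object whose .node arrays give 13
# per-atom features.
data = {"graph_list": ["../processed/v003/graphs/dsgdb9nsd_000001.pickle"]}
node_df_all = pd.concat(func(data), axis=0).reset_index(drop=True)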
####################################################################################################
# path setting
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading

pca_feat_df_002 = unpickle(save_path / "pca_feat_df_002.pkl")

train, test = get_train_test_data(use_prev=True,
                                  prev_data_version="v003",
                                  prev_trial_no="033")

train = train.merge(pca_feat_df_002, on="molecule_name", how="left")
test = test.merge(pca_feat_df_002, on="molecule_name", how="left")

use_cols_revised = [
    c for c in use_cols.good_columns if c not in use_cols.remove_cols
]
use_cols_revised = [c for c in use_cols_revised if c in train.columns]
# high_importance_dict = get_high_importance_cols(data_version="v003", trial_version="045", verbose=False)

mol_name = train.molecule_name
Example #4
AUGMENT = False
if AUGMENT:
    mol_name += train_.molecule_name.values.tolist()  # assumes mol_name is a plain list here

mol_name = np.array(mol_name)

USE_PREVIOUS_DATA = False
if not USE_PREVIOUS_DATA:
    train = pd.read_csv(f'{file_folder}/train.csv')
    test = pd.read_csv(f'{file_folder}/test.csv')
    # sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')
    train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]
    test_cos = unpickle("../processed/v001/test_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]

    # train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
    # test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )

    train_add = unpickle("../processed/v003/train_006.df.pkl", )
    test_add = unpickle("../processed/v003/test_006.df.pkl", )

    babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
    babel_train = pd.read_csv("../processed/v003/babel_train.csv",
                              usecols=babel_cols)
    babel_test = pd.read_csv("../processed/v003/babel_test.csv",
# Check exists
assert Path(f'{file_folder}/rotated_structures_71.csv').exists()
assert Path("../processed/v001/train_003.df.pkl").exists()
assert Path("../processed/v004/train_augmented_006.df.pkl").exists()
assert Path("../processed/v003/babel_aubmented_train.csv").exists()
assert Path("../processed/v003/rdkit_augmented_train.csv").exists()
assert Path("../processed/v003/ob_charges_augmented.csv").exists()

train = pd.read_csv(f'{file_folder}/train.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
#structures = pd.read_csv(f'{file_folder}/structures.csv')
structures = pd.read_csv(f'{file_folder}/rotated_structures_71.csv')
scalar_coupling_contributions = pd.read_csv(
    f'{file_folder}/scalar_coupling_contributions.csv')
train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
    "id", "f003:cos_0_1", "f003:cos_1"
]]
train_add = unpickle("../processed/v004/train_augmented_006.df.pkl", )
babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
babel_train = pd.read_csv("../processed/v003/babel_aubmented_train.csv",
                          usecols=babel_cols)

rdkit_cols = [
    'id', 'a1_degree', 'a1_hybridization', 'a1_inring', 'a1_inring3',
    'a1_inring4', 'a1_inring5', 'a1_inring6', 'a1_inring7', 'a1_inring8',
    'a1_nb_h', 'a1_nb_o', 'a1_nb_c', 'a1_nb_n', 'a1_nb_na', 'a0_nb_degree',
    'a0_nb_hybridization', 'a0_nb_inring', 'a0_nb_inring3', 'a0_nb_inring4',
    'a0_nb_inring5', 'a0_nb_inring6', 'a0_nb_inring7', 'a0_nb_inring8',
    'a0_nb_nb_h', 'a0_nb_nb_o', 'a0_nb_nb_c', 'a0_nb_nb_n', 'a0_nb_nb_na',
    'x_a0_nb', 'y_a0_nb', 'z_a0_nb', 'a1_nb_degree', 'a1_nb_hybridization',
    'a1_nb_inring', 'a1_nb_inring3', 'a1_nb_inring4', 'a1_nb_inring5',
]  # (column list truncated in the original snippet)
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
train = pd.read_csv(f'{file_folder}/train.csv')
mol_name = train.molecule_name.values

if False:
    test = pd.read_csv(f'{file_folder}/test.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle(
        save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle(
        save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

    train_add = unpickle(save_path / "train_006.df.pkl", )
    test_add = unpickle(save_path / "test_006.df.pkl", )

    babel_train = pd.read_csv(save_path / "babel_train.csv",
                              usecols=use_cols.babel_cols)
    babel_test = pd.read_csv(save_path / "babel_test.csv",
                             usecols=use_cols.babel_cols)

    use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
    rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                              usecols=use_cols.rdkit_cols)
    rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                             usecols=use_cols.rdkit_cols)
# In[3]:


DATA_VERSION = "v001"
TRIAL_NO = "001"
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)


print("start loading...")
train = unpickle(save_path/"train_002.df.pkl", )
print("train loaded.")
test  = unpickle(save_path/"test_002.df.pkl", )
print("test loaded.")
y = train["scalar_coupling_constant"]
train.drop("scalar_coupling_constant", axis=1, inplace=True)

train.set_index("id", inplace=True)
test.set_index("id", inplace=True)

print(f"train: {train.shape}, test: {test.shape}")

groups = unpickle(save_path/"lbl_molecule_name.pkl", )
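# value_counts().sort_index() converts the label-encoded molecule names into
# one row count per molecule, ordered by encoded id (presumably consumed as
# group sizes by the fold splitter).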
groups = pd.Series(groups).value_counts().sort_index().values
print(f"groups: {groups.shape}")
print(train.shape, test.shape, y.shape)
####################################################################################################
# Data Loading
train, test = get_train_test_data(use_prev=True,
                                  prev_data_version="v004",
                                  prev_trial_no="999")
use_cols_revised = [
    c for c in use_cols.good_columns if c not in use_cols.remove_cols
]
use_cols_revised = [c for c in use_cols_revised if c in train.columns]
# high_importance_dict = get_high_importance_cols(data_version="v003", trial_version="045", verbose=False)

mol_name = train.molecule_name

if MULLKEN_OOF:
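    # Mulliken-charge out-of-fold predictions from an earlier model, joined as
    # features via the train ids (presumably; the pickle layout is assumed).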
    mullken_df = unpickle("../processed/v003/mullken_0_1_train.pkl")
    mullkan_0 = mullken_df["mulliken_0"]
    mullkan_1 = mullken_df["mulliken_1"]
    mullkan_0 = mullkan_0.loc[train["id"]]
    mullkan_1 = mullkan_1.loc[train["id"]]
###################################################################################################
# final data preparation for train
X: pd.DataFrame = train[use_cols_revised].copy()
y: pd.Series = train['scalar_coupling_constant']
y_fc: pd.Series = train['fc']
X_test: pd.DataFrame = test[use_cols_revised].copy()
print(f"X.shape: {X.shape}, X_test.shape: {X_test.shape}")

# X.to_csv("../info/X_sampled.csv")

# export colnames
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
if False:
    train = pd.read_csv(f'{file_folder}/train.csv')
    test = pd.read_csv(f'{file_folder}/test.csv')
    sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')
    train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]
    test_cos = unpickle("../processed/v001/test_003.df.pkl", )[[
        "id", "f003:cos_0_1", "f003:cos_1"
    ]]

    # train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
    # test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )

    train_add = unpickle("../processed/v003/train_006.df.pkl", )
    test_add = unpickle("../processed/v003/test_006.df.pkl", )

    babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
    babel_train = pd.read_csv("../processed/v003/babel_train.csv",
                              usecols=babel_cols)
    babel_test = pd.read_csv("../processed/v003/babel_test.csv",
                             usecols=babel_cols)
Example #10
####################################################################################################
# path setting
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
#train, test = get_train_test_data(use_prev=True, prev_data_version="v003", prev_trial_no="078")
train = unpickle("../processed/v003/v003_078/train_compact_v003_078_yiemon_123J_HnJ_H123J.pkl")
test = unpickle("../processed/v003/v003_078/test_compact_v003_078_yiemon_123J_HnJ_H123J.pkl")

train_ = pd.read_csv("../input/train.csv")
test_ = pd.read_csv("../input/test.csv")
train["atom_index_0"] = train_["atom_index_0"].values
train["atom_index_1"] = train_["atom_index_1"].values
test["atom_index_0"] = test_["atom_index_0"].values
test["atom_index_1"] = test_["atom_index_1"].values
del train_
del test_
gc.collect()

print(f"test.shape: {test.shape}")
print(f"train.shape: {train.shape}")
print("train & test data loaded")
Example #11
####################################################################################################
# Data Loading
prev_data_version = "v003"
prev_trial_no = "033"

prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
train_data_path = f"{prev_folder}/X_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"
test_data_path = f"{prev_folder}/X_test_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"
y_data_path = f"{prev_folder}/y_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"
mol_name_data_path = f"{prev_folder}/mol_name_{prev_data_version}_{prev_trial_no}_yiemon_123J_rgs{debug_str}.pkl"

if Path(train_data_path).exists() and Path(test_data_path).exists() and \
    Path(y_data_path).exists() and Path(mol_name_data_path).exists():
    print("loading exist files")
    X_rgs = unpickle(train_data_path)
    X_test_rgs = unpickle(test_data_path)
    y = unpickle(y_data_path)
    mol_name = unpickle(mol_name_data_path)
    print("loaded exist files")
else:
    print("gathering files for model train.")
    train, test = get_prev_train_test_data(prev_data_version=prev_data_version,
                                           prev_trial_no=prev_trial_no)
    use_cols_revised = [
        c for c in use_cols.good_columns if c not in use_cols.remove_cols
    ]
    use_cols_revised = [c for c in use_cols_revised if c in train.columns]
    # high_importance_dict = get_high_importance_cols(data_version="v003", trial_version="045", verbose=False)

    mol_name = train.molecule_name
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
train = pd.read_csv(f'{file_folder}/train.csv')
mol_name = train.molecule_name.values

if False:
    test = pd.read_csv(f'{file_folder}/test.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle(
        save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle(
        save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

    train_add = unpickle(save_path / "train_006.df.pkl", )
    test_add = unpickle(save_path / "test_006.df.pkl", )

    babel_train = pd.read_csv(save_path / "babel_train.csv",
                              usecols=use_cols.babel_cols)
    babel_test = pd.read_csv(save_path / "babel_test.csv",
                             usecols=use_cols.babel_cols)

    use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
    rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                              usecols=use_cols.rdkit_cols)
    rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                             usecols=use_cols.rdkit_cols)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)


####################################################################################################
# Data Loading


train_path = save_path / "train_concat_xxx"
test_path  = save_path / "test_concat_xxx"
if train_path.exists() and test_path.exists():
    train = unpickle(train_path)
    test = unpickle(test_path)

else:

    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')
    test = pd.read_csv(f'{file_folder}/test.csv')
    sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
    train_cos = unpickle("../processed/v001/train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle("../processed/v001/test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

    # train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
    # test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )
Example #14
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
train = pd.read_csv(f'{file_folder}/train.csv')
mol_name = train.molecule_name.values

if False:
    test = pd.read_csv(f'{file_folder}/test.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle(
        save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle(
        save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

    train_add = unpickle(save_path / "train_006.df.pkl", )
    test_add = unpickle(save_path / "test_006.df.pkl", )

    babel_train = pd.read_csv(save_path / "babel_train.csv",
                              usecols=use_cols.babel_cols)
    babel_test = pd.read_csv(save_path / "babel_test.csv",
                             usecols=use_cols.babel_cols)

    use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
    rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                              usecols=use_cols.rdkit_cols)
    rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                             usecols=use_cols.rdkit_cols)
Example #15
        return self._model.best_iteration


# In[3]:

DATA_VERSION = "v001"
TRIAL_NO = "001"
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)

print("start loading...")
train = unpickle(save_path / "train_002.df.pkl", )
print("train loaded.")
test = unpickle(save_path / "test_002.df.pkl", )
print("test loaded.")
y = train["scalar_coupling_constant"]
train.drop("scalar_coupling_constant", axis=1, inplace=True)

train.set_index("id", inplace=True)
test.set_index("id", inplace=True)

print(train.shape, test.shape, y.shape)

categorical = [
    'atom_index_0', 'atom_index_1', 'atom_1', 'atom_0', 'type_0', 'type'
]
lgbm_params = {
Example #16
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}")
submit_path.mkdir(parents=True, exist_ok=True)

log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)

test_id = np.load("../input/test_id.npy")
print(f"test_id.shape: {test_id.shape}")

print("start loading...")
train1 = unpickle(save_path/"train_002.df.pkl", )
train2 = unpickle(save_path/"train_003.df.pkl", )
train3 = unpickle(save_path/"train_004.df.pkl", )
train = train1.merge(train2, on="id", how="left").merge(train3, on="id", how="left")
assert train.shape[0] == train1.shape[0], f"{train.shape[0]}, {train1.shape[0]}"
print(f"train.shape: {train.shape}")
del train1, train2, train3
gc.collect()
print("train loaded.")

test1  = unpickle(save_path/"test_002.df.pkl", )
test2  = unpickle(save_path/"test_003.df.pkl", )
test3  = unpickle(save_path/"test_004.df.pkl", )
test = test1.merge(test2, on="id", how="left").merge(test3, on="id", how="left")
assert test.shape[0] == test1.shape[0], f"{test.shape[0]}, {test1.shape[0]}"
print(f"test.shape: {test.shape}")
Example #17
            n_estimators=n_estimators,
            fold_group=None)
        X['oof_fc'] = result_dict_lgb1['oof']
        X_test['oof_fc'] = result_dict_lgb1['prediction']
        to_pickle(
            submit_path / f"train_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
            X['oof_fc'])
        to_pickle(
            submit_path / f"test_oof_fc_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
            X_test['oof_fc'])
        to_pickle(
            model_path /
            f"first_model_list_{DATA_VERSION}_{TRIAL_NO}_{seed}.pkl",
            result_dict_lgb1["models"])
    else:
        X['oof_fc'] = unpickle(
            "../submit/v003_036/train_oof_fc_v003_036_21.pkl")
        X_test['oof_fc'] = unpickle(
            "../submit/v003_036/test_oof_fc_v003_036_21.pkl")

        #X['oof_fc'] = unpickle(f"../submit/{DATA_VERSION}_019/train_oof_fc_{DATA_VERSION}_019.pkl", )
        #X_test['oof_fc'] = unpickle(f"../submit/{DATA_VERSION}_019/test_oof_fc_{DATA_VERSION}_019.pkl", )

    #########################################################################################################
    # 2nd layer model
    X_short = pd.DataFrame({
        'ind': list(X.index),
        'type': X['type'].values,
        'oof': [0] * len(X),
        'target': y.values
    })
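
    # Hedged sketch of the per-type second stage X_short is set up for, following
    # the common per-`type` retraining pattern (train_model_regression, X, y and
    # folds as defined elsewhere in this listing; X_test_t and lgbm_params are
    # illustrative names):
    for t in X_short['type'].unique():
        type_idx = X_short.index[X_short['type'] == t]
        X_t = X.loc[X_short.loc[type_idx, 'ind']]
        y_t = X_short.loc[type_idx, 'target']
        # result_t = train_model_regression(X_t, X_test_t, y_t, params=lgbm_params,
        #                                    folds=folds, model_type='lgb')
        # X_short.loc[type_idx, 'oof'] = result_t['oof']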
Example #18
import numpy as np
import pandas as pd
import os
import sys
sys.path.append('..')
from lib.line_notif import send_message
from lib.utils import reduce_mem_usage, current_time, unpickle, to_pickle

# sklearn
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, LatentDirichletAllocation, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

df = unpickle("../processed/v003/acsf_feat.pkl")

SEED = 71
N_COMP = 5
num_clusters2 = 5

fa = FactorAnalysis(n_components=N_COMP, )
pca = PCA(n_components=N_COMP, random_state=SEED)
tsvd = TruncatedSVD(n_components=N_COMP, random_state=SEED)
ica = FastICA(n_components=N_COMP, random_state=SEED)
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=SEED)
srp = SparseRandomProjection(n_components=N_COMP,
                             dense_output=True,
                             random_state=SEED)
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED)
tsne = TSNE(n_components=3, random_state=SEED)
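
# A minimal continuation sketch, assuming the reducers above are fit on the
# standardized numeric features (TSNE usually being run separately due to cost):
ss = StandardScaler()
X_scaled = ss.fit_transform(df.select_dtypes("number").fillna(0))
pca_comp = pca.fit_transform(X_scaled)  # (n_rows, N_COMP) array of components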
Example #19
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

mid_path = Path(f"../mid/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
mid_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
print("start data loading")
# train = unpickle("../processed/v003/v003_098/train_compact_v003_098.pkl")
# test = unpickle("../processed/v003/v003_098/test_compact_v003_098.pkl")
train = unpickle(
    "../processed/v003/v003_104/train_compact_v003_104_compact.pkl")
test = unpickle("../processed/v003/v003_104/test_compact_v003_104_compact.pkl")

train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv(
    '../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

# feat_train = unpickle("../processed/v003/atom_3J_substituents1_train_na.pkl")
# feat_test = unpickle("../processed/v003/atom_3J_substituents1_test_na.pkl")
# train = pd.concat([train, feat_train], axis=1)
Example #20
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

mid_path = Path(f"../mid/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
mid_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
#train, test = get_train_test_data(use_prev=True, prev_data_version="v003", prev_trial_no="078")
# train = unpickle("../processed/v003/v003_091/train_compact_v003_091.pkl")
# test = unpickle("../processed/v003/v003_091/test_compact_v003_091.pkl")

train = unpickle("../processed/v003/v003_098/train_compact_v003_098.pkl")
test = unpickle("../processed/v003/v003_098/test_compact_v003_098.pkl")

train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv(
    '../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

# seg_submolecule_fp_maccs_train = unpickle("../processed/v003/seg_submolecule_fp_maccs_train.pkl")
# seg_submolecule_fp_maccs_test = unpickle("../processed/v003/seg_submolecule_fp_maccs_test.pkl")
#
Example #21
def train_model_regression(X, X_test, y, params, folds, model_type='lgb', eval_metric='mae', columns=None,
                           plot_feature_importance=False, model=None,
                           verbose=10000, early_stopping_rounds=200, n_estimators=50000, mol_type=-1,
                           fold_group=None, skip_folds=None, phase_mark="", skipped_mark=()):  # tuple default avoids the mutable-default pitfall
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :param X: training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :param X_test: test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :param y: target
    :param folds: folds to split data
    :param model_type: type of model to use
    :param eval_metric: metric to use
    :param columns: columns to use; if None, use all columns
    :param plot_feature_importance: whether to plot feature importance of LGB
    :param model: sklearn model, works only for "sklearn" model type

    """
    assert isinstance(skip_folds, list) or skip_folds is None
    print(f"skip_folds :{skip_folds}")

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {'mae': {'lgb_metric_name': 'mae',
                            'catboost_metric_name': 'MAE',
                            'sklearn_scoring_function': metrics.mean_absolute_error},
                    'group_mae': {'lgb_metric_name': 'mae',
                                  'catboost_metric_name': 'MAE',
                                  'scoring_function': group_mean_log_mae},
                    'mse': {'lgb_metric_name': 'mse',
                            'catboost_metric_name': 'MSE',
                            'sklearn_scoring_function': metrics.mean_squared_error}
                    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on test data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, groups=fold_group)):

        if skip_folds is not None and fold_n in skip_folds and phase_mark in skipped_mark:
            print(f'Fold {fold_n + 1} is skipped!!! at {time.ctime()}')
            oof = unpickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", )
            y_pred = unpickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", )
            model = unpickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", )
            fold_importance = unpickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", )

            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
            prediction += y_pred
            model_list += [model]
            continue

        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            # ndarray input: select rows positionally (named-column selection
            # only applies to the DataFrame branch below)
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=n_estimators, n_jobs=-1, importance_type='gain')
            print(model)
            model.fit(X_train, y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose, early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            params["objective"] = "reg:linear"
            params["eval_metric"] = metrics_dict[eval_metric]['lgb_metric_name']
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200,
                              verbose_eval=verbose, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=X.columns), ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000, eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                                      **params,
                                      loss_function=metrics_dict[eval_metric]['catboost_metric_name'])
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )

        if eval_metric != 'group_mae':
            scores.append(metrics_dict[eval_metric]['sklearn_scoring_function'](y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1

            try:
                fold_importance.to_csv(mid_path / f"importance_cv_{fold_n}.csv")
            except Exception as e:
                print("failed to save importance...")
                print(e)

            feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
        model_list += [model]

        try:
            to_pickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", oof)
            to_pickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", y_pred)
            to_pickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", model)
            to_pickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", fold_importance)
        except Exception as e:
            print("failed to save intermediate data...")
            print(e)

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits
    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO} CV mean score: {np.mean(scores):.4f}, std: {np.std(scores):.4f}.'
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    return result_dict
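
# Hypothetical call for the function above, assuming X, X_test, y, mol_name and
# lgbm_params are prepared as in the surrounding snippets:
from sklearn.model_selection import GroupKFold

folds = GroupKFold(n_splits=5)
result_dict_lgb = train_model_regression(X, X_test, y, params=lgbm_params,
                                         folds=folds, model_type='lgb',
                                         eval_metric='group_mae',
                                         plot_feature_importance=True,
                                         verbose=1000, fold_group=mol_name)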
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
file_folder = '../input'
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
train = pd.read_csv(f'{file_folder}/train.csv')
mol_name = train.molecule_name.values

if True:
    test = pd.read_csv(f'{file_folder}/test.csv')
    structures = pd.read_csv(f'{file_folder}/structures.csv')
    scalar_coupling_contributions = pd.read_csv(
        f'{file_folder}/scalar_coupling_contributions.csv')

    train_cos = unpickle(
        save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
    test_cos = unpickle(
        save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

    train_add = unpickle(save_path / "train_006.df.pkl", )
    test_add = unpickle(save_path / "test_006.df.pkl", )

    babel_train = pd.read_csv(save_path / "babel_train.csv",
                              usecols=use_cols.babel_cols)
    babel_test = pd.read_csv(save_path / "babel_test.csv",
                             usecols=use_cols.babel_cols)

    use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
    rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                              usecols=use_cols.rdkit_cols)
    rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                             usecols=use_cols.rdkit_cols)
####################################################################################################
# Data Loading
train, test = get_train_test_data(use_prev=True, prev_data_version="v003", prev_trial_no="078")



def map_acsf(df, acsf_df, atom_idx):
    df = pd.merge(df, acsf_df, how='left',
                  left_on=['molecule_name', f'atom_index_{atom_idx}'],
                  right_on=['molecule_name', 'atom_index'])

    df = df.drop('atom_index', axis=1)
    df = df.rename(columns={c:f"{c}_{atom_idx}" for c in acsf_df.columns[2:]})
    return df

acsf_df = unpickle("../processed/v003/acsf_feat.pkl")
print("acsf_train 0")
train = map_acsf(train, acsf_df, 0)
print("acsf_train 1")
train = map_acsf(train, acsf_df, 1)
print("acsf_test 0")
test = map_acsf(test, acsf_df, 0)
print("acsf_test 1")
test = map_acsf(test, acsf_df, 1)
print("acsf finished")

# seg_H1J_bond_extension1_train = unpickle("../processed/v003/seg_H1J_bond_extension1_train.pkl")
# seg_H1J_bond_extension1_test  = unpickle("../processed/v003/seg_H1J_bond_extension1_test.pkl")
# train = pd.concat([train, seg_H1J_bond_extension1_train], axis=1)
# test = pd.concat([test, seg_H1J_bond_extension1_test], axis=1)
Example #24
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading

file_folder = '../input'
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
sub = pd.read_csv(f'{file_folder}/sample_submission.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')
scalar_coupling_contributions = pd.read_csv(
    f'{file_folder}/scalar_coupling_contributions.csv')
train_cos = unpickle("../processed/v001/train_003.df.pkl", )[[
    "id", "f003:cos_0_1", "f003:cos_1"
]]
test_cos = unpickle("../processed/v001/test_003.df.pkl", )[[
    "id", "f003:cos_0_1", "f003:cos_1"
]]

# train_angle_add = unpickle("../processed/v003/train_005.df.pkl", )
# test_angle_add = unpickle("../processed/v003/test_005.df.pkl", )

train_add = unpickle("../processed/v003/train_006.df.pkl", )
test_add = unpickle("../processed/v003/test_006.df.pkl", )

babel_cols = ['id', 'Angle', 'Torsion', 'cos2T', 'cosT', 'sp']
babel_train = pd.read_csv("../processed/v003/babel_train.csv",
                          usecols=babel_cols)
babel_test = pd.read_csv("../processed/v003/babel_test.csv",
####################################################################################################
# path setting
save_path = Path(f"../processed/{DATA_VERSION}")
save_path.mkdir(parents=True, exist_ok=True)
model_path = Path(f"../model/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
model_path.mkdir(parents=True, exist_ok=True)
submit_path = Path(f"../submit/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
submit_path.mkdir(parents=True, exist_ok=True)
log_path = Path(f"../log/{DATA_VERSION}_{TRIAL_NO}{debug_str}")
log_path.mkdir(parents=True, exist_ok=True)

####################################################################################################
# Data Loading
#train, test = get_train_test_data(use_prev=True, prev_data_version="v003", prev_trial_no="078")
train = unpickle("../processed/v003/v003_091/train_compact_v003_091.pkl")
test = unpickle("../processed/v003/v003_091/test_compact_v003_091.pkl")

train_ = pd.read_csv("../input/train.csv")
train_id = train_.id
mol_name = train_.molecule_name
scalar_coupling_constant = train_.scalar_coupling_constant
scalar_coupling_contributions = pd.read_csv('../input/scalar_coupling_contributions.csv')
fc = scalar_coupling_contributions.fc
del train_
del scalar_coupling_contributions

seg_submolecule_fp_maccs_train = unpickle("../processed/v003/seg_submolecule_fp_maccs_train.pkl")
seg_submolecule_fp_maccs_test = unpickle("../processed/v003/seg_submolecule_fp_maccs_test.pkl")

train = pd.concat([train, seg_submolecule_fp_maccs_train], axis=1)
def get_train_test_data(use_prev=False,
                        prev_data_version=None,
                        prev_trial_no=None):
    if use_prev:
        assert prev_data_version is not None
        assert prev_trial_no is not None

    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')
    if not use_prev:
        test = pd.read_csv(f'{file_folder}/test.csv')
        structures = pd.read_csv(f'{file_folder}/structures.csv')
        scalar_coupling_contributions = pd.read_csv(
            f'{file_folder}/scalar_coupling_contributions.csv')

        # train_cos = unpickle(save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
        # test_cos = unpickle(save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

        train_add = unpickle(save_path / "train_006.df.pkl", )
        test_add = unpickle(save_path / "test_006.df.pkl", )

        babel_train = pd.read_csv(save_path / "babel_train.csv",
                                  usecols=use_cols.babel_cols)
        babel_test = pd.read_csv(save_path / "babel_test.csv",
                                 usecols=use_cols.babel_cols)

        use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
        rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                                  usecols=use_cols.rdkit_cols)
        rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                                 usecols=use_cols.rdkit_cols)

        coulomb_train = pd.read_csv(save_path /
                                    "coulomb_interaction_train.csv")
        coulomb_test = pd.read_csv(save_path / "coulomb_interaction_test.csv")

        bond_calc_train = unpickle(save_path / "bond_calc_feat_train.pkl")
        bond_calc_test = unpickle(save_path / "bond_calc_feat_test.pkl")

        ob_charges = pd.read_csv(save_path / "ob_charges.csv", index_col=0)

        tda_radius_df = pd.read_csv(save_path / "tda_radius_df.csv",
                                    index_col=0)

        tda_radius_df_03 = pd.read_csv(save_path / "tda_radius_df_v003.csv",
                                       index_col=0)

        pca_feat = unpickle(save_path / "pca_feat_df.pkl")

        ####################################################################################################
        # Feature Engineering

        train = pd.merge(
            train,
            scalar_coupling_contributions,
            how='left',
            left_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'],
            right_on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

        train = map_atom_info(train, 0, structures)
        train = map_atom_info(train, 1, structures)
        test = map_atom_info(test, 0, structures)
        test = map_atom_info(test, 1, structures)

        train_p_0 = train[['x_0', 'y_0', 'z_0']].values
        train_p_1 = train[['x_1', 'y_1', 'z_1']].values
        test_p_0 = test[['x_0', 'y_0', 'z_0']].values
        test_p_1 = test[['x_1', 'y_1', 'z_1']].values

        train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
        test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
        train['dist_x'] = (train['x_0'] - train['x_1'])**2
        test['dist_x'] = (test['x_0'] - test['x_1'])**2
        train['dist_y'] = (train['y_0'] - train['y_1'])**2
        test['dist_y'] = (test['y_0'] - test['y_1'])**2
        train['dist_z'] = (train['z_0'] - train['z_1'])**2
        test['dist_z'] = (test['z_0'] - test['z_1'])**2

        train['type_0'] = train['type'].apply(lambda x: x[0])
        test['type_0'] = test['type'].apply(lambda x: x[0])

        train['abs_dist'] = np.linalg.norm(train_p_0 - train_p_1,
                                           axis=1,
                                           ord=1)
        test['abs_dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1, ord=1)
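        # dist12 is assumed to be a module-level helper, defined outside this
        # excerpt, that adds squared two-axis distance columns to train/test.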
        dist12('dist_xy', 'x', 'y')
        dist12('dist_xz', 'x', 'z')
        dist12('dist_yz', 'y', 'z')

        atom_count = structures.groupby(['molecule_name',
                                         'atom']).size().unstack(fill_value=0)
        train = pd.merge(train,
                         atom_count,
                         how='left',
                         left_on='molecule_name',
                         right_on='molecule_name')
        test = pd.merge(test,
                        atom_count,
                        how='left',
                        left_on='molecule_name',
                        right_on='molecule_name')

        train = create_features(train)
        test = create_features(test)

        angle_df_train, angle_df_test = angle_feature_conv(structures)
        train = train.merge(angle_df_train, on="id", how="left")
        test = test.merge(angle_df_test, on="id", how="left")

        train = train.merge(train_add, on="id", how="left")
        test = test.merge(test_add, on="id", how="left")

        # train = train.merge(train_cos, on="id", how="left")
        # test = test.merge(test_cos, on="id", how="left")

        train = train.merge(babel_train, on="id", how="left")
        test = test.merge(babel_test, on="id", how="left")

        train = train.merge(rdkit_train, on="id", how="left")
        test = test.merge(rdkit_test, on="id", how="left")

        train = train.merge(coulomb_train, on="id", how="left")
        test = test.merge(coulomb_test, on="id", how="left")

        train = train.merge(bond_calc_train, on="id", how="left")
        test = test.merge(bond_calc_test, on="id", how="left")

        train = train.merge(tda_radius_df, on="molecule_name", how="left")
        test = test.merge(tda_radius_df, on="molecule_name", how="left")

        train = train.merge(tda_radius_df_03, on="molecule_name", how="left")
        test = test.merge(tda_radius_df_03, on="molecule_name", how="left")

        train = train.merge(pca_feat, on="molecule_name", how="left")
        test = test.merge(pca_feat, on="molecule_name", how="left")

        train = map_ob_charges(train, ob_charges, 0)
        train = map_ob_charges(train, ob_charges, 1)
        test = map_ob_charges(test, ob_charges, 0)
        test = map_ob_charges(test, ob_charges, 1)

        train = reduce_mem_usage(train)
        test = reduce_mem_usage(test)

        for f in ['atom_1', 'type_0', 'type']:
            if f in use_cols.good_columns:
                lbl = LabelEncoder()
                lbl.fit(list(train[f].values) + list(test[f].values))
                train[f] = lbl.transform(list(train[f].values))
                test[f] = lbl.transform(list(test[f].values))

        Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True,
                                                             exist_ok=True)
        to_pickle(
            save_path /
            f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            train)
        to_pickle(
            save_path /
            f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            test)
    else:
        sample_loaded = False
        prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
        if DEBUG:
            # v003_033
            train_path = Path(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )
            test_path = Path(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )

            if train_path.exists() and test_path.exists():
                print("sample loading")
                train = unpickle(train_path)
                test = unpickle(test_path)
                sample_loaded = True
                print("sample load finish")

        if not sample_loaded:
            print(f"loading previous dataest")
            print("train loading")
            train: pd.DataFrame = unpickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            assert "scalar_coupling_constant" in train.columns
            print("test loading")
            test: pd.DataFrame = unpickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            print(f"loading finished")

        if DEBUG and not sample_loaded:
            n_sample = 5000
            print(f"sampling {n_sample} rows.")
            train = train.sample(n=n_sample)
            test = test.sample(n=n_sample)
            Path(prev_folder).mkdir(parents=True, exist_ok=True)
            to_pickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                train)
            to_pickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                test)
            print("saved.")

        ###################################################################################################
        # add additional feature for trying

        # Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
    return train, test
# coding: utf-8
import pandas as pd
import sys
sys.path.append('..')
from lib.utils import current_time, unpickle, to_pickle

# sklearn
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

df = unpickle("../processed/v003/mol_vec_df.pkl").set_index("molecule_name")

SEED = 71
N_COMP = 10
num_clusters2 = 10

fa = FactorAnalysis(n_components=N_COMP, )
pca = PCA(n_components=N_COMP, random_state=SEED)
tsvd = TruncatedSVD(n_components=N_COMP, random_state=SEED)
ica = FastICA(n_components=N_COMP, random_state=SEED)
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=SEED)
srp = SparseRandomProjection(n_components=N_COMP,
                             dense_output=True,
                             random_state=SEED)
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED)
tsne = TSNE(n_components=3, random_state=SEED)

ss = StandardScaler()
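
# A minimal sketch of how the fitted components are typically assembled into a
# per-molecule feature frame (column names illustrative; TSNE omitted for cost):
X_scaled = ss.fit_transform(df.fillna(0))
feat = {}
for name, reducer in [("fa", fa), ("pca", pca), ("tsvd", tsvd), ("ica", ica),
                      ("grp", grp), ("srp", srp)]:
    comp = reducer.fit_transform(X_scaled)
    for i in range(comp.shape[1]):
        feat[f"mol_{name}_{i}"] = comp[:, i]
feat["mol_mbkm_cluster"] = mbkm.fit_predict(X_scaled)
decomp_df = pd.DataFrame(feat, index=df.index)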