Example 1
def create_features(df):
    # Aggregate statistics of the pair distance (and partner coordinates), grouped
    # by molecule, by (molecule, atom index) and by (molecule, coupling type).
    df['molecule_couples'] = df.groupby('molecule_name')['id'].transform('count')
    df['molecule_dist_mean'] = df.groupby('molecule_name')['dist'].transform('mean')
    df['molecule_dist_min'] = df.groupby('molecule_name')['dist'].transform('min')
    df['molecule_dist_max'] = df.groupby('molecule_name')['dist'].transform('max')
    df['atom_0_couples_count'] = df.groupby(['molecule_name', 'atom_index_0'])['id'].transform('count')
    df['atom_1_couples_count'] = df.groupby(['molecule_name', 'atom_index_1'])['id'].transform('count')

    df['molecule_atom_index_0_x_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['x_1'].transform('std')
    df['molecule_atom_index_0_y_1_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('mean')
    df['molecule_atom_index_0_y_1_mean_diff'] = df['molecule_atom_index_0_y_1_mean'] - df['y_1']
    df['molecule_atom_index_0_y_1_mean_div'] = df['molecule_atom_index_0_y_1_mean'] / df['y_1']
    df['molecule_atom_index_0_y_1_max'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('max')
    df['molecule_atom_index_0_y_1_max_diff'] = df['molecule_atom_index_0_y_1_max'] - df['y_1']
    df['molecule_atom_index_0_y_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['y_1'].transform('std')
    df['molecule_atom_index_0_z_1_std'] = df.groupby(['molecule_name', 'atom_index_0'])['z_1'].transform('std')
    df['molecule_atom_index_0_dist_mean'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('mean')
    df['molecule_atom_index_0_dist_mean_diff'] = df['molecule_atom_index_0_dist_mean'] - df['dist']
    df['molecule_atom_index_0_dist_mean_div'] = df['molecule_atom_index_0_dist_mean'] / df['dist']
    df['molecule_atom_index_0_dist_max'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('max')
    df['molecule_atom_index_0_dist_max_diff'] = df['molecule_atom_index_0_dist_max'] - df['dist']
    df['molecule_atom_index_0_dist_max_div'] = df['molecule_atom_index_0_dist_max'] / df['dist']
    df['molecule_atom_index_0_dist_min'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('min')
    df['molecule_atom_index_0_dist_min_diff'] = df['molecule_atom_index_0_dist_min'] - df['dist']
    df['molecule_atom_index_0_dist_min_div'] = df['molecule_atom_index_0_dist_min'] / df['dist']
    df['molecule_atom_index_0_dist_std'] = df.groupby(['molecule_name', 'atom_index_0'])['dist'].transform('std')
    df['molecule_atom_index_0_dist_std_diff'] = df['molecule_atom_index_0_dist_std'] - df['dist']
    df['molecule_atom_index_0_dist_std_div'] = df['molecule_atom_index_0_dist_std'] / df['dist']
    df['molecule_atom_index_1_dist_mean'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('mean')
    df['molecule_atom_index_1_dist_mean_diff'] = df['molecule_atom_index_1_dist_mean'] - df['dist']
    df['molecule_atom_index_1_dist_mean_div'] = df['molecule_atom_index_1_dist_mean'] / df['dist']
    df['molecule_atom_index_1_dist_max'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('max')
    df['molecule_atom_index_1_dist_max_diff'] = df['molecule_atom_index_1_dist_max'] - df['dist']
    df['molecule_atom_index_1_dist_max_div'] = df['molecule_atom_index_1_dist_max'] / df['dist']
    df['molecule_atom_index_1_dist_min'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('min')
    df['molecule_atom_index_1_dist_min_diff'] = df['molecule_atom_index_1_dist_min'] - df['dist']
    df['molecule_atom_index_1_dist_min_div'] = df['molecule_atom_index_1_dist_min'] / df['dist']
    df['molecule_atom_index_1_dist_std'] = df.groupby(['molecule_name', 'atom_index_1'])['dist'].transform('std')
    df['molecule_atom_index_1_dist_std_diff'] = df['molecule_atom_index_1_dist_std'] - df['dist']
    df['molecule_atom_index_1_dist_std_div'] = df['molecule_atom_index_1_dist_std'] / df['dist']
    df['molecule_atom_1_dist_mean'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('mean')
    df['molecule_atom_1_dist_min'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('min')
    df['molecule_atom_1_dist_min_diff'] = df['molecule_atom_1_dist_min'] - df['dist']
    df['molecule_atom_1_dist_min_div'] = df['molecule_atom_1_dist_min'] / df['dist']
    df['molecule_atom_1_dist_std'] = df.groupby(['molecule_name', 'atom_1'])['dist'].transform('std')
    df['molecule_atom_1_dist_std_diff'] = df['molecule_atom_1_dist_std'] - df['dist']
    df['molecule_type_0_dist_std'] = df.groupby(['molecule_name', 'type_0'])['dist'].transform('std')
    df['molecule_type_0_dist_std_diff'] = df['molecule_type_0_dist_std'] - df['dist']
    df['molecule_type_dist_mean'] = df.groupby(['molecule_name', 'type'])['dist'].transform('mean')
    df['molecule_type_dist_mean_diff'] = df['molecule_type_dist_mean'] - df['dist']
    df['molecule_type_dist_mean_div'] = df['molecule_type_dist_mean'] / df['dist']
    df['molecule_type_dist_max'] = df.groupby(['molecule_name', 'type'])['dist'].transform('max')
    df['molecule_type_dist_min'] = df.groupby(['molecule_name', 'type'])['dist'].transform('min')
    df['molecule_type_dist_std'] = df.groupby(['molecule_name', 'type'])['dist'].transform('std')
    df['molecule_type_dist_std_diff'] = df['molecule_type_dist_std'] - df['dist']

    df = reduce_mem_usage(df)
    return df
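
create_features (like most of the snippets below) finishes with reduce_mem_usage, whose definition is not included in these examples. A minimal sketch of the kind of helper presumably meant, downcasting numeric columns to save memory, is shown here; the original implementation may differ.

import pandas as pd

def reduce_mem_usage(df, verbose=True):
    # Hypothetical stand-in for the helper used above: downcast numeric
    # columns to the smallest dtype that still holds their values.
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif pd.api.types.is_float_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='float')
    if verbose:
        end_mem = df.memory_usage().sum() / 1024 ** 2
        print(f'Memory usage reduced from {start_mem:.2f} MB to {end_mem:.2f} MB')
    return df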
Example 2
def add_qm9_features(df):
    data_qm9 = pd.read_pickle('../input/data.covs.pickle')
    to_drop = [
        'type', 'linear', 'atom_index_0', 'atom_index_1',
        'scalar_coupling_constant', 'U', 'G', 'H', 'mulliken_mean', 'r2', 'U0'
    ]
    data_qm9 = data_qm9.drop(columns=to_drop)
    data_qm9 = reduce_mem_usage(data_qm9, verbose=False)
    df = pd.merge(df, data_qm9, how='left', on=['molecule_name', 'id'])
    del data_qm9

    df = dummies(df, ['type', 'atom_1'])
    return df
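
The dummies helper referenced above is not part of these snippets either. A plausible minimal sketch, assuming it simply one-hot encodes the listed columns with pd.get_dummies, would be:

import pandas as pd

def dummies(df, cols):
    # Hypothetical sketch: replace each listed column with its indicator columns.
    for col in cols:
        one_hot = pd.get_dummies(df[col], prefix=col)
        df = pd.concat([df.drop(columns=[col]), one_hot], axis=1)
    return df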
Example 3
def get_train_test_data(use_prev=False,
                        prev_data_version=None,
                        prev_trial_no=None):
    if use_prev:
        assert prev_data_version is not None
        assert prev_trial_no is not None

    file_folder = '../input'
    train = pd.read_csv(f'{file_folder}/train.csv')
    if not use_prev:
        test = pd.read_csv(f'{file_folder}/test.csv')
        structures = pd.read_csv(f'{file_folder}/structures.csv')
        scalar_coupling_contributions = pd.read_csv(
            f'{file_folder}/scalar_coupling_contributions.csv')

        # train_cos = unpickle(save_path / "train_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]
        # test_cos = unpickle(save_path / "test_003.df.pkl", )[["id", "f003:cos_0_1", "f003:cos_1"]]

        train_add = unpickle(save_path / "train_006.df.pkl")
        test_add = unpickle(save_path / "test_006.df.pkl")

        babel_train = pd.read_csv(save_path / "babel_train.csv",
                                  usecols=use_cols.babel_cols)
        babel_test = pd.read_csv(save_path / "babel_test.csv",
                                 usecols=use_cols.babel_cols)

        use_cols.good_columns += [c for c in use_cols.rdkit_cols if c != 'id']
        rdkit_train = pd.read_csv(save_path / "rdkit_train.csv",
                                  usecols=use_cols.rdkit_cols)
        rdkit_test = pd.read_csv(save_path / "rdkit_test.csv",
                                 usecols=use_cols.rdkit_cols)

        coulomb_train = pd.read_csv(save_path /
                                    "coulomb_interaction_train.csv")
        coulomb_test = pd.read_csv(save_path / "coulomb_interaction_test.csv")

        bond_calc_train = unpickle(save_path / "bond_calc_feat_train.pkl")
        bond_calc_test = unpickle(save_path / "bond_calc_feat_test.pkl")

        ob_charges = pd.read_csv(save_path / "ob_charges.csv", index_col=0)

        tda_radius_df = pd.read_csv(save_path / "tda_radius_df.csv",
                                    index_col=0)

        tda_radius_df_03 = pd.read_csv(save_path / "tda_radius_df_v003.csv",
                                       index_col=0)

        pca_feat = unpickle(save_path / "pca_feat_df.pkl")

        ####################################################################################################
        # Feature Engineering

        train = pd.merge(
            train,
            scalar_coupling_contributions,
            how='left',
            on=['molecule_name', 'atom_index_0', 'atom_index_1', 'type'])

        train = map_atom_info(train, 0, structures)
        train = map_atom_info(train, 1, structures)
        test = map_atom_info(test, 0, structures)
        test = map_atom_info(test, 1, structures)

        train_p_0 = train[['x_0', 'y_0', 'z_0']].values
        train_p_1 = train[['x_1', 'y_1', 'z_1']].values
        test_p_0 = test[['x_0', 'y_0', 'z_0']].values
        test_p_1 = test[['x_1', 'y_1', 'z_1']].values

        train['dist'] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
        test['dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
        train['dist_x'] = (train['x_0'] - train['x_1'])**2
        test['dist_x'] = (test['x_0'] - test['x_1'])**2
        train['dist_y'] = (train['y_0'] - train['y_1'])**2
        test['dist_y'] = (test['y_0'] - test['y_1'])**2
        train['dist_z'] = (train['z_0'] - train['z_1'])**2
        test['dist_z'] = (test['z_0'] - test['z_1'])**2

        train['type_0'] = train['type'].apply(lambda x: x[0])
        test['type_0'] = test['type'].apply(lambda x: x[0])

        train['abs_dist'] = np.linalg.norm(train_p_0 - train_p_1,
                                           axis=1,
                                           ord=1)
        test['abs_dist'] = np.linalg.norm(test_p_0 - test_p_1, axis=1, ord=1)
        dist12('dist_xy', 'x', 'y')
        dist12('dist_xz', 'x', 'z')
        dist12('dist_yz', 'y', 'z')

        atom_count = structures.groupby(['molecule_name',
                                         'atom']).size().unstack(fill_value=0)
        train = pd.merge(train,
                         atom_count,
                         how='left',
                         left_on='molecule_name',
                         right_on='molecule_name')
        test = pd.merge(test,
                        atom_count,
                        how='left',
                        left_on='molecule_name',
                        right_on='molecule_name')

        train = create_features(train)
        test = create_features(test)

        angle_df_train, angle_df_test = angle_feature_conv(structures)
        train = train.merge(angle_df_train, on="id", how="left")
        test = test.merge(angle_df_test, on="id", how="left")

        train = train.merge(train_add, on="id", how="left")
        test = test.merge(test_add, on="id", how="left")

        # train = train.merge(train_cos, on="id", how="left")
        # test = test.merge(test_cos, on="id", how="left")

        train = train.merge(babel_train, on="id", how="left")
        test = test.merge(babel_test, on="id", how="left")

        train = train.merge(rdkit_train, on="id", how="left")
        test = test.merge(rdkit_test, on="id", how="left")

        train = train.merge(coulomb_train, on="id", how="left")
        test = test.merge(coulomb_test, on="id", how="left")

        train = train.merge(bond_calc_train, on="id", how="left")
        test = test.merge(bond_calc_test, on="id", how="left")

        train = train.merge(tda_radius_df, on="molecule_name", how="left")
        test = test.merge(tda_radius_df, on="molecule_name", how="left")

        train = train.merge(tda_radius_df_03, on="molecule_name", how="left")
        test = test.merge(tda_radius_df_03, on="molecule_name", how="left")

        train = train.merge(pca_feat, on="molecule_name", how="left")
        test = test.merge(pca_feat, on="molecule_name", how="left")

        train = map_ob_charges(train, ob_charges, 0)
        train = map_ob_charges(train, ob_charges, 1)
        test = map_ob_charges(test, ob_charges, 0)
        test = map_ob_charges(test, ob_charges, 1)

        train = reduce_mem_usage(train)
        test = reduce_mem_usage(test)

        for f in ['atom_1', 'type_0', 'type']:
            if f in use_cols.good_columns:
                lbl = LabelEncoder()
                lbl.fit(list(train[f].values) + list(test[f].values))
                train[f] = lbl.transform(list(train[f].values))
                test[f] = lbl.transform(list(test[f].values))

        Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True,
                                                             exist_ok=True)
        to_pickle(
            save_path /
            f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            train)
        to_pickle(
            save_path /
            f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl",
            test)
    else:
        sample_loaded = False
        prev_folder = f"../processed/{prev_data_version}/{prev_data_version}_{prev_trial_no}"
        if DEBUG:
            # v003_033
            train_path = Path(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )
            test_path = Path(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl"
            )

            if train_path.exists() and test_path.exists():
                print("sample loading")
                train = unpickle(train_path)
                test = unpickle(test_path)
                sample_loaded = True
                print("sample load finish")

        if not sample_loaded:
            print(f"loading previous dataest")
            print("train loading")
            train: pd.DataFrame = unpickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            assert "scalar_coupling_constant" in train.columns
            print("test loading")
            test: pd.DataFrame = unpickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic.pkl",
            )
            print(f"loading finished")

        if DEBUG and not sample_loaded:
            n_sample = 5000
            print(f"sampling {n_sample} rows.")
            train = train.sample(n=n_sample)
            test = test.sample(n=n_sample)
            Path(prev_folder).mkdir(parents=True, exist_ok=True)
            to_pickle(
                f"{prev_folder}/train_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                train)
            to_pickle(
                f"{prev_folder}/test_concat_{prev_data_version}_{prev_trial_no}_basic_sampled.pkl",
                test)
            print("saved.")

        ###################################################################################################
        # add additional feature for trying

        # Path(save_path / f"{DATA_VERSION}_{TRIAL_NO}").mkdir(parents=True, exist_ok=True)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/train_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", train)
        # to_pickle(save_path / f"{DATA_VERSION}_{TRIAL_NO}/test_concat_{DATA_VERSION}_{TRIAL_NO}.pkl", test)
    return train, test
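
The driver code that calls get_train_test_data is not shown in these snippets. A hypothetical invocation, reusing the v003_033 identifiers mentioned in the DEBUG comment above, could look like this:

# Build the feature set from the raw CSVs and cache it:
train, test = get_train_test_data(use_prev=False)

# Or reload a previously built feature set (version/trial numbers are illustrative):
train, test = get_train_test_data(use_prev=True,
                                  prev_data_version="v003",
                                  prev_trial_no="033")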
Example 4
                  how="left")  # .merge(test_cos,  on="id", how="left")
# train = train.merge(train_angle_add, on="id", how="left")
# test = test.merge(test_angle_add, on="id", how="left")
train = train.merge(train_add, on="id", how="left")
test = test.merge(test_add, on="id", how="left")

train = train.merge(babel_train, on="id", how="left")
test = test.merge(babel_test, on="id", how="left")

ob_charges = pd.read_csv("../processed/v003/ob_charges.csv", index_col=0)
train = map_ob_charges(train, 0)
train = map_ob_charges(train, 1)
test = map_ob_charges(test, 0)
test = map_ob_charges(test, 1)

train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

for f in ['atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))

to_pickle(save_path / f"train_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl",
          train)
to_pickle(save_path / f"test_concat_v003_{DATA_VERSION}_{TRIAL_NO}.pkl", test)

X = train[good_columns].copy()
y = train['scalar_coupling_constant']
Example 5
 def transform(self, X):
     X['distance'] = np.linalg.norm(X[['x_x', 'y_x', 'z_x']].values -
                                    X[['x_y', 'y_y', 'z_y']].values,
                                    axis=1)
     X['x_dist'] = X['x_x'] - X['x_y']
     X['y_dist'] = X['y_x'] - X['y_y']
     X['z_dist'] = X['z_x'] - X['z_y']
     X['x_dist_abs'] = np.absolute(X['x_dist'])
     X['y_dist_abs'] = np.absolute(X['y_dist'])
     X['z_dist_abs'] = np.absolute(X['z_dist'])
     X['inv_distance3'] = 1 / (X['distance']**3)
     X['dimension_x'] = np.absolute(
         X.groupby(['molecule_name'])['x_x'].transform('max') -
         X.groupby(['molecule_name'])['x_x'].transform('min'))
     X['dimension_y'] = np.absolute(
         X.groupby(['molecule_name'])['y_x'].transform('max') -
         X.groupby(['molecule_name'])['y_x'].transform('min'))
     X['dimension_z'] = np.absolute(
         X.groupby(['molecule_name'])['z_x'].transform('max') -
         X.groupby(['molecule_name'])['z_x'].transform('min'))
     X['molecule_dist_mean_x'] = X.groupby(
         ['molecule_name'])['dist_mean_x'].transform('mean')
     X['molecule_dist_mean_y'] = X.groupby(
         ['molecule_name'])['dist_mean_y'].transform('mean')
     X['molecule_dist_mean_bond_x'] = X.groupby(
         ['molecule_name'])['dist_mean_bond_x'].transform('mean')
     X['molecule_dist_mean_bond_y'] = X.groupby(
         ['molecule_name'])['dist_mean_bond_y'].transform('mean')
     X['molecule_dist_range_x'] = X.groupby(['molecule_name'])['dist_mean_x'].transform('max') - \
                                  X.groupby(['molecule_name'])['dist_mean_x'].transform('min')
     X['molecule_dist_range_y'] = X.groupby(['molecule_name'])['dist_mean_y'].transform('max') - \
                                  X.groupby(['molecule_name'])['dist_mean_y'].transform('min')
     X['molecule_dist_std_x'] = X.groupby(
         ['molecule_name'])['dist_mean_x'].transform('std')
     X['molecule_dist_std_y'] = X.groupby(
         ['molecule_name'])['dist_mean_y'].transform('std')
     X['molecule_atom_0_dist_mean'] = X.groupby(
         ['molecule_name', 'atom_x'])['distance'].transform('mean')
     X['molecule_atom_1_dist_mean'] = X.groupby(
         ['molecule_name', 'atom_y'])['distance'].transform('mean')
     X['molecule_atom_0_dist_std_diff'] = X.groupby([
         'molecule_name', 'atom_x'
     ])['distance'].transform('std') - X['distance']
     X['molecule_atom_1_dist_std_diff'] = X.groupby([
         'molecule_name', 'atom_y'
     ])['distance'].transform('std') - X['distance']
     X['molecule_type_dist_min'] = X.groupby(
         ['molecule_name', 'type'])['distance'].transform('min')
     X['molecule_type_dist_max'] = X.groupby(
         ['molecule_name', 'type'])['distance'].transform('max')
     X['molecule_dist_mean_no_bond_x'] = X.groupby(
         ['molecule_name'])['dist_mean_no_bond_x'].transform('mean')
     X['molecule_dist_mean_no_bond_y'] = X.groupby(
         ['molecule_name'])['dist_mean_no_bond_y'].transform('mean')
     X['molecule_atom_index_0_dist_min'] = X.groupby([
         'molecule_name', 'atom_index_0'
     ])['distance'].transform('min')  # new variable - don't include
     X['molecule_atom_index_0_dist_std'] = X.groupby([
         'molecule_name', 'atom_index_0'
     ])['distance'].transform('std')  # new variable - don't include
     X['molecule_atom_index_0_dist_min_div'] = X[
         'molecule_atom_index_0_dist_min'] / X[
             'distance']  # new variable - include
     X['molecule_atom_index_0_dist_std_div'] = X[
         'molecule_atom_index_0_dist_std'] / X[
             'distance']  # new variable - include
     X['molecule_atom_index_0_dist_mean'] = X.groupby([
         'molecule_name', 'atom_index_0'
     ])['distance'].transform('mean')  # new variable - include
     X['molecule_atom_index_0_dist_max'] = X.groupby([
         'molecule_name', 'atom_index_0'
     ])['distance'].transform('max')  # new variable - include
     X['molecule_atom_index_0_dist_mean_diff'] = X[
         'molecule_atom_index_0_dist_mean'] - X[
             'distance']  # new variable - include
     X['molecule_atom_index_1_dist_mean'] = X.groupby([
         'molecule_name', 'atom_index_1'
     ])['distance'].transform('mean')  # new variable - include
     X['molecule_atom_index_1_dist_max'] = X.groupby([
         'molecule_name', 'atom_index_1'
     ])['distance'].transform('max')  # new variable - include
     X['molecule_atom_index_1_dist_min'] = X.groupby([
         'molecule_name', 'atom_index_1'
     ])['distance'].transform('min')  # new variable - include
     X['molecule_atom_index_1_dist_std'] = X.groupby([
         'molecule_name', 'atom_index_1'
     ])['distance'].transform('std')  # new variable - don't include
     X['molecule_atom_index_1_dist_min_div'] = X[
         'molecule_atom_index_1_dist_min'] / X[
             'distance']  # new variable - include
     X['molecule_atom_index_1_dist_std_diff'] = X[
         'molecule_atom_index_1_dist_std'] - X[
             'distance']  # new variable - include
     X['molecule_atom_index_1_dist_mean_div'] = X[
         'molecule_atom_index_1_dist_mean'] / X[
             'distance']  # new variable - include
     X['molecule_atom_index_1_dist_min_diff'] = X[
         'molecule_atom_index_1_dist_min'] - X[
             'distance']  # new variable - include
     le = LabelEncoder()
     for feat in ['atom_x', 'atom_y']:
         le.fit(X[feat])
         X[feat] = le.transform(X[feat])
     X = reduce_mem_usage(X, verbose=False)
     return X
Example 6
    def transform(self, X):
        atom_rad = [self.atomic_radius[x] for x in X['atom'].values]
        X['rad'] = atom_rad
        position = X[['x', 'y', 'z']].values
        p_temp = position
        molec_name = X['molecule_name'].values
        m_temp = molec_name
        radius = X['rad'].values
        r_temp = radius
        bond = 0
        dist_keep = 0
        dist_bond = 0
        no_bond = 0
        dist_no_bond = 0
        dist_matrix = np.zeros((X.shape[0], 2 * 29))
        dist_matrix_bond = np.zeros((X.shape[0], 2 * 29))
        dist_matrix_no_bond = np.zeros((X.shape[0], 2 * 29))

        for i in range(29):  # 29 = maximum number of atoms per molecule in this dataset
            p_temp = np.roll(p_temp, -1, axis=0)
            m_temp = np.roll(m_temp, -1, axis=0)
            r_temp = np.roll(r_temp, -1, axis=0)
            mask = (m_temp == molec_name)
            dist = np.linalg.norm(position - p_temp, axis=1) * mask
            dist_temp = np.roll(np.linalg.norm(position - p_temp, axis=1) *
                                mask,
                                i + 1,
                                axis=0)
            diff_radius_dist = (dist -
                                (radius + r_temp)) * (dist <
                                                      (radius + r_temp)) * mask
            diff_radius_dist_temp = np.roll(diff_radius_dist, i + 1, axis=0)
            bond += (dist < (radius + r_temp)) * mask
            bond_temp = np.roll((dist < (radius + r_temp)) * mask,
                                i + 1,
                                axis=0)
            no_bond += (dist >= (radius + r_temp)) * mask
            no_bond_temp = np.roll((dist >= (radius + r_temp)) * mask,
                                   i + 1,
                                   axis=0)
            bond += bond_temp
            no_bond += no_bond_temp
            dist_keep += dist * mask
            dist_matrix[:, 2 * i] = dist
            dist_matrix[:, 2 * i + 1] = dist_temp
            dist_matrix_bond[:, 2 * i] = dist * (dist <
                                                 (radius + r_temp)) * mask
            dist_matrix_bond[:, 2 * i + 1] = dist_temp * bond_temp
            dist_matrix_no_bond[:, 2 * i] = dist * (dist >
                                                    (radius + r_temp)) * mask
            dist_matrix_no_bond[:, 2 * i + 1] = dist_temp * no_bond_temp
        X['n_bonds'] = bond
        X['n_no_bonds'] = no_bond
        X['dist_mean'] = np.nanmean(np.where(dist_matrix == 0, np.nan,
                                             dist_matrix),
                                    axis=1)
        X['dist_median'] = np.nanmedian(np.where(dist_matrix == 0, np.nan,
                                                 dist_matrix),
                                        axis=1)
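        # Note: wherever dist_matrix_bond / dist_matrix_no_bond are non-zero they
        # coincide with dist_matrix, so masking on them while reading values from
        # dist_matrix yields statistics over bonded / non-bonded distances only.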
        X['dist_std_bond'] = np.nanstd(np.where(dist_matrix_bond == 0, np.nan,
                                                dist_matrix),
                                       axis=1)
        X['dist_mean_bond'] = np.nanmean(np.where(dist_matrix_bond == 0,
                                                  np.nan, dist_matrix),
                                         axis=1)
        X['dist_median_bond'] = np.nanmedian(np.where(dist_matrix_bond == 0,
                                                      np.nan, dist_matrix),
                                             axis=1)
        X['dist_mean_no_bond'] = np.nanmean(np.where(dist_matrix_no_bond == 0,
                                                     np.nan, dist_matrix),
                                            axis=1)
        X['dist_std_no_bond'] = np.nanstd(np.where(dist_matrix_no_bond == 0,
                                                   np.nan, dist_matrix),
                                          axis=1)
        X['dist_median_no_bond'] = np.nanmedian(np.where(
            dist_matrix_no_bond == 0, np.nan, dist_matrix),
                                                axis=1)
        X['dist_std'] = np.nanstd(np.where(dist_matrix == 0, np.nan,
                                           dist_matrix),
                                  axis=1)
        X['dist_min'] = np.nanmin(np.where(dist_matrix == 0, np.nan,
                                           dist_matrix),
                                  axis=1)
        X['dist_max'] = np.nanmax(np.where(dist_matrix == 0, np.nan,
                                           dist_matrix),
                                  axis=1)
        X['range_dist'] = np.absolute(X['dist_max'] - X['dist_min'])
        X['dist_bond_min'] = np.nanmin(np.where(dist_matrix_bond == 0, np.nan,
                                                dist_matrix),
                                       axis=1)
        X['dist_bond_max'] = np.nanmax(np.where(dist_matrix_bond == 0, np.nan,
                                                dist_matrix),
                                       axis=1)
        X['range_dist_bond'] = np.absolute(X['dist_bond_max'] -
                                           X['dist_bond_min'])
        X['dist_no_bond_min'] = np.nanmin(np.where(dist_matrix_no_bond == 0,
                                                   np.nan, dist_matrix),
                                          axis=1)
        X['dist_no_bond_max'] = np.nanmax(np.where(dist_matrix_no_bond == 0,
                                                   np.nan, dist_matrix),
                                          axis=1)
        X['range_dist_no_bond'] = np.absolute(X['dist_no_bond_max'] -
                                              X['dist_no_bond_min'])
        X['n_diff'] = pd.DataFrame(np.around(dist_matrix_bond,
                                             5)).nunique(axis=1).values  # 5
        X = reduce_mem_usage(X, verbose=False)
        return X
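
The transform methods in Examples 5 and 6 appear to belong to scikit-learn-style transformers: Example 7 builds them with make_pipeline as MakeMoreFeatures() and MoreStructureProperties(atomic_radius, electronegativity). A minimal class skeleton consistent with that usage (bodies elided; the real classes may differ) would be:

from sklearn.base import BaseEstimator, TransformerMixin

class MoreStructureProperties(BaseEstimator, TransformerMixin):
    # Hypothetical wrapper matching MoreStructureProperties(atomic_radius, electronegativity) in Example 7.
    def __init__(self, atomic_radius, electronegativity):
        self.atomic_radius = atomic_radius
        self.electronegativity = electronegativity

    def fit(self, X, y=None):
        # Stateless feature construction: nothing to learn.
        return self

    def transform(self, X):
        # Body as in Example 6 (bond counting from atomic radii, distance matrices, ...).
        ...

class MakeMoreFeatures(BaseEstimator, TransformerMixin):
    # Hypothetical wrapper matching MakeMoreFeatures() in Example 7 (no constructor arguments).
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Body as in Example 5 (pair-level distance aggregates and label encoding).
        ...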
Example 7
cat_features = ['type', 'atom_x', 'atom_y']
atomic_radius = {'H': 0.43, 'C': 0.82, 'N': 0.8, 'O': 0.78, 'F': 0.76}
electronegativity = {'H': 2.2, 'C': 2.55, 'N': 3.04, 'O': 3.44, 'F': 3.98}

t0 = time()
pipeline_model1 = make_pipeline(
    MoreStructureProperties(atomic_radius, electronegativity))
pipeline_model2 = make_pipeline(MakeMoreFeatures())
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
struct = pd.read_csv('../input/structures.csv')
structures_yukawa = pd.read_csv('../input/structures_yukawa.csv')
struct = pd.concat([struct, structures_yukawa], axis=1)
del structures_yukawa
struct = reduce_mem_usage(struct, verbose=False)
gc.collect()
train = get_features(train, struct.copy())
test = get_features(test, struct.copy())
y = train['scalar_coupling_constant']
del struct
gc.collect()

struct = pd.read_csv('../input/structures.csv')
struct = pipeline_model1.fit_transform(struct)
train = feat_from_structures(train, struct)
train = pipeline_model2.fit_transform(
    train.drop(['scalar_coupling_constant'], axis=1),
    train['scalar_coupling_constant'])
test = feat_from_structures(test, struct)
test = pipeline_model2.transform(test)