def make_adjm(self):
    '''Prepare a weighted adjacency matrix from the coordinate info in the
    structures csv file, for each molecule.

    Returns:
        Nothing; a dictionary mapping each molecule name to its 2D
        weighted adjacency matrix (shape (self.N, self.N)) is pickled to
        the temporary folder and self.dict_adjM is cleared to free memory.
    '''
    self.dict_adjM = dict()
    grp_dfstructs = self.dfXstructs.groupby('molecule_name')
    i_key = 0
    len_key = len(grp_dfstructs)
    # groupby iteration already yields (key, group); the original extra
    # get_group(key) call was redundant.
    for key, dfstructs1 in grp_dfstructs:
        dfstructs1 = dfstructs1.set_index('atom_index')
        # Pad with far_dist so atom slots absent from this molecule look
        # maximally distant.
        adjM_temp = np.full((self.N, self.N), self.far_dist)
        atom_ids = dfstructs1.index.values
        # Positional columns 2:5 are the x, y, z coordinates once
        # 'atom_index' has moved into the index (assumes the structures
        # csv column order molecule_name, atom, x, y, z — TODO confirm).
        # Extracting the array once avoids a scalar .iloc lookup per pair.
        coords = dfstructs1.iloc[:, 2:5].values
        for an1 in atom_ids:
            for an2 in atom_ids:
                if an1 != an2:
                    # Euclidean distance between the two atoms.
                    delta = coords[an1] - coords[an2]
                    dist = np.sqrt(delta[0]**2 + delta[1]**2 + delta[2]**2)
                    if dist > 0:
                        # Edge weight decays with distance via the
                        # inverse power self.p.
                        adjM_temp[an1, an2] = 1 / (dist**self.p)
                    else:
                        print('distance is 1/zero but atoms are different',
                              an1, an2)
                else:
                    # Self-loop weight.
                    adjM_temp[an1, an2] = self.self_dist
        self.dict_adjM[key] = adjM_temp
        i_key += 1
        if not (i_key % 5000):
            print(i_key, '/', len_key, ' molecules are processed')
    cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_dict_adjM.pkl'),
               self.dict_adjM)
    self.dict_adjMfn = 'tmp_dict_adjM.pkl'
    self.dict_adjM = {}
def make_X(self, dfTrain1):
    '''Build per-molecule X feature matrices from the dataset.

    Inputs:
        dfTrain1: Train dataframe whose atom index and type columns have
            been combined into the 'ai1_type' tuple column.

    Returns:
        Nothing; the prepared feature dictionaries are pickled into the
        temporary folder and the in-memory copies are cleared.
    '''
    type_col = 'ai1_type'
    node_col = 'atom_index_0'
    # One-hot encode the coupling type per (molecule, node) row.
    onehot = pd.get_dummies(
        dfTrain1[['molecule_name', node_col, type_col]],
        columns=[type_col])
    print('CREATING THE FEATURE MATRIX')
    if self.dfXtest is None:
        grouped = onehot.groupby(['molecule_name', node_col]).sum()
        self.X_dict = self.fn_matrix(grouped, self.F, self.N)
        print('FEATURE MATRIX DONE', 'dict length is:',
              len(self.X_dict.keys()), 'dict item shape is:',
              list(self.X_dict.values())[0].shape)
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_X_dict.pkl'),
                   self.X_dict)
        self.X_dictfn = 'tmp_X_dict.pkl'
        self.X_dict = {}
    else:
        # Train and test halves live under separate outer index levels.
        grouped = onehot.xs('train').groupby(
            ['molecule_name', node_col]).sum()
        self.X_dict_trn = self.fn_matrix(grouped, self.F, self.N)
        print('FEATURE MATRIX DONE FOR TRAIN, DOING TEST NOW',
              'dict length is:', len(self.X_dict_trn.keys()),
              'dict item shape is:',
              list(self.X_dict_trn.values())[0].shape)
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_X_dict_trn.pkl'),
                   self.X_dict_trn)
        self.X_dict_trnfn = 'tmp_X_dict_trn.pkl'
        self.X_dict_trn = {}
        grouped = onehot.xs('test').groupby(
            ['molecule_name', node_col]).sum()
        self.X_dict_tst = self.fn_matrix(grouped, self.F, self.N)
        print('FEATURE MATRIX DONE FOR TEST', 'dict length is:',
              len(self.X_dict_tst.keys()), 'dict item shape is:',
              list(self.X_dict_tst.values())[0].shape)
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_X_dict_tst.pkl'),
                   self.X_dict_tst)
        self.X_dict_tstfn = 'tmp_X_dict_tst.pkl'
        self.X_dict_tst = {}
def scc_test_slice_save(all_fn, save_fn, tst_end=None, tst_start=0,
                        isshuffle=True):
    '''Slices only the test dataset from the full dataset.

    Inputs:
        all_fn: pickle file that has full data to be sliced
        save_fn: a prefix to save output files
        tst_end: test set end index (None means "through the end")
        tst_start: test set start index
        isshuffle: enable to shuffle full dataset before slicing

    Returns:
        Nothing; sliced test data (features, adjacency matrices, ids,
        scalar couplings) are written to four pickle files for input to
        the graph model.
    '''
    test_all = cm.pklload(all_fn)
    keyslst = list(test_all.keys())
    if isshuffle:
        random.shuffle(keyslst)
    tst_X_fm = []
    tst_X_adm = []
    tst_Y_scc = []
    tst_Y_id = []
    if tst_end is None:
        keyslst_tst = keyslst[tst_start:]
        # BUGFIX: the end index is start + slice length. The original set
        # tst_end = len(keyslst_tst), so the "tst_end - tst_start" count
        # in the filenames subtracted tst_start twice and was wrong
        # whenever tst_start > 0.
        tst_end = tst_start + len(keyslst_tst)
    else:
        keyslst_tst = keyslst[tst_start:tst_end]
    # Each entry holds (feature matrix, adjacency, scc labels, ids).
    for key in keyslst_tst:
        tst_X_fm.append(test_all[key][0])
        tst_X_adm.append(test_all[key][1])
        tst_Y_scc.append(test_all[key][2])
        tst_Y_id.append(test_all[key][3])
    n_tst = str(tst_end - tst_start)
    cm.pklsave(save_fn + '_X_' + n_tst + '_test.pkl', tst_X_fm)
    cm.pklsave(save_fn + '_Xadjm_' + n_tst + '_test.pkl', tst_X_adm)
    cm.pklsave(save_fn + '_Yid_' + n_tst + '_test.pkl', tst_Y_id)
    cm.pklsave(save_fn + '_Yscc_' + n_tst + '_test.pkl', tst_Y_scc)
def scc_comb_save(self):
    '''Persist the combined train/test data dictionaries as pickle files.

    Returns:
        The filename prefix used for the saved pickle(s).
    '''
    # Fall back to a default prefix when the caller did not set one.
    if self.comb_fn is None:
        self.comb_fn = 'default_comb_data'
    if self.dfXtest is None:
        cm.pklsave(self.comb_fn + '_raw.pkl', self.comb_dict)
    else:
        cm.pklsave(self.comb_fn + '_trn_raw.pkl', self.comb_dict_trn)
        cm.pklsave(self.comb_fn + '_tst_raw.pkl', self.comb_dict_tst)
    return self.comb_fn
def scc_trnval_slice_save(all_fn, save_fn, trn_end, val_end, tst_end,
                          trn_start=0, val_start=None, tst_start=None,
                          isshuffle=True):
    '''Slices the train, val and test data from a preprocessed dataset.

    Inputs:
        all_fn: pickle file that has full data to be sliced
        save_fn: a prefix to save output files
        trn_end: train set end index
        val_end: validation set end index
        tst_end: test set end index
        trn_start: train set start index
        val_start: validation set start index (None skips the split)
        tst_start: test set start index (None skips the split)
        isshuffle: enable to shuffle full dataset before slicing

    Returns:
        Nothing; sliced train, validation and test data are written as
        separate pickle files for input to the graph model.
    '''
    def _slice_save(start, end, tag):
        # One split: gather the four per-key arrays (feature matrix,
        # adjacency, scc labels, ids) and pickle each list.
        X_fm, X_adm, Y_scc, Y_id = [], [], [], []
        for key in keyslst[start:end]:
            X_fm.append(train_all[key][0])
            X_adm.append(train_all[key][1])
            Y_scc.append(train_all[key][2])
            Y_id.append(train_all[key][3])
        n = str(end - start)
        cm.pklsave(save_fn + '_X_' + n + '_' + tag + '.pkl', X_fm)
        cm.pklsave(save_fn + '_Xadjm_' + n + '_' + tag + '.pkl', X_adm)
        cm.pklsave(save_fn + '_Y_' + n + '_' + tag + '.pkl', Y_scc)
        cm.pklsave(save_fn + '_Yid_' + n + '_' + tag + '.pkl', Y_id)

    train_all = cm.pklload(all_fn)
    keyslst = list(train_all.keys())
    if isshuffle:
        random.shuffle(keyslst)
    # Split suffixes match the original file naming exactly
    # ('_trn', '_val', '_test').
    if trn_start is not None:
        _slice_save(trn_start, trn_end, 'trn')
    if val_start is not None:
        _slice_save(val_start, val_end, 'val')
    if tst_start is not None:
        _slice_save(tst_start, tst_end, 'test')
def make_YID(self, dfTrain1):
    '''Creates the Y label id vector from the dataset.

    Inputs:
        dfTrain1: Train dataframe whose atom index and type columns have
            been combined into the 'ai1_type' tuple column; must contain
            'id' and 'molecule_name' columns.

    Returns:
        Nothing; the prepared id-vector dictionaries are pickled into the
        temporary folder and the in-memory copies are cleared.
    '''
    atom_ind_type = 'ai1_type'
    atom_ind_node = 'atom_index_0'
    print('CREATING ID VECTOR')
    # One-hot encode the coupling type; resulting columns are
    # [id, molecule_name, atom_index_0, <one dummy column per type>].
    dfTrain2 = pd.get_dummies(
        dfTrain1[['id', 'molecule_name', atom_ind_node, atom_ind_type]],
        columns=[atom_ind_type])
    # Weight each type dummy column (positions 3:) by the row's 'id'
    # (position 0) so the summed matrix carries the coupling row ids.
    dfTrain2.iloc[:, 3:] = dfTrain2.iloc[:, 3:].multiply(dfTrain2.iloc[:, 0],
                                                         axis='index')
    if self.dfXtest is None:
        # np.r_[1, 2, 3:shape[1]] keeps molecule_name, atom_index_0 and
        # all dummy columns while dropping the raw 'id' column (pos 0).
        dfTrain3 = dfTrain2.iloc[:, np.r_[1, 2, 3:dfTrain2.shape[1]]].groupby(
            ['molecule_name', atom_ind_node]).sum()
        self.YID_dict = self.fn_matrix(dfTrain3, self.F, self.N,
                                       isflatten=True)
        print('ID VECTOR DONE', 'dict length is:',
              len(self.YID_dict.keys()), 'dict item shape is:',
              list(self.YID_dict.values())[0].shape)
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_YID_dict.pkl'),
                   self.YID_dict)
        self.YID_dictfn = 'tmp_YID_dict.pkl'
        self.YID_dict = {}
    else:
        print('CREATING ID VECTOR FOR TRAIN')
        # Train rows live under the 'train' outer index level
        # (presumably set when train/test were concatenated — verify
        # against the caller).
        dfTrain3 = dfTrain2.xs('train').iloc[:, np.r_[
            1, 2, 3:dfTrain2.shape[1]]].groupby(['molecule_name',
                                                 atom_ind_node]).sum()
        self.YID_dict_trn = self.fn_matrix(
            dfTrain3, self.F, self.N,
            isflatten=True)  # ids as flat vector, keys are molecule names
        print('ID VECTOR DONE', 'dict length is:',
              len(self.YID_dict_trn.keys()), 'dict item shape is:',
              list(self.YID_dict_trn.values())[0].shape)
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_YID_dict_trn.pkl'),
                   self.YID_dict_trn)
        self.YID_dict_trnfn = 'tmp_YID_dict_trn.pkl'
        self.YID_dict_trn = {}
        print('CREATING ID VECTOR FOR TEST')
        dfTrain3 = dfTrain2.xs('test').iloc[:, np.r_[
            1, 2, 3:dfTrain2.shape[1]]].groupby(['molecule_name',
                                                 atom_ind_node]).sum()
        self.YID_dict_tst = self.fn_matrix(dfTrain3, self.F, self.N,
                                           isflatten=True)
        print('ID VECTOR DONE', 'dict length is:',
              len(self.YID_dict_tst.keys()), 'dict item shape is:',
              list(self.YID_dict_tst.values())[0].shape)
        cm.pklsave(os.path.join(self.tmpsavepath,
                                'tmp_YID_dict_tst.pkl'), self.YID_dict_tst)
        self.YID_dict_tstfn = 'tmp_YID_dict_tst.pkl'
        self.YID_dict_tst = {}
def make_Y(self, dfTrain1):
    '''Creates the Y label vectors from the dataset.

    Inputs:
        dfTrain1: Train dataframe whose atom index and type columns have
            been combined into the 'ai1_type' tuple column; must contain
            'id', 'molecule_name' and 'scalar_coupling_constant' columns.

    Returns:
        Nothing; the prepared label dictionaries are pickled into the
        temporary folder, self.cls_num is set to the flattened label
        shape, and the in-memory copies are cleared.
    '''
    atom_ind_type = 'ai1_type'
    atom_ind_node = 'atom_index_0'
    print('CREATING SCALAR COUPLING VALUE Y VALUE VECTOR')
    # One-hot encode the coupling type; resulting columns are
    # [id, molecule_name, atom_index_0, scalar_coupling_constant,
    #  <one dummy column per type>].
    dfTrain2 = pd.get_dummies(dfTrain1[[
        'id', 'molecule_name', atom_ind_node, 'scalar_coupling_constant',
        atom_ind_type
    ]], columns=[atom_ind_type])
    # Weight each type dummy column (positions 4:) by the row's scalar
    # coupling constant (position 3), placing the label value in the
    # one-hot slot of its type.
    dfTrain2.iloc[:, 4:] = dfTrain2.iloc[:, 4:].multiply(dfTrain2.iloc[:, 3],
                                                         axis='index')
    if self.dfXtest is None:
        # np.r_[1, 2, 4:shape[1]] keeps molecule_name, atom_index_0 and
        # all dummy columns, dropping raw 'id' and the unscaled constant.
        dfTrain3 = dfTrain2.iloc[:, np.r_[1, 2, 4:dfTrain2.shape[1]]].groupby(
            ['molecule_name', atom_ind_node]).sum()
        self.Y_dict = self.fn_matrix(dfTrain3, self.F, self.N,
                                     isflatten=True)
        print('SCALAR COUPLING CONSTANT Y VECTOR DONE', 'dict length is:',
              len(self.Y_dict.keys()), 'dict item shape is (cls_num):',
              list(self.Y_dict.values())[0].shape)
        # Flattened label shape doubles as the class/output count.
        self.cls_num = list(self.Y_dict.values())[0].shape
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_Y_dict.pkl'),
                   self.Y_dict)
        self.Y_dictfn = 'tmp_Y_dict.pkl'
        self.Y_dict = {}
    else:
        # Train rows live under the 'train' outer index level
        # (presumably set when train/test were concatenated — verify
        # against the caller).
        dfTrain3 = dfTrain2.xs('train').iloc[:, np.r_[
            1, 2, 4:dfTrain2.shape[1]]].groupby(['molecule_name',
                                                 atom_ind_node]).sum()
        self.Y_dict_trn = self.fn_matrix(dfTrain3, self.F, self.N,
                                         isflatten=True)
        print('SCALAR COUPLING CONSTANT Y VECTOR DONE FOR TRAIN',
              'dict length is:', len(self.Y_dict_trn.keys()),
              'dict item shape is (cls_num):',
              list(self.Y_dict_trn.values())[0].shape)
        self.cls_num = list(self.Y_dict_trn.values())[0].shape
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_Y_dict_trn.pkl'),
                   self.Y_dict_trn)
        self.Y_dict_trnfn = 'tmp_Y_dict_trn.pkl'
        self.Y_dict_trn = {}
        # Test rows have no real labels; these are placeholder vectors so
        # the downstream pipeline has a uniform shape.
        print('DUMMMY LABELS FOR TEST DATA ')
        dfTrain3 = dfTrain2.xs('test').iloc[:, np.r_[
            1, 2, 4:dfTrain2.shape[1]]].groupby(['molecule_name',
                                                 atom_ind_node]).sum()
        self.Y_dict_tst = self.fn_matrix(dfTrain3, self.F, self.N,
                                         isflatten=True)
        print('DUMMY SCALAR COUPLING CONSTANT Y VECTOR DONE FOR TEST',
              'dict length is:', len(self.Y_dict_tst.keys()),
              'dict item shape is (cls_num):',
              list(self.Y_dict_tst.values())[0].shape)
        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_Y_dict_tst.pkl'),
                   self.Y_dict_tst)
        self.Y_dict_tstfn = 'tmp_Y_dict_tst.pkl'
        self.Y_dict_tst = {}