def make_adjm(self):
        '''
        Prepare a weighted adjacency matrix from the coordinate info in the
        structures csv file for each molecule.

        Off-diagonal weights are 1 / dist**self.p for distinct atoms,
        self.self_dist on the diagonal, and self.far_dist for atom slots not
        present in the molecule.

        Returns:
        None. A dictionary of 2D weighted adjacency matrices (one per
        molecule) is pickled into the temporary folder; the in-memory dict
        is cleared afterwards to limit memory use.
        '''

        self.dict_adjM = dict()
        grp_dfstructs = self.dfXstructs.groupby('molecule_name')
        i_key = 0
        len_key = len(grp_dfstructs)
        # The group item yielded by the iterator IS the per-molecule frame;
        # the original re-fetched it with get_group(key), which is redundant.
        for key, dfstructs1 in grp_dfstructs:
            dfstructs1 = dfstructs1.set_index('atom_index')
            # Hoist the x/y/z coordinates (positional columns 2:5 after
            # set_index) into a plain ndarray: scalar .iloc access inside an
            # O(n^2) loop is very slow.
            coords = dfstructs1.iloc[:, 2:5].to_numpy()
            # NOTE(review): atom indices are used both as index labels and as
            # positional offsets, so atom_index is assumed to be 0..n-1 per
            # molecule -- TODO confirm against the structures file.
            atom_ids = dfstructs1.index.values
            adjM_temp = np.full((self.N, self.N), self.far_dist)
            for i, an1 in enumerate(atom_ids):
                adjM_temp[an1, an1] = self.self_dist
                # Distance is symmetric: compute each unordered pair once and
                # write it to both (an1, an2) and (an2, an1).
                for an2 in atom_ids[i + 1:]:
                    dX = coords[an1, 0] - coords[an2, 0]
                    dY = coords[an1, 1] - coords[an2, 1]
                    dZ = coords[an1, 2] - coords[an2, 2]
                    dist = np.sqrt(dX**2 + dY**2 + dZ**2)
                    if dist > 0:
                        weight = 1 / (dist**self.p)
                        adjM_temp[an1, an2] = weight
                        adjM_temp[an2, an1] = weight
                    else:
                        # Two distinct atoms at the same position: cannot
                        # take 1/dist, leave far_dist and warn.
                        print('distance is 1/zero but atoms are different',
                              an1, an2)
            self.dict_adjM[key] = adjM_temp
            i_key += 1
            if not (i_key % 5000):
                print(i_key, '/', len_key, ' molecules are processed')

        cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_dict_adjM.pkl'),
                   self.dict_adjM)

        # Keep only the file name; drop the dict itself to free memory.
        self.dict_adjMfn = 'tmp_dict_adjM.pkl'
        self.dict_adjM = {}
    def make_X(self, dfTrain1):
        '''
        Creates X feauture matrices from dataset.

        Inputs:
        dfTrain1: Train dataframe with atomic_index_ and type are made tuple.

        Returns:
        Saves prepared data in a temporary folder.

        '''

        atom_ind_type = 'ai1_type'
        atom_ind_node = 'atom_index_0'

        dfTrain2 = pd.get_dummies(
            dfTrain1[['molecule_name', atom_ind_node, atom_ind_type]],
            columns=[atom_ind_type])

        print('CREATING THE FEATURE MATRIX')
        if self.dfXtest is None:
            dfTrain2 = dfTrain2.groupby(['molecule_name', atom_ind_node]).sum()
            self.X_dict = self.fn_matrix(dfTrain2, self.F, self.N)
            print('FEATURE MATRIX DONE', 'dict length is:',
                  len(self.X_dict.keys()), 'dict item shape is:',
                  list(self.X_dict.values())[0].shape)

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_X_dict.pkl'),
                       self.X_dict)

            self.X_dictfn = 'tmp_X_dict.pkl'
            self.X_dict = {}
        else:
            dfTrain3 = dfTrain2.xs('train').groupby(
                ['molecule_name', atom_ind_node]).sum()
            self.X_dict_trn = self.fn_matrix(dfTrain3, self.F, self.N)
            print('FEATURE MATRIX DONE FOR TRAIN, DOING TEST NOW',
                  'dict length is:', len(self.X_dict_trn.keys()),
                  'dict item shape is:',
                  list(self.X_dict_trn.values())[0].shape)

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_X_dict_trn.pkl'),
                       self.X_dict_trn)

            self.X_dict_trnfn = 'tmp_X_dict_trn.pkl'
            self.X_dict_trn = {}

            dfTrain3 = dfTrain2.xs('test').groupby(
                ['molecule_name', atom_ind_node]).sum()
            self.X_dict_tst = self.fn_matrix(dfTrain3, self.F, self.N)

            print('FEATURE MATRIX DONE FOR TEST', 'dict length is:',
                  len(self.X_dict_tst.keys()), 'dict item shape is:',
                  list(self.X_dict_tst.values())[0].shape)

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_X_dict_tst.pkl'),
                       self.X_dict_tst)

            self.X_dict_tstfn = 'tmp_X_dict_tst.pkl'
            self.X_dict_tst = {}
def scc_test_slice_save(all_fn,
                        save_fn,
                        tst_end=None,
                        tst_start=0,
                        isshuffle=True):
    '''
    Slices only the test dataset from a full preprocessed dataset.

    Inputs:
    all_fn: pickle file that has full data to be sliced
    save_fn: a prefix to save output files
    tst_end: test set end index (None means "to the end of the data")
    tst_start: test set start index
    isshuffle: enable to shuffle full dataset before slicing

    Returns:
    None. The sliced test data is written as four pickle files (features,
    adjacency matrices, ids, scalar couplings) whose names embed the
    number of sliced molecules.
    '''
    test_all = cm.pklload(all_fn)
    keyslst = list(test_all.keys())
    if isshuffle:
        random.shuffle(keyslst)

    if tst_end is None:
        keyslst_tst = keyslst[tst_start:]
    else:
        keyslst_tst = keyslst[tst_start:tst_end]

    # BUG FIX: the original set tst_end = len(keyslst_tst) in the None case
    # and then embedded (tst_end - tst_start) in the file names, which
    # subtracts tst_start twice and mislabels the files for any nonzero
    # tst_start. Use the actual slice length instead.
    n_tst = len(keyslst_tst)

    tst_X_fm = []
    tst_X_adm = []
    tst_Y_scc = []
    tst_Y_id = []
    for key in keyslst_tst:
        # Each entry is (features, adjacency, scalar couplings, ids).
        entry = test_all[key]
        tst_X_fm.append(entry[0])
        tst_X_adm.append(entry[1])
        tst_Y_scc.append(entry[2])
        tst_Y_id.append(entry[3])

    cm.pklsave(save_fn + '_X_' + str(n_tst) + '_test.pkl', tst_X_fm)
    cm.pklsave(save_fn + '_Xadjm_' + str(n_tst) + '_test.pkl', tst_X_adm)
    cm.pklsave(save_fn + '_Yid_' + str(n_tst) + '_test.pkl', tst_Y_id)
    cm.pklsave(save_fn + '_Yscc_' + str(n_tst) + '_test.pkl', tst_Y_scc)
 def scc_comb_save(self):
     '''
     Saves the combined train and test data in pickle files.

     Returns:
     The (possibly defaulted) file-name prefix used for saving.
     '''
     # Fall back to a default prefix when none was configured.
     if self.comb_fn is None:
         self.comb_fn = 'default_comb_data'
     if self.dfXtest is None:
         cm.pklsave(self.comb_fn + '_raw.pkl', self.comb_dict)
     else:
         cm.pklsave(self.comb_fn + '_trn_raw.pkl', self.comb_dict_trn)
         cm.pklsave(self.comb_fn + '_tst_raw.pkl', self.comb_dict_tst)
     return self.comb_fn
def _scc_slice_save(data_all, keys, start, end, save_fn, suffix):
    '''
    Gather the four per-molecule arrays for keys[start:end] and pickle them.

    Inputs:
    data_all: dict mapping molecule key to
              (features, adjacency, scalar couplings, ids)
    keys: ordered list of molecule keys to slice from
    start, end: slice bounds into keys
    save_fn: file-name prefix
    suffix: split-specific file-name tail, e.g. '_trn.pkl'
    '''
    X_fm = []
    X_adm = []
    Y_scc = []
    Y_id = []
    for key in keys[start:end]:
        entry = data_all[key]
        X_fm.append(entry[0])
        X_adm.append(entry[1])
        Y_scc.append(entry[2])
        Y_id.append(entry[3])
    # Keep the original naming convention: the embedded count is the
    # requested (end - start), matching the historical file names.
    count = str(end - start)
    cm.pklsave(save_fn + '_X_' + count + suffix, X_fm)
    cm.pklsave(save_fn + '_Xadjm_' + count + suffix, X_adm)
    cm.pklsave(save_fn + '_Y_' + count + suffix, Y_scc)
    cm.pklsave(save_fn + '_Yid_' + count + suffix, Y_id)


def scc_trnval_slice_save(all_fn,
                          save_fn,
                          trn_end,
                          val_end,
                          tst_end,
                          trn_start=0,
                          val_start=None,
                          tst_start=None,
                          isshuffle=True):
    '''
    Slices the train, val and test data from a preprocessed dataset.

    Inputs:
    all_fn: pickle file that has full data to be sliced
    save_fn: a prefix to save output files
    trn_end: train set end index
    val_end: validation set end index
    tst_end: test set end index
    trn_start: train set start index
    val_start: validation set start index
    tst_start: test set start index
    isshuffle: enable to shuffle full dataset before slicing

    Returns:
    None. Sliced train, validation and test data are saved as separate
    pickle files for input to the graph model.
    '''
    train_all = cm.pklload(all_fn)
    keyslst = list(train_all.keys())
    if isshuffle:
        random.shuffle(keyslst)

    # A split is only produced when its start index is given; trn_start
    # defaults to 0, so train is sliced unless the caller passes None.
    # The three splits previously shared ~30 duplicated lines each; the
    # common work now lives in _scc_slice_save.
    if trn_start is not None:
        _scc_slice_save(train_all, keyslst, trn_start, trn_end, save_fn,
                        '_trn.pkl')

    if val_start is not None:
        _scc_slice_save(train_all, keyslst, val_start, val_end, save_fn,
                        '_val.pkl')

    if tst_start is not None:
        _scc_slice_save(train_all, keyslst, tst_start, tst_end, save_fn,
                        '_test.pkl')
    def make_YID(self, dfTrain1):
        '''
        Creates Y label id vector from dataset.

        Per molecule, builds a flattened vector that carries each coupling
        row's ``id`` in the one-hot slot of its (node, type) pair, so
        predictions can later be matched back to submission row ids.

        Inputs:
        dfTrain1: Train dataframe with atomic_index_ and type are made tuple.

        Returns:
        None. Saves prepared data in a temporary folder and clears the
        in-memory dictionaries.
        '''

        atom_ind_type = 'ai1_type'
        atom_ind_node = 'atom_index_0'
        print('CREATING ID  VECTOR')

        # One-hot encode the (atom_index, type) tuple column; resulting
        # columns are [id, molecule_name, atom_index_0, <dummies...>],
        # so positions 0-2 are fixed and 3: are the dummy columns.
        dfTrain2 = pd.get_dummies(
            dfTrain1[['id', 'molecule_name', atom_ind_node, atom_ind_type]],
            columns=[atom_ind_type])

        # Scale every dummy row (columns 3:) by that row's 'id' (column 0)
        # so the one-hot slots carry the coupling id instead of 1.
        dfTrain2.iloc[:, 3:] = dfTrain2.iloc[:, 3:].multiply(dfTrain2.iloc[:,
                                                                           0],
                                                             axis='index')

        if self.dfXtest is None:
            # Drop the raw 'id' column (keep positions 1, 2 and the dummies)
            # and aggregate ids per (molecule, node).
            dfTrain3 = dfTrain2.iloc[:,
                                     np.r_[1, 2, 3:dfTrain2.shape[1]]].groupby(
                                         ['molecule_name',
                                          atom_ind_node]).sum()
            # ids as flat vector, keys are molecule names
            self.YID_dict = self.fn_matrix(dfTrain3,
                                           self.F,
                                           self.N,
                                           isflatten=True)

            print('ID VECTOR DONE', 'dict length is:',
                  len(self.YID_dict.keys()), 'dict item shape is:',
                  list(self.YID_dict.values())[0].shape)

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_YID_dict.pkl'),
                       self.YID_dict)

            # Keep only the file name; free the dict to limit memory use.
            self.YID_dictfn = 'tmp_YID_dict.pkl'
            self.YID_dict = {}

        else:
            print('CREATING ID  VECTOR FOR TRAIN')
            # NOTE(review): xs('train') implies the frame carries a
            # train/test outer index level set upstream -- TODO confirm.
            dfTrain3 = dfTrain2.xs('train').iloc[:, np.r_[
                1, 2,
                3:dfTrain2.shape[1]]].groupby(['molecule_name',
                                               atom_ind_node]).sum()
            self.YID_dict_trn = self.fn_matrix(
                dfTrain3, self.F, self.N,
                isflatten=True)  # ids as flat vector, keys are molecule names
            print('ID VECTOR DONE', 'dict length is:',
                  len(self.YID_dict_trn.keys()), 'dict item shape is:',
                  list(self.YID_dict_trn.values())[0].shape)

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_YID_dict_trn.pkl'),
                       self.YID_dict_trn)

            self.YID_dict_trnfn = 'tmp_YID_dict_trn.pkl'
            self.YID_dict_trn = {}

            print('CREATING ID  VECTOR FOR TEST')
            # Same aggregation for the test partition.
            dfTrain3 = dfTrain2.xs('test').iloc[:, np.r_[
                1, 2,
                3:dfTrain2.shape[1]]].groupby(['molecule_name',
                                               atom_ind_node]).sum()
            self.YID_dict_tst = self.fn_matrix(dfTrain3,
                                               self.F,
                                               self.N,
                                               isflatten=True)
            print('ID VECTOR DONE', 'dict length is:',
                  len(self.YID_dict_tst.keys()), 'dict item shape is:',
                  list(self.YID_dict_tst.values())[0].shape)

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_YID_dict_tst.pkl'),
                       self.YID_dict_tst)

            self.YID_dict_tstfn = 'tmp_YID_dict_tst.pkl'
            self.YID_dict_tst = {}
    def make_Y(self, dfTrain1):
        '''
        Creates Y label vectors from dataset.

        Per molecule, builds a flattened vector that carries each row's
        scalar coupling constant in the one-hot slot of its (node, type)
        pair. Also records the label vector length in ``self.cls_num``.

        Inputs:
        dfTrain1: Train dataframe with atomic_index_ and type are made tuple.

        Returns:
        None. Saves prepared data in a temporary folder and clears the
        in-memory dictionaries.
        '''

        atom_ind_type = 'ai1_type'
        atom_ind_node = 'atom_index_0'
        print('CREATING SCALAR COUPLING VALUE Y VALUE VECTOR')

        # One-hot encode the (atom_index, type) tuple column; resulting
        # columns are [id, molecule_name, atom_index_0,
        # scalar_coupling_constant, <dummies...>], so positions 0-3 are
        # fixed and 4: are the dummy columns.
        dfTrain2 = pd.get_dummies(dfTrain1[[
            'id', 'molecule_name', atom_ind_node, 'scalar_coupling_constant',
            atom_ind_type
        ]],
                                  columns=[atom_ind_type])
        # Scale every dummy row (columns 4:) by that row's scalar coupling
        # constant (column 3) so each one-hot slot carries the label value.
        dfTrain2.iloc[:, 4:] = dfTrain2.iloc[:, 4:].multiply(dfTrain2.iloc[:,
                                                                           3],
                                                             axis='index')

        if self.dfXtest is None:
            # Drop 'id' and the raw coupling column (keep positions 1, 2 and
            # the dummies) and aggregate per (molecule, node).
            dfTrain3 = dfTrain2.iloc[:,
                                     np.r_[1, 2, 4:dfTrain2.shape[1]]].groupby(
                                         ['molecule_name',
                                          atom_ind_node]).sum()
            self.Y_dict = self.fn_matrix(dfTrain3,
                                         self.F,
                                         self.N,
                                         isflatten=True)

            print('SCALAR COUPLING CONSTANT Y VECTOR DONE', 'dict length is:',
                  len(self.Y_dict.keys()), 'dict item shape is (cls_num):',
                  list(self.Y_dict.values())[0].shape)
            # Record the label vector shape for the downstream model.
            self.cls_num = list(self.Y_dict.values())[0].shape

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_Y_dict.pkl'),
                       self.Y_dict)

            # Keep only the file name; free the dict to limit memory use.
            self.Y_dictfn = 'tmp_Y_dict.pkl'
            self.Y_dict = {}

        else:
            # NOTE(review): xs('train') implies the frame carries a
            # train/test outer index level set upstream -- TODO confirm.
            dfTrain3 = dfTrain2.xs('train').iloc[:, np.r_[
                1, 2,
                4:dfTrain2.shape[1]]].groupby(['molecule_name',
                                               atom_ind_node]).sum()
            self.Y_dict_trn = self.fn_matrix(dfTrain3,
                                             self.F,
                                             self.N,
                                             isflatten=True)

            print('SCALAR COUPLING CONSTANT Y VECTOR DONE FOR TRAIN',
                  'dict length is:', len(self.Y_dict_trn.keys()),
                  'dict item shape is (cls_num):',
                  list(self.Y_dict_trn.values())[0].shape)
            # cls_num comes from the train partition only.
            self.cls_num = list(self.Y_dict_trn.values())[0].shape

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_Y_dict_trn.pkl'),
                       self.Y_dict_trn)

            self.Y_dict_trnfn = 'tmp_Y_dict_trn.pkl'
            self.Y_dict_trn = {}

            # Test rows have no real scalar coupling labels; these are
            # placeholder vectors so the pipeline shape matches train.
            print('DUMMMY LABELS FOR TEST DATA ')
            dfTrain3 = dfTrain2.xs('test').iloc[:, np.r_[
                1, 2,
                4:dfTrain2.shape[1]]].groupby(['molecule_name',
                                               atom_ind_node]).sum()
            self.Y_dict_tst = self.fn_matrix(dfTrain3,
                                             self.F,
                                             self.N,
                                             isflatten=True)

            print('DUMMY SCALAR COUPLING CONSTANT Y VECTOR DONE FOR TEST',
                  'dict length is:', len(self.Y_dict_tst.keys()),
                  'dict item shape is (cls_num):',
                  list(self.Y_dict_tst.values())[0].shape)

            cm.pklsave(os.path.join(self.tmpsavepath, 'tmp_Y_dict_tst.pkl'),
                       self.Y_dict_tst)

            self.Y_dict_tstfn = 'tmp_Y_dict_tst.pkl'
            self.Y_dict_tst = {}