def atom_pair_fp(self):
    """Build a (fingerprint total, label) table from the CSV at ``self.csv_path``.

    The CSV must provide a ``Smiles`` column and a ``Label`` column. Each
    SMILES string is parsed with RDKit and reduced to one number: the total
    value (``GetTotalVal()``) of its atom-pair count fingerprint. The total
    is used because the full bit vector would be huge.

    Rows whose SMILES cannot be parsed or fingerprinted are dropped, labels
    are integer-encoded with ``LabelEncoder``, and rows that share the same
    fingerprint total are collapsed to their first occurrence.

    Returns:
        numpy.ndarray: ``float32`` array with one row per distinct
        fingerprint total. The last column is the encoded label, the
        preceding column is the fingerprint total. Rows are in the order
        produced by ``np.unique`` (sorted by fingerprint total).
    """
    df = pd.read_csv(self.csv_path)

    totals = []  # fingerprint totals of the rows that could be processed
    not_found = []  # positional row indices that failed

    for row, smiles in enumerate(tqdm(df['Smiles'].tolist())):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # RDKit reports an unparsable SMILES by returning None.
                not_found.append(row)
                continue
            fp = Pairs.GetAtomPairFingerprintAsIntVect(mol)
            totals.append(fp.GetTotalVal())
        except Exception:
            # Best effort: skip anything RDKit rejects (e.g. a non-string
            # cell), while still letting KeyboardInterrupt/SystemExit
            # propagate.
            not_found.append(row)

    # read_csv yields a default integer index, so positions double as labels.
    df.drop(not_found, axis=0, inplace=True)

    print('Number of FPs not found: {}'.format(len(not_found)))

    df.reset_index(drop=True, inplace=True)
    labelencoder = LabelEncoder()
    Y = labelencoder.fit_transform(df['Label'].values)
    Y = Y.reshape(Y.shape[0], 1)

    print('Output shape: {}'.format(Y.shape))

    # One scalar per molecule -> a single feature column.
    X = np.asarray(totals, dtype=np.float32).reshape(-1, 1)
    print('Typeof X', type(X))
    print(X)
    print('Input shape: {}'.format(X.shape))

    final_array = np.concatenate((X, Y), axis=1)

    # Drop rows with duplicate fingerprints. Duplicates are judged on the
    # feature columns only; the trailing label column is ignored.
    features = final_array[:, :-1]
    _, unq_row_indices = np.unique(features, return_index=True, axis=0)
    final_array_unique = final_array[unq_row_indices]

    print('Number of Duplicate FPs: {}'.format(
        final_array.shape[0] - final_array_unique.shape[0]))

    print('Final Numpy array shape: {}'.format(final_array_unique.shape))
    print('Type of final array: {}'.format(type(final_array_unique)))

    return np.asarray(final_array_unique, dtype=np.float32)
Ejemplo n.º 2
0
def atom(SMILES):
    """Compute atom-pair count fingerprints for a collection of SMILES.

    All SMILES strings are parsed first; the fingerprints are generated in
    a second pass, so the result follows the input order.

    Args:
        SMILES: Iterable of SMILES strings.

    Returns:
        list: One ``Pairs.GetAtomPairFingerprintAsIntVect`` result per
        input string.
    """
    molecules = []
    for smiles in SMILES:
        molecules.append(Chem.MolFromSmiles(smiles))

    fingerprints = []
    for molecule in molecules:
        fingerprints.append(Pairs.GetAtomPairFingerprintAsIntVect(molecule))
    return fingerprints
Ejemplo n.º 3
0
def BuildAtomPairFP(mol):
    """Return the atom-pair count fingerprint of ``mol`` with its total cached.

    Args:
        mol: Molecule handed straight to
            ``Pairs.GetAtomPairFingerprintAsIntVect`` (presumably an RDKit
            ``Mol`` -- confirm against callers).

    Returns:
        The fingerprint object, with the value of ``GetTotalVal()`` stored
        on it as ``_sumCache``.
    """
    # Function-scope import: RDKit is only needed once this is called.
    from rdkit.Chem.AtomPairs import Pairs

    fingerprint = Pairs.GetAtomPairFingerprintAsIntVect(mol)
    total = fingerprint.GetTotalVal()
    # NOTE(review): _sumCache looks like a cache consumed by downstream
    # code; confirm where it is read.
    fingerprint._sumCache = total
    return fingerprint
 def Calc_AtomPairs_Int(self):
     """Return atom-pair count fingerprints for every entry of ``self.sd``.

     NOTE(review): the original author tagged this method "type error";
     presumably some entries of ``self.sd`` are rejected by RDKit --
     confirm before relying on it.
     """
     fingerprints = []
     for mol in self.sd:
         fingerprints.append(Pairs.GetAtomPairFingerprintAsIntVect(mol))
     return fingerprints