def test9ToNumpy(self):
    """Check that ConvertToNumpyArray mirrors bit vectors and sparse int vectors."""
    import numpy

    populated = (0, 1, 17, 23, 31)

    # Dense bit vectors: every bit must equal the matching array slot.
    for vect_cls in (DataStructs.ExplicitBitVect, ):
        bit_vect = vect_cls(32)
        for position in populated:
            bit_vect.SetBit(position)
        target = numpy.zeros((32, ), 'i')
        DataStructs.ConvertToNumpyArray(bit_vect, target)
        for idx in range(bit_vect.GetNumBits()):
            self.assertEqual(bit_vect[idx], target[idx])

    # Sparse integer vectors, one pass per index-type variant.
    sparse_classes = (
        DataStructs.IntSparseIntVect,
        DataStructs.LongSparseIntVect,
        DataStructs.UIntSparseIntVect,
        DataStructs.ULongSparseIntVect,
    )
    for vect_cls in sparse_classes:
        int_vect = vect_cls(32)
        for position in populated:
            int_vect[position] = 1
        target = numpy.zeros((32, ), 'i')
        DataStructs.ConvertToNumpyArray(int_vect, target)
        for idx in range(int_vect.GetLength()):
            self.assertEqual(int_vect[idx], target[idx])
def rf_validate(enzymes, success, candidate):
    """Train a random forest on Morgan fingerprints and score one candidate.

    Args:
        enzymes: iterable of SMILES strings used as training molecules.
        success: class labels aligned one-to-one with ``enzymes``.
        candidate: SMILES string of the molecule to score.

    Returns:
        The value in column 1 of ``predict_proba`` for the candidate, i.e.
        the predicted probability of the second class in ``rf.classes_``.
    """

    def _fingerprint_array(smiles):
        # Morgan fingerprint with radius 2, converted from the RDKit
        # explicit bit vector into a NumPy array.
        mol = Chem.MolFromSmiles(smiles)
        arr = numpy.zeros((1,))
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(mol, 2), arr)
        return arr

    np_fps = [_fingerprint_array(smiles) for smiles in enzymes]

    # Random forest with 100 trees; fixed seed keeps results reproducible.
    rf = RandomForestClassifier(n_estimators=100, random_state=1123)
    rf.fit(np_fps, success)

    candidate_fp = _fingerprint_array(candidate)
    # predict_proba expects a 2-D (n_samples, n_features) array, so the
    # single fingerprint is reshaped into one row before prediction.
    return rf.predict_proba(candidate_fp.reshape(1, -1))[0][1]
def create_rxn_Morgan2FP(rsmi, psmi, rxnfpsize=2048, pfpsize=2048, useFeatures=False, calculate_rfp=True):
    """Create Morgan (r=2) bit-vector fingerprints for a reaction.

    Modified from Schneider's code (2014).

    Args:
        rsmi: SMILES string of the reactants.
        psmi: SMILES string of the products.
        rxnfpsize: number of bits in the reactant fingerprint.
        pfpsize: number of bits in the product fingerprint.
        useFeatures: forwarded to RDKit's ``useFeatures`` option (the
            argument used to be accepted but ignored).
        calculate_rfp: when exactly ``True`` the reactant fingerprint is
            computed; otherwise it is reported as ``None``.

    Returns:
        ``[pfp, rfp]`` as boolean NumPy arrays (``rfp`` is ``None`` when
        ``calculate_rfp`` is not ``True``), or ``None`` if either molecule
        could not be parsed or fingerprinted.
    """
    if calculate_rfp is True:
        rsmi = rsmi.encode('utf-8')
        try:
            mol = Chem.MolFromSmiles(rsmi)
        except Exception:
            return
        try:
            rfp = _morgan2_bool_array(mol, rxnfpsize, useFeatures)
        except Exception as e:
            print("Cannot build reactant fp due to {}".format(e))
            return
    else:
        rfp = None

    psmi = psmi.encode('utf-8')
    try:
        mol = Chem.MolFromSmiles(psmi)
    except Exception:
        print(psmi)
        return
    try:
        pfp = _morgan2_bool_array(mol, pfpsize, useFeatures)
    except Exception as e:
        print("Cannot build product fp due to {}".format(e))
        return
    return [pfp, rfp]


def _morgan2_bool_array(mol, n_bits, use_features):
    """Return the chirality-aware radius-2 Morgan fingerprint of ``mol`` as a bool array."""
    fp_bit = AllChem.GetMorganFingerprintAsBitVect(
        mol=mol, radius=2, nBits=n_bits, useFeatures=use_features,
        useChirality=True)
    # ``np.bool`` was only an alias of the builtin ``bool`` and has been
    # removed from NumPy; the builtin gives the same dtype.
    fp = np.empty(n_bits, dtype=bool)
    DataStructs.ConvertToNumpyArray(fp_bit, fp)
    return fp
def mol_train_test(dataset, labels, test_size=0.1, random_state=2019, nbits=1024):
    """Split a SMILES dataset into train/test Morgan fingerprint arrays.

    Rows whose SMILES cannot be parsed are dropped from both ``dataset`` and
    ``labels`` (by the row's position used as index label, as before), and
    both are re-indexed from zero when anything was dropped.

    Args:
        dataset: DataFrame with a ``'SMILES'`` column.
        labels: pandas object aligned with ``dataset`` rows.
        test_size: forwarded to ``train_test_split``.
        random_state: forwarded to ``train_test_split``.
        nbits: length of each Morgan (radius 2) bit fingerprint.

    Returns:
        ``(dataset, labels, all_mols, y_train, y_test, train_fps_array,
        test_np_fps_array)`` where the last two are lists of NumPy arrays,
        one per molecule.
    """

    def _fingerprint_arrays(mols):
        # One radius-2 Morgan bit vector per molecule, converted to NumPy.
        arrays = []
        for mol in mols:
            fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
                mol, radius=2, nBits=nbits)
            arr = np.zeros((1, ), dtype=int)
            DataStructs.ConvertToNumpyArray(fp, arr)
            arrays.append(arr)
        return arrays

    parsed = [Chem.MolFromSmiles(smiles) for smiles in dataset['SMILES']]
    drop_index = [i for i, mol in enumerate(parsed) if mol is None]

    # Drop unparsable rows from the labels and the dataset.
    if drop_index:
        labels = labels.drop(drop_index).reset_index(drop=True)
        dataset = dataset.drop(drop_index).reset_index(drop=True)

    # Reuse the molecules parsed above instead of parsing every SMILES again.
    all_mols = [mol for mol in parsed if mol is not None]

    train_mols, test_mols, y_train, y_test = train_test_split(
        all_mols, labels, test_size=test_size, random_state=random_state)

    train_fps_array = _fingerprint_arrays(train_mols)
    test_np_fps_array = _fingerprint_arrays(test_mols)

    return dataset, labels, all_mols, y_train, y_test, train_fps_array, test_np_fps_array
def SMILES_2_ECFP(smiles, radius=3, bit_len=4096, index=None):
    """Turn SMILES strings into Morgan bit fingerprints of their Murcko scaffolds.

    Args:
        smiles: list of SMILES strings to transform.
        radius: Morgan fingerprint radius.
        bit_len: number of bits per fingerprint.
        index: optional row index for the result; defaults to ``smiles``.

    Returns:
        A DataFrame with one row per input and ``bit_len`` columns. Inputs
        that fail scaffold extraction or fingerprinting are printed and
        their row is left as all zeros.
    """
    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        arr = np.zeros((1, ))
        try:
            scaffold = MurckoScaffold.GetScaffoldForMol(mol)
            fp = AllChem.GetMorganFingerprintAsBitVect(scaffold, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception:
            # Best-effort: report the offending SMILES and keep a zero row.
            # ``Exception`` (not a bare except) lets Ctrl-C and SystemExit
            # propagate.
            print(smile)
            fps[i, :] = 0
    return pd.DataFrame(fps, index=(smiles if index is None else index))
def calculate_ecfp4(mol, nBits=1024):
    """Return the radius-2 Morgan bit fingerprint of ``mol`` as a NumPy array.

    ``mol`` is converted with ``to_rdkit_Mol`` and sanitized first;
    ``nBits`` sets the fingerprint length.
    """
    converted = to_rdkit_Mol(mol)
    Chem.rdmolops.SanitizeMol(converted)
    bit_vect = Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect(
        converted, 2, nBits=nBits)
    # Placeholder array that ConvertToNumpyArray writes the bits into.
    out = np.array([])
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def __call__(self, smiles, radius=3, bit_len=4096, scaffold=0):
    """Compute Morgan bit fingerprints for a sequence of SMILES strings.

    Args:
        smiles: sequence of SMILES strings.
        radius: Morgan fingerprint radius.
        bit_len: number of bits per fingerprint.
        scaffold: ``1`` fingerprints the Murcko scaffold, ``2`` the result
            of ``MakeScaffoldGeneric`` on the molecule, any other value the
            molecule itself.

    Returns:
        A DataFrame with one row per input and ``bit_len`` columns. When an
        input fails, the exception is appended to ``self.builder.errors``
        and its row is all zeros.
    """

    def _parse_failure(smile):
        return Exception(
            f'Failed to calculate Morgan fingerprint (creating RDKit instance from smiles failed: {smile})'
        )

    fps = np.zeros((len(smiles), bit_len))
    for i, smile in enumerate(smiles):
        mol = Chem.MolFromSmiles(smile)
        arr = np.zeros((1, ))
        try:
            # Check the parse result before touching the scaffold helpers so
            # a bad SMILES is recorded with the descriptive message rather
            # than whatever error the scaffold call raises on None.
            if not mol:
                raise _parse_failure(smile)
            if scaffold == 1:
                mol = MurckoScaffold.GetScaffoldForMol(mol)
            elif scaffold == 2:
                mol = MurckoScaffold.MakeScaffoldGeneric(mol)
            if not mol:
                raise _parse_failure(smile)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=bit_len)
            DataStructs.ConvertToNumpyArray(fp, arr)
            fps[i, :] = arr
        except Exception as exp:
            # TODO: use a more specific exception related to descriptor errors
            self.builder.errors.append(exp)
            fps[i, :] = 0
    return pd.DataFrame(fps)
def generate_fingerprints(smile):
    """Return the 1024-bit, radius-3 Morgan fingerprint of ``smile`` as an int8-typed NumPy array."""
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        MolFromSmiles(smile), radius=3, nBits=1024)
    # Empty int8 placeholder that ConvertToNumpyArray writes into.
    out = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def main():
    """Predict ligand status for every compound in a tab-separated file.

    Reads the file named by ``sys.argv[1]`` (columns ``SMILES`` and
    ``Chemical_Name`` are used), fingerprints each molecule with a radius-2
    Morgan bit vector, scores them with the model stored in
    ``pparg_ligand_model.h5`` and writes ``<input>_predictions.txt``.

    The output file is opened only after prediction succeeds and is always
    closed, so a failure no longer leaves a half-written file behind.
    """
    input_path = sys.argv[1]
    model = load_model("pparg_ligand_model.h5")
    dataframe = pandas.read_csv(input_path, sep="\t")

    np_fps = []
    for smiles in dataframe['SMILES']:
        mol = Chem.MolFromSmiles(smiles)
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
        arr = numpy.zeros((1, ))
        DataStructs.ConvertToNumpyArray(fp, arr)
        np_fps.append(arr)

    predictions = model.predict(numpy.array(np_fps), batch_size=5)

    with open(input_path + "_predictions.txt", 'w') as outfile:
        outfile.write("Chemical Name\tPrediction\n")
        for name, prediction in zip(dataframe['Chemical_Name'], predictions):
            # NOTE(review): scores below 0.5 are labelled "ligand" - confirm
            # this matches the label encoding the model was trained with.
            y_prediction = "ligand" if prediction < 0.50 else "not_ligand"
            outfile.write(name + "\t" + y_prediction + "\n")
def morgan_fingp(fname):
    """Append the on-bit indices of each molecule's Morgan fingerprint to a file.

    Every line of ``fname`` after the first must hold a SMILES string and an
    identifier separated by whitespace. For each one, a line
    ``<id>,<bit>,<bit>,...`` listing the set bits of the 1024-bit, radius-2
    Morgan fingerprint is appended to ``fp/fn/<basename of fname>``, where
    ``fp`` and ``fn`` are names defined outside this function.

    Lines that cannot be fingerprinted or written are printed and skipped.
    """
    nbits = 1024
    radius = 2
    fsplit = fname.split('/')[-1]
    out_path = fp + '/' + fn + '/' + fsplit

    # Both files are managed by ``with`` so the output handle is flushed and
    # closed even if processing stops early.
    with open(out_path, 'a') as ref2, open(fname, 'r') as ref:
        ref.readline()  # discard the first line (presumably a header)
        for line in ref:
            smile, zin_id = line.rstrip().split()
            arg = np.zeros((1, ))
            try:
                DataStructs.ConvertToNumpyArray(
                    AllChem.GetMorganFingerprintAsBitVect(
                        Chem.MolFromSmiles(smile), radius, nBits=nbits), arg)
                on_bits = [str(elem) for elem in np.where(arg == 1)[0]]
                ref2.write(','.join([zin_id] + on_bits))
                ref2.write('\n')
            except Exception:
                # Best-effort: report the offending line and move on.
                print(line)
def fp_matrix(self, fp):
    """Convert every fingerprint in ``fp`` to its own NumPy array and return them as a list."""

    def _as_array(bit_vect):
        out = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(bit_vect, out)
        return out

    return [_as_array(f) for f in fp]
def computeFP(x):
    """Wrap the 2048-bit, depth-2 Morgan fingerprint of ``x`` in an ``FP`` container."""
    bit_vect = Chem.GetMorganFingerprintAsBitVect(x, 2, nBits=2048)
    n_bits = len(bit_vect)
    buffer = numpy.zeros(n_bits, numpy.int32)
    # Copy the bits into the int32 buffer before handing it to the container.
    DataStructs.ConvertToNumpyArray(bit_vect, buffer)
    return FP(buffer)
def compute_morgan_fingerprints(smiles, fingerprint_length, fingerprint_radius):
    """Get the Morgan fingerprint of one SMILES string.

    Adapted from:
    <https://github.com/google-research/google-research/blob/
    dfac4178ccf521e8d6eae45f7b0a33a6a5b691ee/mol_dqn/chemgraph/dqn/deep_q_networks.py#L750>

    Args:
      smiles: String. The SMILES string of the molecule, or None.
      fingerprint_length (int): Bit-length of fingerprint
      fingerprint_radius (int): Radius used to compute fingerprint

    Returns:
      np.array. A zero vector of shape [fingerprint_length] when ``smiles``
      is None or cannot be parsed; otherwise the Morgan fingerprint.
    """
    # A missing SMILES and an unparsable one are both treated as "no molecule".
    molecule = None if smiles is None else Chem.MolFromSmiles(smiles)
    if molecule is None:
        return np.zeros((fingerprint_length,))

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        molecule, fingerprint_radius, fingerprint_length)
    # ConvertToNumpyArray takes ~ 0.19 ms, while
    # np.asarray takes ~ 4.69 ms
    out = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def Xyfromdf(df, return_y):
    """Build a design matrix (and optionally labels) from a dataframe of structures.

    Each structure is encoded as its radius-2 Morgan bit fingerprint.

    Args:
        df: a Pandas DataFrame; the "Structure" column is read, and "IC50"
            as well when ``return_y`` is true.
        return_y: whether or not to return labels y.

    Returns:
        ``(X, y)`` when ``return_y`` is true, where X is a dataframe of
        fingerprints and y is the natural log of "IC50"; otherwise just X.
    """
    bit_vects = []
    for smile in df["Structure"]:
        mol = Chem.MolFromSmiles(smile)
        bit_vects.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))

    # Convert the RDKit explicit vectors into numpy arrays.
    rows = []
    for bit_vect in bit_vects:
        row = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(bit_vect, row)
        rows.append(row)

    X = pd.DataFrame(np.array(rows))
    if not return_y:
        return X

    y = np.log(df["IC50"])
    assert y.isna().sum() == 0
    return X, y
def morgan(self, radius, size=None):
    """
    Compute circular fingerprints for ``self.smiles_converted`` and return
    them as a DataFrame, one row per molecule.

    : radius (int): radius = 2 ~ ECFP4, radius = 3 ~ ECFP6
    : size (int, optional): number of bits to generate. When None, 2048
      bits are used.
    """
    n_bits = 2048 if size is None else size

    rows = []
    for mol in self.smiles_converted:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits)
        row = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(bit_vect, row)
        rows.append(row)

    return pd.DataFrame(rows)
def get_morgan_fp(self,
                  mol: Mol,
                  radius: int = 2,
                  nBits: int = 1024,
                  invariants: List[AtomPairsParameters] = None,
                  fromAtoms: List[AtomPairsParameters] = None,
                  useChirality: bool = False,
                  useBondTypes: bool = True,
                  useFeatures: bool = False,
                  bitInfo: AtomPairsParameters = None,
                  includeRedundantEnvironments: bool = False) -> array:
    """
    Generate a Morgan fingerprint count vector for a single molecule.

    Parameters:
        Same parameters as:
        https://www.rdkit.org/docs/source/rdkit.Chem.rdMolDescriptors.html#rdkit.Chem.rdMolDescriptors.GetMorganFingerprintAsBitVect
        ``invariants``, ``fromAtoms`` and ``bitInfo`` default to fresh empty
        containers on every call. A caller-supplied ``bitInfo`` dict is
        passed to RDKit and its contents are what gets counted, so pass an
        empty dict unless accumulation is intended.

    Returns:
        fingerprints_vector (array): numpy array of length ``nBits`` whose
        entry ``x`` is the number of ``bitInfo`` records for bit ``x``
        (0 when the bit has no record).
    """
    # Fresh containers per call: the previous mutable defaults were shared
    # between calls, so bit records from earlier molecules leaked into the
    # counts of later ones.
    invariants = [] if invariants is None else invariants
    fromAtoms = [] if fromAtoms is None else fromAtoms
    bitInfo = {} if bitInfo is None else bitInfo

    # Only the bitInfo side effect is needed; the bit vector itself is unused.
    AllChem.GetMorganFingerprintAsBitVect(
        mol=mol,
        radius=radius,
        nBits=nBits,
        invariants=invariants,
        fromAtoms=fromAtoms,
        useChirality=useChirality,
        useBondTypes=useBondTypes,
        useFeatures=useFeatures,
        bitInfo=bitInfo,
        includeRedundantEnvironments=includeRedundantEnvironments)

    return np.array([len(bitInfo.get(x, ())) for x in range(nBits)])
def convert_bitvec_to_array(bitvec: list) -> np.ndarray:
    """Return the contents of a bit vector fingerprint as a NumPy array."""
    # Placeholder that ConvertToNumpyArray writes the bits into.
    out = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bitvec, out)
    return out
def fingerprints_from_mol(cls, mol):
    """Return the ECFP4 (radius-2 Morgan, 2048-bit) fingerprint of ``mol`` as a single-row 2-D array."""
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
    flat = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bit_vect, flat)
    # One row, as many columns as the converted fingerprint holds.
    row = flat.reshape(1, -1)
    return row
def featurizer(folder, filename):
    """Fingerprint every compound in a CSV file and save the result as CSV.

    The input ``folder + filename`` is read positionally: column 0 is the
    compound alias, column 1 its SMILES string and column 2 its IC50 value.
    Each SMILES is turned into a 4096-bit, radius-3 Morgan fingerprint. The
    fingerprints, indexed by alias and preceded by an ``ic50`` column, are
    written to ``folder + 'fingerprinters_4096.csv'``.

    Rows that fail are reported (row number and alias) and skipped.
    """
    folderpath = folder
    data = pd.read_csv(folderpath + filename, sep=',')
    alias = []
    ic50 = []
    fps = []

    for i in range(data.shape[0]):
        try:
            # ``.iloc`` replaces ``DataFrame.ix``, which no longer exists in
            # current pandas.
            name = data.iloc[i, 0]
            smiles = data.iloc[i, 1]
            activity = data.iloc[i, 2]
            compound = Chem.MolFromSmiles(smiles)
            fp = AllChem.GetMorganFingerprintAsBitVect(compound, 3, 4096)
            arr = np.zeros((1, ))
            DataStructs.ConvertToNumpyArray(fp, arr)
        except Exception:
            print(i)
            print(data.iloc[i, 0])
            continue
        # Append only after the whole row succeeded so the three lists
        # always stay the same length.
        fps.append(arr)
        alias.append(name)
        ic50.append(activity)

    # Store the fingerprints in a dataframe and write them to a CSV file.
    df = pd.DataFrame(fps, index=alias)
    df.insert(0, "ic50", ic50)
    df.to_csv(folderpath + 'fingerprinters_4096.csv')
def getNumpy(inlist):
    """Convert the second element of every item in ``inlist`` to a NumPy array of dtype ``tree.DTYPE``."""

    def _convert(item):
        out = numpy.zeros((3, ), tree.DTYPE)
        DataStructs.ConvertToNumpyArray(item[1], out)
        return out

    return [_convert(item) for item in inlist]
def get_circular_fp(smile, radius=6, fp_len=128):
    """Return the Morgan bit fingerprint of ``smile`` as a NumPy array.

    ``radius`` and ``fp_len`` are passed to RDKit as the fingerprint radius
    and number of bits.
    """
    bit_vect = Chem.AllChem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smile), radius, fp_len)
    out = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def smile2fp(smile: str) -> Any:
    """
    Calculates one fingerprint from a SMILE

    The fingerprint is a radius-2 Morgan (circular, ECFP-like) bit vector
    with ``fp_size`` bits, where ``fp_size`` is defined outside this function.

    :param smile: Input SMILE
    :return: Boolean NumPy array of bits if conversion is successful,
        None otherwise
    """
    # ``np.bool`` was only an alias of the builtin ``bool`` and has been
    # removed from NumPy; the builtin yields the same dtype.
    npa = np.zeros((0,), dtype=bool)
    try:
        DataStructs.ConvertToNumpyArray(
            AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smile), 2, nBits=fp_size),
            npa)
    except Exception:
        # Unparsable SMILES or fingerprinting failure: signal with None.
        return None
    return npa
def main(smiles, output, top=10, model='10uM'):
    """Predict the most probable targets for a molecule and write them to CSV.

    Loads the '1uM' model when ``model == '1uM'`` and the '10uM' model
    otherwise, scores the 2048-bit radius-2 Morgan fingerprint of ``smiles``,
    keeps the ``top`` highest-probability classes, looks each one up with
    ``fetch_WS`` and writes the merged table to ``output``.
    """
    model_path = ('Data/models_23/1uM/mNB_1uM_all.pkl'
                  if model == '1uM' else
                  'Data/models_23/10uM/mNB_10uM_all.pkl')
    morgan_nb = joblib.load(model_path)
    classes = list(morgan_nb.targets)

    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smiles), 2, nBits=2048)
    features = np.zeros(len(bit_vect), np.int32)
    DataStructs.ConvertToNumpyArray(bit_vect, features)

    # Single-sample prediction: reshape to one row, take that row's probabilities.
    probas = list(morgan_nb.predict_proba(features.reshape(1, -1))[0])
    predictions = pd.DataFrame(list(zip(classes, probas)), columns=['id', 'probas'])
    top_pred = predictions.sort_values(by='probas', ascending=False).head(top)

    target_rows = [fetch_WS(target_id) for target_id in top_pred['id']]
    target_info = pd.DataFrame(target_rows, columns=['id', 'name', 'organism'])

    pd.merge(top_pred, target_info).to_csv(output)
def rdkit_numpy_convert(fp):
    """Convert an iterable of RDKit fingerprints into one stacked NumPy array."""

    def _as_array(bit_vect):
        out = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(bit_vect, out)
        return out

    converted = [_as_array(f) for f in fp]
    return np.asarray(converted)
def morgan_fingerprint(
    df: pd.DataFrame,
    mols_col: str,
    radius: int = 3,
    nbits: int = 2048,
    kind: str = "counts",
):
    """
    Convert a column of RDKIT Mol objects into Morgan Fingerprints.

    The result is a new dataframe holding only the fingerprints; none of
    the original columns are carried over, because Morgan fingerprints are
    usually high-dimensional features.

    Method chaining usage:

    .. code-block:: python

        df = pd.DataFrame(...)
        morgans = df.morgan_fingerprint(mols_col='mols', radius=3, nbits=2048)

    The index of ``df`` is preserved, so the fingerprints can be joined
    back onto the original dataframe:

    .. code-block:: python

        joined = df.join(morgans)

    :param df: A pandas DataFrame.
    :param mols_col: The name of the column that has the RDKIT mol objects
    :param radius: Radius of Morgan fingerprints. Defaults to 3.
    :param nbits: The length of the fingerprints. Defaults to 2048.
    :param kind: Whether to return counts or bits. Defaults to counts.
    :returns: A pandas DataFrame
    :raises ValueError: if ``kind`` is neither ``"counts"`` nor ``"bits"``.
    """
    acceptable_kinds = ["counts", "bits"]
    if kind not in acceptable_kinds:
        raise ValueError(f"`kind` must be one of {acceptable_kinds}")

    # Bit vectors for "bits", hashed count vectors for "counts".
    make_fp = (GetMorganFingerprintAsBitVect
               if kind == "bits" else GetHashedMorganFingerprint)

    rows = []
    for mol in df[mols_col]:
        row = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(make_fp(mol, radius, nbits), row)
        rows.append(row)

    fpdf = pd.DataFrame(np.vstack(rows))
    fpdf.index = df.index
    return fpdf
def fp_to_pandas(fp, drug_names):
    """Return a DataFrame with one fingerprint per row, indexed by ``drug_names``."""
    rows = []
    for bit_vect in fp:
        row = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vect, row)
        rows.append(row)
    return pd.DataFrame(rows, index=drug_names)
def fingerprintsToNPArr(fps):
    """Return a list holding one NumPy array per fingerprint in ``fps``."""

    def _as_array(bit_vect):
        out = np.zeros((1, ))
        DataStructs.ConvertToNumpyArray(bit_vect, out)
        return out

    return [_as_array(fp) for fp in fps]
def get_numpy_fingerprint_from_smiles(smiles):
    """Return the radius-3 Morgan bit fingerprint of ``smiles`` as a NumPy vector."""
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(
        Chem.MolFromSmiles(smiles), 3)
    # Allocate one slot per fingerprint bit; the conversion writes every slot.
    n_bits = bit_vect.GetNumBits()
    out = np.empty(n_bits)
    DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out
def _maccs_keys(self, molecules: List, parameters: {}):
    """Return the MACCS keys of every molecule as a list of int32 NumPy arrays.

    ``parameters`` is accepted but not used.
    """
    key_vects = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]

    def _as_int32(bit_vect):
        out = np.zeros((1, ), dtype=np.int32)
        DataStructs.ConvertToNumpyArray(bit_vect, out)
        return out

    return [_as_int32(bit_vect) for bit_vect in key_vects]
def bitvect_as_np_array(self):
    """Return the contents of ``self.bitvect`` as a NumPy array."""
    import numpy as np

    # Placeholder array that ConvertToNumpyArray writes the bits into.
    out = np.zeros((1, ))
    DataStructs.ConvertToNumpyArray(self.bitvect, out)
    return out