def calc_similarity(compound_one, compound_two):
    """Return the fingerprint similarity of two compounds, cached in ``joint_sim``.

    Compound properties are read from the module-level ``cg_props`` mapping
    under the lower-cased compound name. The result is stored symmetrically
    in the module-level ``joint_sim`` cache.

    Parameters
    ----------
    compound_one, compound_two : str
        Compound names; each is lower-cased before the ``cg_props`` lookup.

    Returns
    -------
    float
        ``0.0`` when the two compounds have different ``"type"`` entries,
        otherwise ``DataStructs.FingerprintSimilarity`` of the fingerprints
        built from their ``"smiles"`` entries.
    """
    row_one = joint_sim.setdefault(compound_one, {})
    if compound_two in row_one:
        return row_one[compound_two]
    row_two = joint_sim.setdefault(compound_two, {})

    props_one = cg_props[compound_one.lower()]
    props_two = cg_props[compound_two.lower()]

    # Compounds of different types are never compared structurally.
    # (The original compared compound_one's type with itself, so this
    # shortcut could never trigger.)
    if props_one["type"] != props_two["type"]:
        row_one[compound_two] = 0.0
        row_two[compound_one] = 0.0
        return 0.0

    # Imported lazily so the cheap paths above do not need RDKit.
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit import Chem

    mol_one = Chem.MolFromSmiles(str(props_one["smiles"]))
    mol_two = Chem.MolFromSmiles(str(props_two["smiles"]))
    fp_1 = FingerprintMols.FingerprintMol(mol_one)
    fp_2 = FingerprintMols.FingerprintMol(mol_two)
    similarity = DataStructs.FingerprintSimilarity(fp_1, fp_2)

    row_one[compound_two] = similarity
    row_two[compound_one] = similarity
    return similarity
def __init__(self, moli, molj):
    """
    Initialization function

    Stores both molecules, then computes their fingerprints and the
    similarity between them.

    Parameters
    ----------
    moli : RDKit molecule object
        the first molecule used to perform the fingerprint calculation
    molj : RDKit molecule object
        the second molecule used to perform the fingerprint calculation

    Notes
    -----
    ``options`` is not a parameter of this method; it is resolved from the
    enclosing scope and only its ``verbose`` attribute is read here.
    """
    # Set logging level and format
    logging.basicConfig(format='%(levelname)s:\t%(message)s',
                        level=logging.INFO)

    # Local pointers to the passed molecules
    self.moli, self.molj = moli, molj

    # Unless running in 'pedantic' mode, only let CRITICAL RDKit messages through
    if options.verbose != 'pedantic':
        RDLogger.logger().setLevel(RDLogger.CRITICAL)

    self.fps_moli = FingerprintMols.FingerprintMol(self.moli)
    self.fps_molj = FingerprintMols.FingerprintMol(self.molj)
    self.fps_tan = DataStructs.FingerprintSimilarity(self.fps_moli,
                                                     self.fps_molj)
def pipe_sim_filter(stream, query, cutoff=0.8, summary=None,
                    comp_id="pipe_sim_filter"):
    """Yield the records of `stream` whose similarity to `query` is >= `cutoff`.

    `query` is a Smiles string. Records without a `mol` field are skipped.
    If a record carries the field `FP_b64` (e.g. pre-calculated), that
    fingerprint is used; otherwise the fingerprint is generated on-the-fly
    from `rec["mol"]` (much slower). When `summary` is given, the running
    count of passing records is stored under `comp_id` before each yield.
    If the query cannot be parsed, an error is printed and nothing is yielded."""
    query_mol = Chem.MolFromSmiles(query)
    if not query_mol:
        print("* {} ERROR: could not generate query from SMILES.".format(
            comp_id))
        return None

    query_fp = FingerprintMols.FingerprintMol(query_mol)
    n_passed = 0
    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" not in rec:
            mol_fp = FingerprintMols.FingerprintMol(rec["mol"])
        else:
            # use the pre-defined fingerprint if it is present in the stream
            # NOTE(review): pickle.loads executes arbitrary code on crafted
            # input; confirm the stream only ever comes from trusted sources.
            mol_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))

        similarity = DataStructs.FingerprintSimilarity(query_fp, mol_fp)
        if not similarity >= cutoff:
            continue

        n_passed += 1
        if summary is not None:
            summary[comp_id] = n_passed
        yield rec
def test__init__(self):
    """Check default fingerprints, per-molecule sizes and density folding."""
    from rdkit.Chem.Fingerprints import FingerprintMols
    mols = [Chem.MolFromSmiles(smi) for smi in ('CCOC', 'CCO', 'COC')]

    # Default fingerprinting parameters
    default_fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]
    self.assertAlmostEqual(
        FingerprintSimilarity(default_fps[0], default_fps[1]), 0.6, places=2)

    # A different fingerprint size per molecule: 16, 32, 48
    details = FingerprinterDetails()
    fp_kwargs = details.__dict__
    sized_fps = []
    for multiplier, mol in enumerate(mols, 1):
        fp_kwargs['fpSize'] = 16 * multiplier
        sized_fps.append(FingerprintMols.FingerprintMol(mol, **fp_kwargs))
    first, second = sized_fps[0], sized_fps[1]
    self.assertAlmostEqual(FingerprintSimilarity(first, second), 0.555,
                           places=2)
    self.assertAlmostEqual(FingerprintSimilarity(second, first), 0.555,
                           places=2)

    # Folding driven by the target density
    fp_kwargs['fpSize'] = 1024
    fp_kwargs['tgtDensity'] = 0.8
    folded = FingerprintMols.FingerprintMol(mols[0], **fp_kwargs)
    self.assertEqual(len(folded), 64)
    folded = DataStructs.FoldToTargetDensity(folded, density=0.1, minLength=2)
    self.assertEqual(len(folded), 4)
def _mols_similarity_base_r0(ms_smiles_mid, ms_smiles_base):
    """
    Print the fingerprint similarity of every base molecule to every target.

    Input: dictionary type required such as {nick name: smiles code, ...}
    ``ms_smiles_mid`` holds the targets, ``ms_smiles_base`` the bases.
    Nothing is returned; all results go to stdout.
    """
    from rdkit import Chem
    from rdkit import DataStructs
    from rdkit.Chem.Fingerprints import FingerprintMols

    # Materialise the names once: dict views can neither be concatenated to
    # a string nor indexed by position.
    mid_names = list(ms_smiles_mid)
    base_names = list(ms_smiles_base)

    # processing for mid
    print("Target: " + str(mid_names))
    ms_mid = [Chem.MolFromSmiles(m_sm) for m_sm in ms_smiles_mid.values()]
    fps_mid = [FingerprintMols.FingerprintMol(x) for x in ms_mid]

    # processing for base
    print("Base: " + str(base_names))
    ms_base = [Chem.MolFromSmiles(m_sm) for m_sm in ms_smiles_base.values()]
    fps_base = [FingerprintMols.FingerprintMol(x) for x in ms_base]

    for base_name, f_b in zip(base_names, fps_base):
        for mid_name, f_d in zip(mid_names, fps_mid):
            print("Base:{0}, Target:{1}".format(base_name, mid_name))
            print(DataStructs.FingerprintSimilarity(f_b, f_d))
def molecular_similarity(best, parent_candidates, all=False):
    """
    returns a similarity score (0-1) of best with the closest
    molecular relative in parent_candidates

    Parameters
    ----------
    best : object
        Chromosome object, the current mutated candidate; its ``Mol``
        attribute is fingerprinted
    parent_candidates : array
        parent pool of molecules to compare with best. These are
        represented by SMILES
    all : boolean, optional, default = False
        default behavior is false and the tanimoto similarity score
        is returned. If True tanimoto, dice, cosine, sokal, kulczynski,
        and mcconnaughey similarities are returned

    Returns
    ----------
    similarity_score : float
    similarity_index : int
        if all=False the best tanimoto similarity score as well as the
        index of the closest molecular relative are returned
        if all=True an array of best scores and indices of the closest
        molecular relative are returned

    Raises
    ------
    ValueError
        if parent_candidates is empty
    """
    if len(parent_candidates) == 0:
        raise ValueError("parent_candidates must contain at least one SMILES")

    # Fingerprints do not depend on the metric, so build each one only once
    # instead of once per (metric, candidate) pair.
    best_fp = FingerprintMols.FingerprintMol(best.Mol)
    candidate_fps = [
        FingerprintMols.FingerprintMol(Chem.MolFromSmiles(smi))
        for smi in parent_candidates
    ]

    if not all:
        scores = [
            DataStructs.FingerprintSimilarity(best_fp, fp)
            for fp in candidate_fps
        ]
        top_score = max(scores)
        return top_score, scores.index(top_score)

    metrics = [
        DataStructs.TanimotoSimilarity,
        DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity,
        DataStructs.SokalSimilarity,
        DataStructs.KulczynskiSimilarity,
        DataStructs.McConnaugheySimilarity,
    ]
    scores = []
    indices = []
    for metric in metrics:
        scores_micro = [
            DataStructs.FingerprintSimilarity(best_fp, fp, metric=metric)
            for fp in candidate_fps
        ]
        top_score = max(scores_micro)
        scores.append(top_score)
        indices.append(scores_micro.index(top_score))
    return scores, indices
def cal_pairwise_tanimoto(pair):
    """Return the CSV line ``i,j,similarity`` for one pair of indices.

    ``pair`` unpacks into two indices into the module-level ``smiles``
    sequence; the Tanimoto similarity of the two fingerprints is appended
    and the line is terminated with a newline.
    """
    first_idx, second_idx = pair
    mols = [Chem.MolFromSmiles(smiles[idx]) for idx in (first_idx, second_idx)]
    fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in mols]
    tani = DataStructs.TanimotoSimilarity(*fingerprints)
    return ",".join(map(str, (first_idx, second_idx, tani))) + "\n"
def build_reactions(perturbations_all_paths, mcs_neighbours):
    """Print the members most similar to the heavier molecule of the first pair.

    Every PDB path in ``perturbations_all_paths`` (a sequence of path pairs)
    is loaded and fingerprinted. The query is whichever member of the first
    pair has the larger exact molecular weight (the first member on a tie).
    Members are ranked by fingerprint similarity to the query, only the
    first member of each distinct similarity value is kept, and the top five
    are printed.

    Parameters
    ----------
    perturbations_all_paths : sequence of sequences of str
        Pairs of PDB file paths; must contain at least one pair.
    mcs_neighbours
        Not used by this function.

    Returns
    -------
    None
    """

    def _load_mol(pdb_path):
        # Read the file fully and close the handle before parsing.
        with open(pdb_path, 'r') as pdb_handle:
            return rdmolfiles.MolFromPDBBlock(pdb_handle.read())

    # Flatten the nested pairs into one list of member paths.
    member_paths = itertools.chain.from_iterable(perturbations_all_paths)
    all_members = [_load_mol(path) for path in member_paths]
    all_members_fps = [
        FingerprintMols.FingerprintMol(member) for member in all_members
    ]

    first_pair = perturbations_all_paths[0]
    member1 = _load_mol(first_pair[0])
    member2 = _load_mol(first_pair[1])
    size_member1 = rdMolDescriptors.CalcExactMolWt(member1)
    size_member2 = rdMolDescriptors.CalcExactMolWt(member2)
    query_mol = member1 if size_member1 >= size_member2 else member2
    query_member = FingerprintMols.FingerprintMol(query_mol)

    similarities = [
        AllChem.DataStructs.FingerprintSimilarity(query_member, target_fp)
        for target_fp in all_members_fps
    ]
    similarities_to_query = dict(zip(all_members, similarities))
    ranked = sorted(similarities_to_query.items(),
                    key=lambda kv: kv[1],
                    reverse=True)

    # Keep one representative per distinct similarity value, best first.
    seen_values = set()
    similar_hits = []
    for member, value in ranked:
        if value not in seen_values:
            seen_values.add(value)
            similar_hits.append(member)

    neighbours = similar_hits[:5]
    print(neighbours)
def tanimotoComparison(pred_prod_list, true_prod_list):
    """Return the fingerprint similarity of a predicted and a true product.

    Both arguments are passed to ``Chem.MolFromSmiles`` as given.
    """
    mols = [
        Chem.MolFromSmiles(smi) for smi in (pred_prod_list, true_prod_list)
    ]
    fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in mols]
    return DataStructs.FingerprintSimilarity(*fingerprints)
def TakeInput(filepath, hmdb_filepath, OR_name):
    """Match ligands against HMDB molecules by fingerprint similarity.

    Reads the ligand CSV (columns ``Smiles`` and ``Ligand``) and the HMDB CSV
    (columns ``SMILES`` and ``NAME``), both as ISO-8859-1. Every ligand/HMDB
    pair with a fingerprint similarity of at least 0.85 is written to
    ``Final_test_set_<OR_name>.csv``. The distinct matched ligands are
    written to ``Shortlisted_Metabolites<OR_name>.csv``.

    Parameters
    ----------
    filepath : str
        Path of the ligand CSV.
    hmdb_filepath : str
        Path of the HMDB CSV.
    OR_name : str
        Suffix inserted into both output file names.

    Returns
    -------
    None
    """
    # NOTE(review): the result of this call was always overwritten by the
    # read_csv below; the call is kept in case it has side effects - confirm.
    extractPositiveOnes(filepath)
    data_hmdb = pd.read_csv(hmdb_filepath, encoding="ISO-8859-1")
    positive_Cancer = pd.read_csv(filepath, encoding="ISO-8859-1")

    hmdb_data = data_hmdb[['SMILES', 'NAME']].reset_index(drop=True)
    Cancer_clean_data = (positive_Cancer[['Smiles', 'Ligand']]
                         .drop_duplicates()
                         .reset_index(drop=True))

    df1 = pd.DataFrame({
        "Cancer_Molecule": [],
        "Cancer_SMILES": [],
        "HMDB_Molecule": [],
        "HMDB_SMILES": [],
        "TANIMOTO_Similarity_Value": []
    })

    # Fingerprint every HMDB entry once instead of once per ligand.
    # Entries that cannot be fingerprinted are reported and skipped.
    hmdb_fps = []
    for hmdb_smiles in hmdb_data['SMILES']:
        try:
            hmdb_fps.append(
                FingerprintMols.FingerprintMol(
                    Chem.MolFromSmiles(hmdb_smiles)))
        except Exception:
            print("WARNING")
            hmdb_fps.append(None)

    k = 0
    for i in range(len(Cancer_clean_data)):
        ligand_smiles = Cancer_clean_data['Smiles'][i]
        fps1 = FingerprintMols.FingerprintMol(
            Chem.MolFromSmiles(ligand_smiles))
        for j, fps2 in enumerate(hmdb_fps):
            if fps2 is None:
                continue
            sim_val = DataStructs.FingerprintSimilarity(fps1, fps2)
            if sim_val >= 0.85:  # threshold for similarity value
                df1.loc[k] = [
                    Cancer_clean_data['Ligand'][i], ligand_smiles,
                    hmdb_data['NAME'][j], hmdb_data['SMILES'][j], sim_val
                ]
                k = k + 1
        print("Comparison Done for Ligand :" + str(i))

    df1.to_csv("Final_test_set_" + OR_name + ".csv")

    # The columns are named "Cancer_SMILES"/"Cancer_Molecule" in df1. No
    # activation status is computed here, so that column is left empty
    # rather than zipped with an empty list (which discarded every row).
    Shortlisted_Metabolites = pd.DataFrame({
        'Smiles': df1["Cancer_SMILES"],
        'Ligand': df1["Cancer_Molecule"],
        'Activation Status': None,
    })
    Shortlisted_Metabolites = Shortlisted_Metabolites.drop_duplicates(
        subset='Ligand', keep='first')

    Shortlisted_Metabolites.to_csv("Shortlisted_Metabolites" + OR_name +
                                   ".csv")
    print("Shortlisted_Metabolites" + OR_name + ".csv" + " has been saved")
    print("Congrats! Final_test_set_" + OR_name +
          ".csv has been successfully saved!")
def tanimoto_score(mol1, mol2):
    """Return the reciprocal of the fingerprint similarity of mol1 and mol2.

    The similarity is offset by 1e-15 before inversion so that a similarity
    of zero does not divide by zero.
    """
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit import DataStructs

    fingerprints = [FingerprintMols.FingerprintMol(mol) for mol in (mol1, mol2)]
    similarity = DataStructs.FingerprintSimilarity(*fingerprints)
    return 1.0 / (similarity + 1e-15)
def combine_via_chemsimilarity(cypfile, cypreact):
    """Merge DrugBank and ChEMBL rows, dropping structural duplicates.

    Column 1 of every row is parsed as SMILES. A DrugBank row whose
    fingerprint similarity to any ChEMBL row equals 1 is treated as a
    duplicate: it is logged to ``duplicates.csv`` together with the matching
    ChEMBL row and left out of the merge. ``final_.csv`` receives all ChEMBL
    rows followed by the remaining DrugBank rows.

    Parameters
    ----------
    cypfile : str
        Path of the DrugBank CSV.
    cypreact : str
        Path of the ChEMBL CSV.

    Returns
    -------
    None
    """

    def _row_fingerprint(row):
        return FingerprintMols.FingerprintMol(Chem.MolFromSmiles(row[1]))

    # currently only support Drugbank data and ChEMBL data
    # later could add self-annotating data
    with open(cypfile, newline='') as drugbank_csv:
        drugbank_rows = list(csv.reader(drugbank_csv, delimiter=','))
    with open(cypreact, newline='') as chembl_csv:
        chembl_rows = list(csv.reader(chembl_csv, delimiter=','))

    # Fingerprint each DrugBank row once; pairs that survive are the rows
    # not yet matched by any ChEMBL compound.
    remaining = [(row, _row_fingerprint(row)) for row in drugbank_rows]

    with open("final_.csv", "w", newline='') as checked_file, \
            open("duplicates.csv", "w", newline='') as duplicates_file:
        final_writer = csv.writer(checked_file, quoting=csv.QUOTE_ALL)
        # this file is for keep track of duplicate compounds
        # if drugbank state the compound is inhibitor but chembl state
        # substrate; then it needs to investigate
        duplicates_writer = csv.writer(duplicates_file,
                                       quoting=csv.QUOTE_ALL)

        for cl in chembl_rows:
            fps_c = _row_fingerprint(cl)
            # Build a new list instead of removing from the one being
            # iterated, which would silently skip the following row.
            still_remaining = []
            for db, fps_d in remaining:
                if DataStructs.FingerprintSimilarity(fps_c, fps_d) == 1:
                    duplicates_writer.writerow(["Duplicates"])
                    duplicates_writer.writerow(cl)
                    duplicates_writer.writerow(db)
                else:
                    still_remaining.append((db, fps_d))
            remaining = still_remaining

        print("remaining compound from drugbank is: " + str(len(remaining)))
        for row in chembl_rows:
            final_writer.writerow(row)
        for row, _ in remaining:
            final_writer.writerow(row)

    print("similarity check done ...")
    return None
def caculate_similarity_fingerprint(smiles_A, smiles_B):
    """Return the fingerprint similarity of two SMILES, rounded to 4 decimals.

    Returns ``-1`` when the similarity cannot be computed (any exception
    raised while parsing, fingerprinting or comparing).
    """
    try:
        m1 = Chem.MolFromSmiles(smiles_A)
        m2 = Chem.MolFromSmiles(smiles_B)
        f1 = FingerprintMols.FingerprintMol(m1)
        f2 = FingerprintMols.FingerprintMol(m2)
        return round(DataStructs.FingerprintSimilarity(f1, f2), 4)
    except Exception:
        # Deliberately broad best-effort, but KeyboardInterrupt and
        # SystemExit must still propagate.
        return -1
def calc_tanimoto(self, reference):
    """
    Determine the tanimoto similarity score based on the rd mol fingerprint

    Compares ``reference.rd_mol`` with ``self.rd_mol``. Returns the
    similarity, or ``None`` after logging the traceback when RDKit raises.
    """
    try:
        fps_ref = FingerprintMols.FingerprintMol(reference.rd_mol)
        fps_self = FingerprintMols.FingerprintMol(self.rd_mol)
        return DataStructs.FingerprintSimilarity(fps_ref, fps_self)
    except Exception:
        # Best-effort by design; KeyboardInterrupt and SystemExit still
        # propagate because only Exception subclasses are caught.
        logger.exception('Caught exception attempting to run rdkit FingerprintMols.FingerprintMol or DataStructs.FingerprintSimilarity')
        return None
def compute_single_tanimoto_metric(first_smile, second_smile):
    """Return the Tanimoto similarity of two SMILES strings.

    Returns ``0.`` when either SMILES is ``None`` or cannot be parsed.
    """
    smiles_pair = (first_smile, second_smile)
    if any(smile is None for smile in smiles_pair):
        return 0.

    mols = [MolFromSmiles(smile) for smile in smiles_pair]
    if any(mol is None for mol in mols):
        return 0.

    first_fp = FingerprintMols.FingerprintMol(mols[0])
    second_fp = FingerprintMols.FingerprintMol(mols[1])
    return DataStructs.FingerprintSimilarity(
        first_fp, second_fp, metric=DataStructs.TanimotoSimilarity)
def is_similar_reagent(rgt1, rgt2, list_of_metal_atoms,
                       list_of_full_metal_names):
    """Decide whether two reagent strings denote similar reagents.

    Checks are applied in this order:

    1. identical strings are similar;
    2. anything containing ``'Reaxys ID'`` is not;
    3. reagents naming the same metals are similar (full metal names take
       precedence over atom symbols for each reagent);
    4. anything containing ``'Reaxys'`` is not;
    5. otherwise both strings are parsed as SMILES and are similar only
       when the fingerprint similarity is at least 1.0. Unparsable input
       and all-zero fingerprints count as not similar.

    Parameters
    ----------
    rgt1, rgt2 : str
        Reagent names or SMILES.
    list_of_metal_atoms : list of str
        Metal atom symbols searched as substrings.
    list_of_full_metal_names : list of str
        Full metal names searched as substrings.

    Returns
    -------
    bool
    """

    def _metal_signature(reagent):
        # Positions of the metals mentioned in the reagent, or None when
        # no metal from either vocabulary occurs in it.
        for vocabulary in (list_of_full_metal_names, list_of_metal_atoms):
            hits = [
                vocabulary.index(metal) for metal in vocabulary
                if metal in reagent
            ]
            if hits:
                return hits
        return None

    if rgt1 == rgt2:
        return True
    if 'Reaxys ID' in rgt1 or 'Reaxys ID' in rgt2:
        return False

    rgt1_metal = _metal_signature(rgt1)
    rgt2_metal = _metal_signature(rgt2)
    # Two metal-free reagents must not match on this criterion.
    if rgt1_metal is not None and rgt1_metal == rgt2_metal:
        return True

    if 'Reaxys' in rgt1 or 'Reaxys' in rgt2:
        return False

    try:
        mol1 = Chem.MolFromSmiles(rgt1)
        mol2 = Chem.MolFromSmiles(rgt2)
        fp1 = FingerprintMols.FingerprintMol(mol1)
        fp2 = FingerprintMols.FingerprintMol(mol2)
    except Exception:
        # Best-effort: unparsable reagents are simply "not similar", but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        print('cannot calculate fp')
        return False

    # Empty fingerprints carry no structural information.
    if not any(fp1) or not any(fp2):
        return False

    return DataStructs.FingerprintSimilarity(fp1, fp2) >= 1.0
def create_fpmols(smiles: List[str] or str) -> List or str:
    """Fingerprint one SMILES string or a list of SMILES strings.

    A ``str`` yields a single fingerprint and a ``list`` yields a list of
    fingerprints in the same order. Any other type raises ``ValueError``.
    """
    if isinstance(smiles, str):
        single_mol = Chem.MolFromSmiles(smiles)
        return FingerprintMols.FingerprintMol(single_mol)

    if isinstance(smiles, list):
        mols = [Chem.MolFromSmiles(smile) for smile in smiles]
        return [FingerprintMols.FingerprintMol(mol) for mol in mols]

    raise ValueError(f'{type(smiles)} is not supported')
def mtansr(self):
    """
    This rule computes the structural similarity between the two
    molecules held by the instance (``self.moli`` and ``self.molj``).

    Returns
    -------
    scr_tan : float
        the rule score
    """
    fingerprints = [
        FingerprintMols.FingerprintMol(mol) for mol in (self.moli, self.molj)
    ]
    return DataStructs.FingerprintSimilarity(*fingerprints)
def pipe_sim_filter(stream, query, cutoff=80, summary=None,
                    comp_id="pipe_sim_filter"):
    """Yield the records of `stream` whose similarity (in percent) to the
    `query` Smiles is greater or equal than `cutoff`.

    Records without a `mol` field are skipped. If the field `FP_b64`
    (e.g. pre-calculated) is present, this will be used, otherwise the
    fingerprint of the Murcko scaffold will be generated on-the-fly
    (much slower). The fingerprint type follows the module-level `USE_FP`.
    Passing records get their similarity stored under `Sim`, rounded to two
    decimals."""

    def _scaffold_fp(mol):
        # Fingerprint of the Murcko scaffold, in the flavour set by USE_FP.
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        if USE_FP == "morgan":
            return Desc.rdMolDescriptors.GetMorganFingerprintAsBitVect(
                scaffold, 2)
        if USE_FP == "avalon":
            return pyAv.GetAvalonFP(scaffold, 1024)
        return FingerprintMols.FingerprintMol(scaffold)

    query_mol = Chem.MolFromSmiles(query)
    if not query_mol:
        print("* {} ERROR: could not generate query from SMILES.".format(
            comp_id))
        return None

    query_fp = _scaffold_fp(query_mol)
    n_passed = 0
    for rec in stream:
        if "mol" not in rec:
            continue

        if "FP_b64" not in rec:
            mol_fp = _scaffold_fp(rec["mol"])
        else:
            # use the pre-defined fingerprint if it is present in the stream
            # NOTE(review): pickle.loads executes arbitrary code on crafted
            # input; confirm the stream only ever comes from trusted sources.
            mol_fp = pickle.loads(b64.b64decode(rec["FP_b64"]))

        sim = DataStructs.FingerprintSimilarity(query_fp, mol_fp)
        if not sim * 100 >= cutoff:
            continue

        n_passed += 1
        rec["Sim"] = np.round(sim * 100, 2)
        if summary is not None:
            summary[comp_id] = n_passed
        yield rec
def score(end, start=None):
    """complexity score of a compound

    Should be high when that compound is difficult to synthesize
    and low when it is easy to synthesize

    When ``start`` is given, the score is the negated fingerprint similarity
    between ``end`` and ``start``. Without ``start``, the score is 2 if
    ``Hydrobromination.has_br(end)`` is true and 1 otherwise.
    """
    if start is not None:
        # Fingerprints are only needed (and only computable) with a start
        # molecule; building them up front made the default call crash.
        f_end = FingerprintMols.FingerprintMol(end)
        f_start = FingerprintMols.FingerprintMol(start)
        sim = DataStructs.FingerprintSimilarity(f_end, f_start)
        return -1 * sim  # lower scores are better

    return 2 if Hydrobromination.has_br(end) else 1
def __post_init__(self):
    """Properly initialize some class variable representations (or try).

    For each of ``reactants``, ``reagents`` and ``products`` the SMILES are
    parsed into ``mol_<role>`` and then fingerprinted into ``fp_<role>``.
    """
    roles = ("reactants", "reagents", "products")

    # Parse every role first, then fingerprint.
    for role in roles:
        parsed = [Chem.MolFromSmiles(smi) for smi in getattr(self, role)]
        setattr(self, f"mol_{role}", parsed)

    for role in roles:
        fingerprints = [
            FingerprintMols.FingerprintMol(mol)
            for mol in getattr(self, f"mol_{role}")
        ]
        setattr(self, f"fp_{role}", fingerprints)
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X,
           simType):
    """Summarise the X highest similarities between predEx and the train set.

    Parameters
    ----------
    trainSmilesList : sequence
        Items handed directly to the fingerprint function, one per entry of
        ``train`` and in the same order.
        NOTE(review): despite the name these are fingerprinted without
        parsing, so they are presumably molecule objects - confirm.
    train : iterable
        Training examples; ``ex[nameAttr].value`` is used as the key.
    predEx
        Query example; ``predEx[smilesAttrName].value`` is parsed as SMILES.
    smilesAttrName, nameAttr
        Attribute keys for the SMILES and the name.
    X : int
        Number of nearest neighbours to keep.
    simType : str
        ``"Topological"``, ``"Morgan"`` (Dice similarity) or ``"MACCS"``.

    Returns
    -------
    tuple
        ``(medSim, stdSim, minSim, maxSim, entropy, entropyClosest)``, each
        rounded to 3 decimals. ``entropyClosest`` uses only the closest
        ``X // 2`` neighbours.

    Raises
    ------
    ValueError
        If ``simType`` is not one of the supported names.
    """

    def _morgan_fp(mol):
        return AllChem.GetMorganFingerprint(mol, 2)

    # Fail fast on an unknown type; previously this only printed a message
    # and then crashed later on an undefined variable.
    if simType == "Topological":
        make_fp = FingerprintMols.FingerprintMol
        similarity = DataStructs.FingerprintSimilarity
    elif simType == "Morgan":
        make_fp = _morgan_fp
        similarity = DataStructs.DiceSimilarity
    elif simType == "MACCS":
        make_fp = MACCSkeys.GenMACCSKeys
        similarity = DataStructs.FingerprintSimilarity
    else:
        raise ValueError("This type of sim is not implemented " +
                         str(simType))

    fpsTrain = [make_fp(x) for x in trainSmilesList]
    fp = make_fp(Chem.MolFromSmiles(predEx[smilesAttrName].value))

    simDict = {}
    simList = []
    for idx, ex in enumerate(train):
        sim = similarity(fpsTrain[idx], fp)
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    # Keep the X most similar training examples.
    simList.sort(reverse=True)
    simList = simList[0:X]

    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)
    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    # Floor division keeps the slice bound an int.
    entropyClosest = round(
        getRespVar(simList[0:X // 2], simDict, train, nameAttr), 3)
    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
def _pathFingerprintsClustering(rdkit_mols):
    """
    Returns the tanimoto distance matrix based on fingerprints method

    Parameters
    ----------
    rdkit_mols: list
        The list of rdkit.Chem.rdchem.Mol objects

    Returns
    -------
    tanimotomatrix: np.array
        The numpy array containing the tanimoto matrix
    """
    from rdkit.Chem.Fingerprints import FingerprintMols

    # compute the path fingerprints
    fps = [FingerprintMols.FingerprintMol(mol) for mol in tqdm(rdkit_mols)]

    # One job per fingerprint, each comparing it against the full list
    executor = ParallelExecutor(n_jobs=-1)  # _config['ncpus'])
    run_jobs = executor(total=len(fps), desc='PathFingerprints Distance')
    tanimoto_matrix = run_jobs(
        delayed(TanimotoDistances)(fp, fps) for fp in fps)
    return np.array(tanimoto_matrix)
def CalculateDaylightFingerprint(mol):
    """
    #################################################################
    Calculate Daylight-like fingerprint or topological fingerprint.

    Usage:

        result=CalculateDaylightFingerprint(mol)

        Input: mol is a molecule object.

        Output: result is a tuple form. The first is the number of
        fingerprints (always reported as 2048). The second is a dict
        mapping the position of every on bit to 1. The third is the
        DataStructs bit vector, which is used for calculating the
        similarity.
    #################################################################
    """
    bit_vector = FingerprintMols.FingerprintMol(mol)
    on_bits = {position: 1 for position in bit_vector.GetOnBits()}
    return 2048, on_bits, bit_vector
def make_matrix(file1, file2):
    """Return the pairwise Tanimoto similarity matrix of a set plus a query.

    Parameters
    ----------
    file1
        Table with ``canonical_smiles`` and ``name`` columns.
        NOTE(review): presumably a pandas DataFrame - confirm against caller.
    file2
        Appended to the SMILES list as the query; labelled ``"My_query"``.

    Returns
    -------
    pandas.DataFrame
        Square similarity matrix; rows and columns are labelled with the
        names from ``file1`` followed by ``"My_query"``.
    """
    smiles = list(file1.canonical_smiles)
    smiles.append(file2)
    labels = list(pd.DataFrame(file1).name)
    labels.append("My_query")

    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    my_fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    # Only similarities are returned; the distance matrix that used to be
    # computed alongside was never used and doubled the pairwise work.
    simil = [DataStructs.BulkTanimotoSimilarity(fp, my_fps) for fp in my_fps]

    df_simil = pd.DataFrame(np.array(simil))
    df_simil.columns = labels
    df_simil.index = labels
    return df_simil
def computeFP(self, typeFP):
    """Compute the requested fingerprint(s) of ``self.smiclean``.

    ``typeFP`` is one of ``"Mol"``, ``"MACCS"``, ``"pairs"``, ``"Torsion"``,
    ``"Morgan"`` or ``"All"``. The parsed molecule is stored in ``self.mol``
    and the fingerprints in the dict ``self.FP``, keyed by type name.

    Returns 0 on success. Returns 1, after appending a message to
    ``self.log``, when no ``smiclean`` attribute has been set.
    """
    from rdkit.Chem.Fingerprints import FingerprintMols
    from rdkit.Chem import MACCSkeys
    from rdkit.Chem.AtomPairs import Pairs, Torsions
    from rdkit.Chem import AllChem

    if "smiclean" not in self.__dict__:
        self.log = self.log + "No smiles prepared\n"
        return 1

    self.mol = Chem.MolFromSmiles(self.smiclean)

    # Wrapped in lambdas so a builder is only resolved when it is selected.
    builders = (
        ("Mol", lambda mol: FingerprintMols.FingerprintMol(mol)),
        ("MACCS", lambda mol: MACCSkeys.GenMACCSKeys(mol)),
        ("pairs", lambda mol: Pairs.GetAtomPairFingerprint(mol)),
        ("Torsion",
         lambda mol: Torsions.GetTopologicalTorsionFingerprint(mol)),
        ("Morgan", lambda mol: AllChem.GetMorganFingerprint(mol, 2)),
    )
    computed = {}
    for fp_name, build in builders:
        if typeFP == fp_name or typeFP == "All":
            computed[fp_name] = build(self.mol)

    self.FP = computed
    return 0
def ScreenInDb(details, mol):
    """Screen the database for molecules similar to ``mol``.

    The probe fingerprint is built with ``details.__dict__`` as keyword
    arguments. For the Tanimoto, Dice and Cosine metrics the similarity is
    evaluated in SQL through ``rd_tanimoto``/``rd_dice``/``rd_cosine``; any
    other metric is delegated to ``ScreenFingerprints``. With
    ``details.doThreshold`` set, rows are filtered by
    ``details.screenThresh``; otherwise at most ``details.topN`` rows are
    returned when it is positive. Rows come back ordered by descending
    similarity.

    Returns the fetched rows, or an empty list if fingerprinting fails.
    """
    try:
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback
        FingerprintMols.error('Error: problems fingerprinting molecule.\n')
        traceback.print_exc()
        return []

    sql_functions = (
        (DataStructs.TanimotoSimilarity, 'rd_tanimoto'),
        (DataStructs.DiceSimilarity, 'rd_dice'),
        (DataStructs.CosineSimilarity, 'rd_cosine'),
    )
    func = None
    for metric, sql_name in sql_functions:
        if details.metric == metric:
            func = sql_name
            break
    if func is None:
        # No SQL-side implementation for this metric
        return ScreenFingerprints(details, data=GetFingerprints(details),
                                  mol=mol)

    conn: DbConnect = _ConnectToDatabase(details)
    pkl = probeFp.ToBitString()
    similarity_column = (
        f"{func}({DbModule.placeHolder},{details.fpColName}) as tani")
    cmd = _ConstructSQL(details, extraFields=similarity_column)

    if details.doThreshold:
        # we need to do a subquery here:
        cmd = f"select * from ({cmd}) tmp where tani>{details.screenThresh}"
        cmd += " order by tani desc"
    else:
        cmd += " order by tani desc"
        if details.topN > 0:
            cmd += f" limit {details.topN}"

    curs = conn.GetCursor()
    curs.execute(cmd, (pkl, ))
    return curs.fetchall()
def get_fp(mols):
    """Fingerprint every truthy molecule in ``mols``.

    The fingerprint type is taken from the module-level ``args.fpType``:
    ``ECFP4``, ``ECFP6``, ``ECFP12`` (Morgan radius 2, 3, 6), ``MACCS``,
    ``Daylight`` or ``AP`` (hashed atom pairs, 4096 bits). Falsy entries of
    ``mols`` are skipped. An unrecognised type yields an empty list.
    """
    builders = {
        'ECFP4': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 2),
        'ECFP6': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 3),
        'ECFP12': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, 6),
        'MACCS': lambda mol: Chem.MACCSkeys.GenMACCSKeys(mol),
        'Daylight': lambda mol: FingerprintMols.FingerprintMol(mol),
        'AP': lambda mol: GetHashedAtomPairFingerprintAsBitVect(mol,
                                                                nBits=4096),
    }
    build = builders.get(args.fpType)
    if build is None:
        return []
    return [build(mol) for mol in mols if mol]
def tanimoto_similarity(self):
    """Return the fingerprint similarity of ``self.smiles1`` and ``self.smiles2``."""
    first_mol = Chem.MolFromSmiles(self.smiles1)
    second_mol = Chem.MolFromSmiles(self.smiles2)
    first_fp = FingerprintMols.FingerprintMol(first_mol)
    second_fp = FingerprintMols.FingerprintMol(second_mol)
    return DataStructs.FingerprintSimilarity(first_fp, second_fp)
def compute_tsne(self):
    """Embed the molecules of ``self.Database2`` in 2D with t-SNE.

    The SMILES column is fingerprinted, the pairwise similarities from
    ``GetTanimotoSimMat`` are expanded into a full symmetric matrix,
    converted to distances (1 - similarity) and passed to t-SNE. The
    result is stored in ``self.tsne_result``, indexed by ``TIPO``.
    """
    Database = self.Database2
    mols = [Chem.MolFromSmiles(smi) for smi in list(Database["SMILES"])]
    fps = [FingerprintMols.FingerprintMol(mol) for mol in mols]

    # Fill the strict lower triangle, then mirror it into the upper one;
    # the diagonal keeps its initial value of 1.
    lower_triangle_values = GetTanimotoSimMat(fps)
    n_mol = len(fps)
    similarity_matrix = np.ones([n_mol, n_mol])
    lower = np.tril_indices(n=n_mol, m=n_mol, k=-1)
    upper = np.triu_indices(n=n_mol, m=n_mol, k=1)
    similarity_matrix[lower] = lower_triangle_values
    similarity_matrix[upper] = similarity_matrix.T[upper]
    distance_matrix = 1 - similarity_matrix

    embedder = TSNE(n_components=2,
                    init='pca',
                    random_state=1992,
                    angle=0.3,
                    perplexity=self.perplexity)
    embedding = embedder.fit_transform(distance_matrix)

    tsne_result = pd.DataFrame(data=embedding, columns=["PC1", "PC2"])
    tsne_result["LIBRARY"] = list(Database.LIBRARY)
    # NOTE(review): TIPO is filled from the LIBRARY column as well - confirm
    # this is intended.
    tsne_result["TIPO"] = list(Database.LIBRARY)
    tsne_result["SMILES"] = list(Database.SMILES)
    tsne_result["NAME"] = list(Database.NAME)
    self.tsne_result = tsne_result.set_index('TIPO')