def similarity(a, b):
    """Tanimoto similarity between two SMILES strings.

    Fingerprints are 2048-bit Morgan fingerprints of radius 2 (ECFP4),
    chirality ignored.

    Parameters:
        a, b: SMILES strings (may be None).

    Returns:
        float similarity in [0, 1]; 0.0 when either input is None or
        fails to parse.
    """
    if a is None or b is None:
        return 0.0
    amol = Chem.MolFromSmiles(a)
    bmol = Chem.MolFromSmiles(b)
    if amol is None or bmol is None:
        return 0.0
    fp1 = AllChem.GetMorganFingerprintAsBitVect(amol, 2, nBits=2048, useChirality=False)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(bmol, 2, nBits=2048, useChirality=False)
    # NOTE(review): removed leftover debug code that recomputed the value via
    # sklearn's jaccard_score, printed diagnostics, asserted agreement to
    # 1e-10, and then called TanimotoSimilarity a second time -- pure
    # overhead (and an assert-crash risk) on every production call.
    return DataStructs.TanimotoSimilarity(fp1, fp2)
def __call__(self, smile):
    """Score a SMILES string with the wrapped classifier.

    Pipeline: Morgan fingerprint (radius 4) -> self.clf.predict ->
    zeroed unless the H-added molecule matches self.smarts -> zeroed if
    it is a near-duplicate (Tanimoto > 0.99) of a training fingerprint ->
    clipped/scaled into [0, 1].

    Returns:
        float score in [0, 1]; 0.0 on parse or scoring failure.
    """
    mol = Chem.MolFromSmiles(smile)
    if mol:
        try:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 4)
            fp_4 = np.array(fp).reshape(1, -1)
            score = self.clf.predict(fp_4)[0]
            # HasSubstructMatch returns a bool; multiplying zeroes the
            # score for molecules lacking the required substructure.
            score *= Chem.AddHs(mol).HasSubstructMatch(
                Chem.MolFromSmarts(self.smarts))
            if max(DataStructs.TanimotoSimilarity(query_fp, fp)
                   for query_fp in self.test_fps) > 0.99:
                print("Found original molecule: " + smile)
            # Penalize memorization: training-set near-duplicates score 0.
            if max(DataStructs.TanimotoSimilarity(query_fp, fp)
                   for query_fp in self.train_fps) > 0.99:
                score = 0
            if score > 7.5:
                score = 1
            else:
                score *= 1 / 7.5
            return float(score)
        except Exception:
            # BUG FIX: was a bare `except:` which also swallowed
            # SystemExit/KeyboardInterrupt.
            return 0.0
    return 0.0
def _bulkTest(self, bvs):
    """Verify bulk similarity/distance calls against their single-pair
    equivalents (including the ToBinary() overloads), and that
    Tversky(1, 1) coincides with Tanimoto."""
    for metric in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
        bulk = getattr(DataStructs, f'Bulk{metric}Similarity')
        single = getattr(DataStructs, f'{metric}Similarity')

        sims = bulk(bvs[0], bvs)
        for i, bv in enumerate(bvs):
            expected = single(bvs[0], bv)
            self.assertEqual(expected, sims[i])
            # pickled form must compare identically
            self.assertEqual(expected, single(bvs[0], bv.ToBinary()))

        dists = bulk(bvs[0], bvs, returnDistance=True)
        for i, bv in enumerate(bvs):
            expected = single(bvs[0], bv, returnDistance=True)
            self.assertEqual(expected, dists[i])
            self.assertEqual(expected,
                             single(bvs[0], bv.ToBinary(), returnDistance=True))

    # Tversky with alpha = beta = 1 degenerates to Tanimoto.
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
    for i, bv in enumerate(bvs):
        self.assertEqual(DataStructs.TverskySimilarity(bvs[0], bv, 1, 1), sims[i])
        self.assertEqual(DataStructs.TanimotoSimilarity(bvs[0], bv), sims[i])

    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1, returnDistance=True)
    for i, bv in enumerate(bvs):
        self.assertEqual(
            DataStructs.TverskySimilarity(bvs[0], bv, 1, 1, returnDistance=True),
            sims[i])
        self.assertEqual(
            DataStructs.TanimotoSimilarity(bvs[0], bv, returnDistance=True),
            sims[i])
def testBitVectorLeader1(self):
    """Leader picking on bit-vector fingerprints: check pick counts and
    that every picked pair is at least `thresh` apart."""
    fname = os.path.join(RDConfig.RDBaseDir, 'Code', 'SimDivPickers', 'Wrap',
                         'test_data', 'chembl_cyps.head.fps')
    with open(fname) as infil:
        fps = [DataStructs.CreateFromFPSText(line.strip()) for line in infil]
    picker = rdSimDivPickers.LeaderPicker()

    def checkDiversity(ids, thresh):
        # pairwise Tanimoto distance of all picks must meet the threshold
        for i in range(len(ids)):
            for j in range(i):
                dist = 1 - DataStructs.TanimotoSimilarity(fps[ids[i]], fps[ids[j]])
                self.assertGreaterEqual(dist, thresh)

    thresh = 0.8
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 146)
    checkDiversity(ids, thresh)

    thresh = 0.9
    ids = picker.LazyBitVectorPick(fps, len(fps), thresh)
    self.assertEqual(len(ids), 14)
    checkDiversity(ids, thresh)

    ids = picker.LazyBitVectorPick(fps, len(fps), thresh, pickSize=10)
    self.assertEqual(len(ids), 10)
    checkDiversity(ids, thresh)
def optimize(self, smiles, sim_cutoff, lr=2.0, num_iter=20):
    """Gradient-ascent property optimization in JT-VAE latent space.

    Encodes `smiles`, takes `num_iter` gradient steps of size `lr` on the
    property network output, then binary-searches the visited latent
    vectors for the furthest step whose decoded molecule still has
    Tanimoto similarity >= `sim_cutoff` to the input.

    Returns:
        (new_smiles, similarity); falls back to (smiles, 1.0) when no
        acceptable decoded improvement is found.
    """
    mol_tree = MolTree(smiles)
    mol_tree.recover()
    _, tree_vec, mol_vec = self.encode([mol_tree])
    mol = Chem.MolFromSmiles(smiles)
    fp1 = AllChem.GetMorganFingerprint(mol, 2)

    tree_mean = self.T_mean(tree_vec)
    tree_log_var = -torch.abs(self.T_var(tree_vec))  # Following Mueller et al.
    mol_mean = self.G_mean(mol_vec)
    mol_log_var = -torch.abs(self.G_var(mol_vec))  # Following Mueller et al.

    mean = torch.cat([tree_mean, mol_mean], dim=1)
    log_var = torch.cat([tree_log_var, mol_log_var], dim=1)
    cur_vec = create_var(mean.data, True)

    visited = []
    # BUG FIX: xrange is Python-2 only; use range.
    for _ in range(num_iter):
        prop_val = self.propNN(cur_vec).squeeze()
        grad = torch.autograd.grad(prop_val, cur_vec)[0]
        cur_vec = cur_vec.data + lr * grad.data
        cur_vec = create_var(cur_vec, True)
        visited.append(cur_vec)

    # Binary search for the last visited vector above the similarity cutoff.
    l, r = 0, num_iter - 1
    while l < r - 1:
        # BUG FIX: (l + r) / 2 is a float on Python 3 and cannot index
        # `visited`; use integer division.
        mid = (l + r) // 2
        new_vec = visited[mid]
        tree_vec, mol_vec = torch.chunk(new_vec, 2, dim=1)
        new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
        if new_smiles is None:
            r = mid - 1
            continue
        new_mol = Chem.MolFromSmiles(new_smiles)
        fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
        sim = DataStructs.TanimotoSimilarity(fp1, fp2)
        if sim < sim_cutoff:
            r = mid - 1
        else:
            l = mid

    tree_vec, mol_vec = torch.chunk(visited[l], 2, dim=1)
    new_smiles = self.decode(tree_vec, mol_vec, prob_decode=False)
    if new_smiles is None:
        return smiles, 1.0
    new_mol = Chem.MolFromSmiles(new_smiles)
    fp2 = AllChem.GetMorganFingerprint(new_mol, 2)
    sim = DataStructs.TanimotoSimilarity(fp1, fp2)
    if sim >= sim_cutoff:
        return new_smiles, sim
    return smiles, 1.0
def compareAll(fpA_dict, fpB_dict=None, cutoff=None):
    """Pairwise Tanimoto similarities between fingerprint dictionaries.

    Parameters:
        fpA_dict: {name: [fingerprint, smiles]}.
        fpB_dict: optional second dictionary; when given, all A-vs-B pairs
            are compared, otherwise all unordered pairs within fpA_dict.
        cutoff: optional minimum similarity; pairs below it are dropped.

    Returns:
        pandas DataFrame with columns
        ['cmpd1', 'cmpd2', 'smiles1', 'smiles2', 'similarity'].
    """
    simD = {}
    namesA = list(fpA_dict.keys())
    nA = len(namesA)

    # BUG FIX: the original tested the undefined name `fpB` (NameError at
    # runtime); the parameter is `fpB_dict`.
    if fpB_dict is None:
        # Single input set: every unordered pair, stored symmetrically.
        for i in range(nA):
            name1 = namesA[i]
            simD[name1] = {}
            [fp1, smiles1] = fpA_dict[name1]
            for j in range(i + 1, nA):
                name2 = namesA[j]
                [fp2, smiles2] = fpA_dict[name2]
                sim = DataStructs.TanimotoSimilarity(fp1, fp2)
                if cutoff is not None and sim < cutoff:
                    simD[name1][name2] = None
                    simD[name2][name1] = None
                else:
                    simD[name1][name2] = [smiles1, smiles2, sim]
                    simD[name2][name1] = [smiles2, smiles1, sim]
    else:
        # Two input sets: every A-vs-B pair.
        namesB = list(fpB_dict.keys())
        for nameA in namesA:
            simD[nameA] = {}
            [fpA, smilesA] = fpA_dict[nameA]
            for nameB in namesB:
                [fpB, smilesB] = fpB_dict[nameB]
                sim = DataStructs.TanimotoSimilarity(fpA, fpB)
                if cutoff is not None and sim < cutoff:
                    simD[nameA][nameB] = None
                else:
                    simD[nameA][nameB] = [smilesA, smilesB, sim]

    # BUG FIX: the original filtered on `simD[x] is not None`, but simD[x]
    # is always a dict (never None), and below-cutoff *pair* entries stayed
    # None, crashing later on `[i, j] + None`.  Drop None pairs and
    # compounds with no remaining neighbours instead.
    d = {}
    for name, neighbours in simD.items():
        kept = {other: rec for other, rec in neighbours.items() if rec is not None}
        if kept:
            d[name] = kept

    df = pd.DataFrame.from_records([[i, j] + d[i][j] for i in d for j in d[i]])
    df.columns = ['cmpd1', 'cmpd2', 'smiles1', 'smiles2', 'similarity']
    return df
def test10BulkOps2(self):
    """Bulk similarity ops over a *tuple* of bit vectors match the
    corresponding single-pair calls; Tversky(1,1) ~ Tanimoto and
    Tversky(.5,.5) ~ Dice."""
    nbits = 10000
    bvs = []
    for _ in range(10):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            bv.SetBit(random.randrange(0, nbits))
        bvs.append(bv)
    bvs = tuple(bvs)

    metric_pairs = (
        (DataStructs.BulkTanimotoSimilarity, DataStructs.TanimotoSimilarity),
        (DataStructs.BulkDiceSimilarity, DataStructs.DiceSimilarity),
        (DataStructs.BulkAllBitSimilarity, DataStructs.AllBitSimilarity),
        (DataStructs.BulkOnBitSimilarity, DataStructs.OnBitSimilarity),
        (DataStructs.BulkRogotGoldbergSimilarity, DataStructs.RogotGoldbergSimilarity),
    )
    for bulk, single in metric_pairs:
        sims = bulk(bvs[0], bvs)
        for i, bv in enumerate(bvs):
            self.failUnless(feq(single(bvs[0], bv), sims[i]))

    # Tversky with alpha = beta = 1 reduces to Tanimoto.
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
    for i, bv in enumerate(bvs):
        self.failUnless(feq(DataStructs.TverskySimilarity(bvs[0], bv, 1, 1), sims[i]))
        self.failUnless(feq(DataStructs.TanimotoSimilarity(bvs[0], bv), sims[i]))

    # Tversky with alpha = beta = 0.5 reduces to Dice.
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, .5, .5)
    for i, bv in enumerate(bvs):
        self.failUnless(feq(DataStructs.TverskySimilarity(bvs[0], bv, .5, .5), sims[i]))
        self.failUnless(feq(DataStructs.DiceSimilarity(bvs[0], bv), sims[i]))
def run(self):
    """Rank the database by Tanimoto similarity to the selected molecule
    and emit the 20 best hits (tab-separated lines) through res_signal.
    """
    query_smile = self.dff[self.select_mol]
    query_fp = self.get_fingerprint(query_smile)

    # BUG FIX: the original called get_fingerprint() twice per row (once to
    # test for None, once for the value); compute it once.
    # 0 marks molecules whose fingerprint could not be generated.
    fps = []
    for smi in self.db['SMILES']:
        fp = self.get_fingerprint(smi)
        fps.append(fp if fp is not None else 0)

    self.db['similar'] = [
        0 if fp == 0 else round(DataStructs.TanimotoSimilarity(query_fp, fp), 2)
        for fp in fps
    ]
    top20 = self.db[['db', 'plate', 'Col', 'Row', 'similar',
                     'MOLENAME']].sort_values('similar',
                                              ascending=False).head(20)
    res = '\n'.join(
        '\t'.join(str(v) for v in row) for _, row in top20.iterrows())
    self.res_signal.emit(res)
def similarity(a, b, chiral=True):
    """Tanimoto similarity between two SMILES strings.

    Fingerprints are 2048-bit Morgan fingerprints of radius 2 (ECFP4).

    Parameters:
        a, b: SMILES strings (may be None).
        chiral: whether chirality contributes to the fingerprint.

    Returns:
        float similarity in [0, 1]; 0.0 when either input is None or
        fails to parse.
    """
    if a is None or b is None:
        return 0.0
    amol = Chem.MolFromSmiles(a)
    bmol = Chem.MolFromSmiles(b)
    # BUG FIX: MolFromSmiles returns None for unparseable SMILES, which made
    # the fingerprint call raise; return 0.0 instead, matching the
    # None-input contract (and the sibling similarity() in this file).
    if amol is None or bmol is None:
        return 0.0
    fp1 = AllChem.GetMorganFingerprintAsBitVect(amol, 2, nBits=2048, useChirality=chiral)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(bmol, 2, nBits=2048, useChirality=chiral)
    return DataStructs.TanimotoSimilarity(fp1, fp2)
def sim_filter(self, query, cutoff=0.75):
    """Filter the dataset down to records similar to `query`.

    Parameters:
        query: a SMILES string or an already-built mol object (the latter
            is deep-copied before use).
        cutoff: minimum Tanimoto similarity to keep a record.

    Returns:
        A new object of the same class (via self.new()) whose .data frame
        holds the matching records plus a "Sim" column.

    Raises:
        KeyError: when no fingerprints were generated yet (add_fp()).
        ValueError: when the query cannot be turned into a mol.
    """
    if len(self.fp_name) == 0 or self.fp_col not in self.data.keys():
        raise KeyError("No fingerprints found. Please generate them first with add_fp().")
    data_len = len(self.data)
    # Progress bar only in notebooks and only for big datasets.
    show_prog = IPYTHON and data_len > 5000
    if show_prog:
        ctr = nbt.ProgCtr()
        pb = nbt.Progressbar()
    if isinstance(query, str):
        query_mol = Chem.MolFromSmiles(query)
    else:
        query_mol = deepcopy(query)
    if not query_mol:
        raise ValueError("Could not generate query mol.")
    # Same fingerprint method that add_fp() used for the stored column.
    fp_method = FPDICT[self.fp_name]
    query_fp = fp_method(query_mol)
    res_l = []
    for _, rec in self.data.iterrows():
        if show_prog:
            ctr.inc()
            pb.update(100 * ctr() / data_len)
        # Stored fingerprints are base64-encoded pickles.
        # NOTE(review): pickle.loads on stored data is only safe if the
        # fingerprint column is always produced locally by add_fp() --
        # confirm it never comes from untrusted input.
        mol_fp = pickle.loads(b64.b64decode(rec[self.fp_col]))
        sim = DataStructs.TanimotoSimilarity(query_fp, mol_fp)
        if sim >= cutoff:
            rec["Sim"] = sim
            res_l.append(rec)
    result = self.new()
    result.data = pd.DataFrame(res_l)
    print_log(result.data, "sim_filter")
    if show_prog:
        pb.done()
    return result
def create_tanimoto_column(smiles_A, smiles_B):
    """Elementwise Tanimoto similarity between two parallel SMILES series.

    Rows where either side is NaN or the literal string 'nan' are dropped
    before computing Morgan fingerprints of radius 2.

    Returns:
        numpy array of pairwise similarities, one per surviving row.
    """
    pairs = pd.DataFrame({'A': smiles_A, 'B': smiles_B})
    # drop rows whose SMILES were stringified missing values
    pairs = pairs.iloc[np.logical_and(pairs['A'].values != 'nan',
                                      pairs['B'].values != 'nan')]
    pairs.dropna(inplace=True)
    mols_a = [Chem.MolFromSmiles(smi) for smi in pairs.A]
    mols_b = [Chem.MolFromSmiles(smi) for smi in pairs.B]
    fps_a = [AllChem.GetMorganFingerprint(m, 2) for m in mols_a]
    fps_b = [AllChem.GetMorganFingerprint(m, 2) for m in mols_b]
    return np.array([DataStructs.TanimotoSimilarity(fa, fb)
                     for fa, fb in zip(fps_a, fps_b)])
def similar_smiles(self, peptide_to_match):
    """Calculate similarity but using SMILES representations of the peptides.

    Builds a SMILES string for `peptide_to_match` residue by residue, then
    stores the Tanimoto coefficient between the 2048-bit Morgan
    fingerprints of both molecules in self.smiles_similarity.

    Arguments:
        peptide_to_match -- peptide sequence that will be compared.
    """
    # This peptide's molecule, named after its sequence.
    mol1 = Chem.MolFromSmiles(self.smiles)
    mol1.SetProp("_Name", self.sequence)

    # Chain residue SMILES together; each step drops the trailing atom of
    # the running string before appending the next residue.
    connect_smiles = 'O'
    for res in peptide_to_match:
        connect_smiles = connect_smiles[:-1] + aminoacidSMILES(res)
    mol2 = Chem.MolFromSmiles(connect_smiles)
    mol2.SetProp("_Name", peptide_to_match)

    # Morgan fingerprints (radius 2, 2048 bits) and Tanimoto coefficient.
    fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2, 2048)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2, 2048)
    self.smiles_similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
def doSimilarityWeightedAdAnalysis(model_name):
    """Applicability-domain (AD) analysis weighted by similarity.

    For every molecule in the global `rdkit_mols`, computes a similarity
    weight against each training instance of `model_name` and flags the
    molecule as inside the AD when any weight reaches the percentile
    threshold derived from the global `ad_settings`.

    Returns:
        (ad_idx, known): indices of molecules inside the AD, and a numpy
        array of [mol_idx, training_label] rows for exact training-set
        matches (only collected when options.known is set).

    NOTE(review): ad_data rows are indexed positionally ([0] fingerprint,
    [1] label?, [2]/[3] weighting terms, [:,5] threshold column) -- the
    exact column semantics come from getAdData and are not visible here.
    """
    global rdkit_mols, ad_settings
    ad_idx = []
    known = []
    ad_data = getAdData(model_name)
    # AD threshold: the ad_settings-th percentile of column 5.
    required_threshold = np.percentile(ad_data[:, 5], ad_settings)
    for mol_idx, m in enumerate(rdkit_mols):
        ad_flag = True
        #only check for known compounds if set in options (True means check)
        if options.known:
            k_flag = True
        else:
            k_flag = False
        for training_instance in ad_data:
            sim = DataStructs.TanimotoSimilarity(m, training_instance[0])
            #check if input=train & need to check input=train
            if sim == 1.0 and k_flag == True:
                known.append([mol_idx, training_instance[1]])
                k_flag = False
            weight = sim / (training_instance[2] * training_instance[3])
            #if comp in AD & no comp already in AD
            if weight >= required_threshold and ad_flag == True:
                ad_idx.append(mol_idx)
                ad_flag = False
            #if compound is in AD and no need to check accross all comps for known then break
            if k_flag == False and ad_flag == False:
                break
    return ad_idx, np.array(known)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    """Score fragments by best topological-torsion similarity to actives.

    For each molecule in `fragments_file` (one JSON record per line with
    "smiles" and "name"), computes the maximum Tanimoto similarity of its
    topological-torsion fingerprint against all active molecules of the
    model, and writes {"name", "score"} records (newline separated) to
    `output_file`.  `descriptors_file` is unused but kept for interface
    compatibility.
    """
    inputoutput_utils.create_parent_directory(output_file)
    model_data = model_configuration["data"]
    radius = int(
        model_configuration["configuration"]["fragments"][0]["size"])

    # Fingerprints of the known actives, computed once up front.
    active_fps = []
    for active_smiles in model_data["active"]:
        active_mol = Chem.MolFromSmiles(active_smiles.strip("\""))
        active_fps.append(
            Torsions.GetTopologicalTorsionFingerprintAsIntVect(
                active_mol, radius))

    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for raw_line in input_stream:
                record = json.loads(raw_line)
                test_mol = Chem.MolFromSmiles(record["smiles"].strip("\""))
                test_fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(
                    test_mol, radius)
                best = max(
                    DataStructs.TanimotoSimilarity(test_fp, active_fp)
                    for active_fp in active_fps)
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump({"name": record["name"], "score": best},
                          output_stream)
def pair_similiar(self, valid_smiles):
    """Pairwise-similarity statistics over a batch of SMILES.

    Computes the Tanimoto similarity (1024-bit chiral Morgan fingerprints,
    radius 2) for every unordered pair and returns

        (fraction of pairs with similarity > 0.75, mean pair similarity)

    both as *strings*; when fewer than three SMILES are supplied it
    returns the ints (0, 0) instead.  NOTE(review): callers must cope
    with this int-vs-str asymmetry.
    """
    if len(valid_smiles) <= 2:
        return 0, 0
    mols = [Chem.MolFromSmiles(smi) for smi in valid_smiles]
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024,
                                              useChirality=True)
        for m in mols
    ]
    # all unordered pairs (a, b) with a < b
    sims = [
        DataStructs.TanimotoSimilarity(fps[a], fps[b])
        for a in range(len(fps)) for b in range(a + 1, len(fps))
    ]
    sims_arr = np.array(sims)
    high_rate = sims_arr[sims_arr > 0.75].shape[0] / len(sims)
    mean_sim = sum(sims) / len(sims)
    return str(high_rate), str(mean_sim)
def getSimilarity(self, reference, method='tanimoto', alpha=None, beta=None):
    """Similarity between this object's IFPvector and `reference`'s.

    method: 'tanimoto', 'dice', or 'tversky' (the latter uses alpha/beta).
    Any other method name falls through and returns None.
    """
    if method == 'tversky':
        # Tversky takes the extra alpha/beta weights.
        return DataStructs.TverskySimilarity(reference.IFPvector,
                                             self.IFPvector, alpha, beta)
    two_arg_metrics = {
        'tanimoto': DataStructs.TanimotoSimilarity,
        'dice': DataStructs.DiceSimilarity,
    }
    metric = two_arg_metrics.get(method)
    if metric is not None:
        return metric(reference.IFPvector, self.IFPvector)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    """Score fragments by best hashed atom-pair similarity to actives.

    For each molecule in `fragments_file` (one JSON record per line with
    "smiles" and "name"), computes the maximum Tanimoto similarity of its
    hashed atom-pair fingerprint (nBits from the model configuration)
    against all active molecules, and writes {"name", "score"} records
    (newline separated) to `output_file`.  `descriptors_file` is unused
    but kept for interface compatibility.
    """
    inputoutput_utils.create_parent_directory(output_file)
    model_data = model_configuration["data"]
    nbits = model_configuration["configuration"]["nbits"]

    # Fingerprints of the known actives, computed once up front.
    active_fps = []
    for active_smiles in model_data["active"]:
        active_mol = Chem.MolFromSmiles(active_smiles.strip("\""))
        active_fps.append(
            Pairs.GetHashedAtomPairFingerprint(active_mol, nBits=nbits))

    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for raw_line in input_stream:
                record = json.loads(raw_line)
                test_mol = Chem.MolFromSmiles(record["smiles"].strip("\""))
                test_fp = Pairs.GetHashedAtomPairFingerprint(test_mol,
                                                             nBits=nbits)
                best = max(
                    DataStructs.TanimotoSimilarity(test_fp, active_fp)
                    for active_fp in active_fps)
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump({"name": record["name"], "score": best},
                          output_stream)
def search_files(files, fingerprints):
    """Screen pickled SMILES databases against target fingerprints.

    For each pickled DataFrame in `files` (expected columns:
    'fingerprint' = base64-encoded bit vector, 'canonical_smile'), decodes
    all fingerprints, then for each (smiles, fingerprint) target in
    `fingerprints` computes Tanimoto scores over the whole file, plots a
    sorted score curve per target, writes the top 200 hits per
    (file, target) to 'score.csv', and saves the figure under a random
    'fig<N>.pdf' name.  Progress dots/markers ('.', 'm', 'M') are printed
    every 1e5 / 1e6 / 1e7 rows.
    """
    with open('score.csv', 'w') as csvfile:
        score_writer = csv.writer(csvfile, delimiter=' ')
        for file in files:
            print('Processing file %s at %s'%(file, str(datetime.datetime.now())))
            # NOTE(review): pickle.load on database files -- safe only if
            # the files are locally produced; confirm provenance.
            smiles = pickle.load( open(file, 'rb') )
            # Precompute bitvectors; rows with undecodable fingerprints get None.
            fps = []
            for i, row in smiles.iterrows():
                try:
                    fps += [DataStructs.ExplicitBitVect(base64.b64decode(row['fingerprint']))]
                except:
                    fps += [None]
                    print('None')
                if i%100000==0 and i>0:
                    print('.', end='', flush=True)
                if i%1000000==0 and i>0:
                    if i%10000000==0:
                        print('M', end='', flush=True)
                    else:
                        print('m', end='', flush=True)
            smiles['fp'] = fps
            print('\n Precomputed at %s '%(str(datetime.datetime.now())), end='')
            plt.rcParams["figure.figsize"] = (12,10)
            plt.figure()
            plt.title(file, fontsize=12)
            # For each of our target SMILE strings
            for (insmile, fp2) in fingerprints:
                print('\n %s '%insmile, sep='')
                scores = []
                # For each row in the file we are comparing against.
                # Rows whose fp is None fail TanimotoSimilarity and are
                # silently skipped by the except.
                for i, row in smiles.iterrows():
                    try:
                        score = DataStructs.TanimotoSimilarity(row['fp'], fp2)
                        scores += [(row['canonical_smile'], score)]
                    except:
                        pass
                    if i%100000==0 and i>0:
                        print('.', end='', flush=True)
                    if i%1000000==0 and i>0:
                        if i%10000000==0:
                            print('M', end='', flush=True)
                        else:
                            print('m', end='', flush=True)
                print(' %s'%(str(datetime.datetime.now())))
                # Add line to graph (scores sorted ascending)
                sorted_scores = sorted(scores, key=itemgetter(1))
                scores_only = [x[1] for x in sorted_scores]
                plt.step(np.arange(len(sorted_scores)), np.array(scores_only), label=insmile, linewidth=0.5)
                # Select top 200 (highest scores first)
                lastN = sorted_scores[-200:]
                lastN.reverse()
                for (smile, score) in lastN:
                    score_writer.writerow([file, '%.6f'%score, insmile, smile])
            plt.legend(fontsize=6)
            plt.savefig('fig%d.pdf'%(random.randint(1,100000)))
def computeSimilarityFP(self, c_chem, typeFP, typeMetric):
    """Similarity between this compound's fingerprint and c_chem's.

    Parameters:
        c_chem: other compound object exposing a d_FP dict like self's.
        typeFP: key into the d_FP fingerprint dictionaries.
        typeMetric: one of the DataStructs metric names below.

    Returns:
        The similarity value, or the string "NA" (with a console message
        and a log entry) when the combination is unsupported.
    """
    # Dispatch table replaces the original 12-branch if/elif chain.
    metrics = {
        'Tanimoto': DataStructs.TanimotoSimilarity,
        'Dice': DataStructs.DiceSimilarity,
        'Cosine': DataStructs.CosineSimilarity,
        'Sokal': DataStructs.SokalSimilarity,
        'Russel': DataStructs.RusselSimilarity,
        'RogotGoldberg': DataStructs.RogotGoldbergSimilarity,
        'AllBit': DataStructs.AllBitSimilarity,
        'Kulczynski': DataStructs.KulczynskiSimilarity,
        'McConnaughey': DataStructs.McConnaugheySimilarity,
        'Asymmetric': DataStructs.AsymmetricSimilarity,
        'BraunBlanquet': DataStructs.BraunBlanquetSimilarity,
    }
    try:
        # BUG FIX: an unknown metric used to fall off the if/elif chain and
        # silently return None; now it is reported like the other failures.
        metric_fn = metrics[typeMetric]
        return metric_fn(self.d_FP[typeFP], c_chem.d_FP[typeFP])
    except Exception:
        # missing FP type, unknown metric, or FP/metric incompatibility
        # (was a bare except:)
        print("Combination %s and %s not supported"%(typeFP, typeMetric))
        self.log = "%sCombination %s and %s not supported\n"%(self.log, typeFP, typeMetric)
        return "NA"
def get_similarity(self, smiles):
    """Tanimoto similarity of `smiles` to the target molecule.

    Returns 0.0 when the SMILES cannot be parsed.
    """
    candidate = Chem.MolFromSmiles(smiles)
    if candidate is None:
        return 0.0
    candidate_fp = self.get_fingerprint(candidate)
    return DataStructs.TanimotoSimilarity(self._target_mol_fingerprint,
                                          candidate_fp)
def diversity(smiles_list):
    """Internal diversity of a list of SMILES strings.

    Diversity is the mean Tanimoto *distance* (1 - similarity) over all
    ordered fingerprint pairs, including self-pairs:
    sum(1 - T(i, j)) / n**2, with Morgan fingerprints of radius 6.

    Parameters
    ----------
    smiles_list: list of SMILES strings; invalid entries are reported and
        skipped.

    Returns
    -------
    float internal diversity, or 0.0 when no valid molecule was parsed
    (BUG FIX: the original divided by zero in that case).
    """
    fps = []
    for smi in smiles_list:
        mol = Chem.MolFromSmiles(smi)
        # BUG FIX: MolFromSmiles signals failure by returning None; test
        # for that explicitly instead of the original bare except, which
        # also hid unrelated errors.
        if mol is None:
            print('ERROR: Invalid SMILES!')
            continue
        fps.append(AllChem.GetMorganFingerprint(mol, 6))
    if not fps:
        return 0.0
    td = 0
    for fp_i in fps:
        for fp_j in fps:
            td += 1 - DataStructs.TanimotoSimilarity(fp_i, fp_j)
    return td / len(fps) ** 2
def cal_sim(q, ref_data, return_dict_sim):
    """Multiprocessing worker: max similarity of queued SMILES to a
    reference set.

    Consumes (idx, smiles) tuples from queue `q` until the sentinel
    'DONE'.  For each parseable, sanitizable SMILES it computes the
    1024-bit Morgan (radius 2) fingerprint, finds the most similar entry
    in `ref_data` (sequence whose items hold a fingerprint at index 1),
    and stores [max_similarity, argmax_index] in the shared dict
    `return_dict_sim` keyed by idx.  Unparseable molecules are skipped
    (their idx never appears in the result dict).
    """
    Nref = len(ref_data)
    nbits = 1024
    while True:
        qqq = q.get()
        if qqq == 'DONE':  # poison pill: producer signals no more work
            # print('proc =', os.getpid())
            break
        idx, smi = qqq
        if idx % 10000 == 0:  # lightweight progress indicator
            print(idx)
        Nsmi = len(smi)
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            continue
        # SanitizeMol returns a nonzero error flag when sanitization fails.
        if Chem.SanitizeMol(mol, catchErrors=True):
            continue
        com_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=nbits)
        sim_data = []
        for j in range(Nref):
            ref_fp = ref_data[j][1]
            sim = DataStructs.TanimotoSimilarity(com_fp, ref_fp)
            sim_data += [sim]
        similarity = np.array(sim_data)
        j_max = similarity.argmax()
        sim_max = similarity[j_max]
        return_dict_sim[idx] = [sim_max, j_max]
def calculate_internal_diversity(smiles, ref_smiles, radius=4):
    """Pairwise Tanimoto-distance matrix of the given compounds.

    See http://arxiv.org/abs/1708.08227

    :param smiles: list or tuple of SMILES strings; unparseable entries
        are silently dropped.
    :param ref_smiles: unused, kept for interface compatibility.
    :param radius: circular fingerprint radius (NB: 2 corresponds to ECFP4).
    :return: numpy matrix of 1 - Tanimoto(i, j) over the valid molecules.
        NOTE(review): despite the name this returns the full matrix, not
        a scalar diversity value.
    """
    mols = []
    for s in smiles:
        mol = Chem.MolFromSmiles(s)
        if mol is not None:
            mols.append(mol)
    compounds = [AllChem.GetMorganFingerprint(m, radius) for m in mols]

    # BUG FIX: the matrix was sized by len(smiles), so any dropped invalid
    # SMILES left spurious all-zero rows/columns; size it by the number of
    # fingerprints actually computed.
    n = len(compounds)
    diversity = np.zeros((n, n))
    hist = {}  # memoize symmetric pairs so each distance is computed once
    for i in range(n):
        c1 = compounds[i]
        for j, c2 in enumerate(compounds):
            if (c1, c2) in hist:
                td = hist[(c1, c2)]
            else:
                td = 1 - DataStructs.TanimotoSimilarity(c1, c2)
                hist[(c1, c2)] = hist[(c2, c1)] = td
            diversity[i, j] = td
    return diversity
def _smilarity_between_two_mols(mol1, mol2):
    """Tanimoto similarity of two RDKit mols using 512-bit Morgan
    fingerprints of radius 4."""
    fp_a, fp_b = (
        rdMolDescriptors.GetMorganFingerprintAsBitVect(m, 4, nBits=512)
        for m in (mol1, mol2))
    return DataStructs.TanimotoSimilarity(fp_a, fp_b)
def pharmacophore(mol, target):
    """2D pharmacophore-fingerprint Tanimoto similarity of two molecules.

    Both inputs are standardized, stringified to SMILES, and have neutral
    nitro / aromatic N-oxide spellings rewritten to the charge-separated
    forms before parsing.  Fingerprints use 2- and 3-point pharmacophores
    with distance bins (0,2), (2,5), (5,8).

    Returns:
        float similarity, or the sentinel -100 when either SMILES fails
        to parse (callers rely on this sentinel).
    """
    print('mol/target', mol, target)
    mol.standardize()
    target.standardize()

    def _fix_charges(smi):
        # Rewrite neutral nitro / N-oxide spellings into the charged forms.
        smi = str(smi)
        smi = smi.replace('N(=O)O', '[N+](=O)[O-]')
        smi = smi.replace('N(O)=O', '[N+]([O-])=O')
        return smi.replace('n(O)', '[n+]([O-])')

    mol = _fix_charges(mol)
    target = _fix_charges(target)

    featfactory = load_factory()
    sigfactory = SigFactory(featfactory, minPointCount=2, maxPointCount=3,
                            trianglePruneBins=False)
    sigfactory.SetBins([(0, 2), (2, 5), (5, 8)])
    sigfactory.Init()
    mol1 = Chem.MolFromSmiles(mol)
    mol2 = Chem.MolFromSmiles(target)
    if mol1 and mol2:
        fp1 = Generate.Gen2DFingerprint(mol1, sigfactory)
        fp2 = Generate.Gen2DFingerprint(mol2, sigfactory)
        return DataStructs.TanimotoSimilarity(fp1, fp2)
    # BUG FIX: the original kept a counter `i` that was always 0 here, so
    # the message invariably printed 1; print that constant directly.
    print('ошибка', 1, mol)
    return -100
def struct_score(SMILES1, SMILES2):
    """Tanimoto similarity of two SMILES strings using RDKit topological
    (path-based) fingerprints."""
    from rdkit import Chem, DataStructs
    first = Chem.MolFromSmiles(SMILES1)
    second = Chem.MolFromSmiles(SMILES2)
    first_fp, second_fp = (Chem.RDKFingerprint(m) for m in (first, second))
    return DataStructs.TanimotoSimilarity(first_fp, second_fp)
def test6BulkTversky(self):
    """Bulk Tversky matches single calls; (.5,.5) equals Dice and
    (1,1) equals Tanimoto."""
    sz = 10
    nToSet = 5
    nVs = 6
    import random
    vs = []
    for _ in range(nVs):
        v = ds.IntSparseIntVect(sz)
        for _ in range(nToSet):
            v[random.randint(0, sz - 1)] = random.randint(1, 10)
        vs.append(v)

    baseDs = [ds.TverskySimilarity(vs[0], other, .5, .5) for other in vs[1:]]
    bulkDs = ds.BulkTverskySimilarity(vs[0], vs[1:], 0.5, 0.5)
    diceDs = [ds.DiceSimilarity(vs[0], other) for other in vs[1:]]
    for base, bulk, dice in zip(baseDs, bulkDs, diceDs):
        self.assertTrue(feq(base, bulk))
        self.assertTrue(feq(base, dice))

    # alpha = beta = 1 degenerates to Tanimoto
    bulkDs = ds.BulkTverskySimilarity(vs[0], vs[1:], 1.0, 1.0)
    taniDs = [ds.TanimotoSimilarity(vs[0], other) for other in vs[1:]]
    for bulk, tani in zip(bulkDs, taniDs):
        self.assertTrue(feq(bulk, tani))
    taniDs = ds.BulkTanimotoSimilarity(vs[0], vs[1:])
    for bulk, tani in zip(bulkDs, taniDs):
        self.assertTrue(feq(bulk, tani))
def create_tanimoto_index(similarity_value, aglycon_formulas, fps,
                          df_Without_Double_or_Triple):
    """
    Gets the similarity value, the Morgan-Fingerprint of each aglycon and
    a data frame with only single entries in the taxonomy row.  Checks the
    Tanimoto index for all possible pairs of two aglycons; pairs scoring
    at least `similarity_value` are collected (together with their formula
    pair from `aglycon_formulas`, which is ordered like the combinations)
    into a three-column frame that is handed on to
    create_df_with_tanimoto_index.
    """
    first_aglycons = []
    second_aglycons = []
    tanimoto_indices = []
    # fps pairs appear in the same order as the aglycon_formulas entries
    for pos, (fp_a, fp_b) in enumerate(itertools.combinations(fps, 2)):
        tanimoto_index = DataStructs.TanimotoSimilarity(fp_a, fp_b)
        if tanimoto_index >= similarity_value:
            first_aglycons.append(aglycon_formulas[pos][0])
            second_aglycons.append(aglycon_formulas[pos][1])
            tanimoto_indices.append(tanimoto_index)
    df_comparison = pd.DataFrame({
        "aglycon1": first_aglycons,
        "aglycon2": second_aglycons,
        "tanimoto_index": tanimoto_indices
    })
    create_df_with_tanimoto_index(df_comparison, df_Without_Double_or_Triple)
def compute_fraggle_similarity_for_subs(inMol, qMol, qSmi, qSubs, tverskyThresh=0.8):
    """Fraggle similarity of `inMol` to query `qMol` for one substructure.

    Computes (a) the plain RDKit-fingerprint Tanimoto between query and
    input, and (b) the "fraggle" similarity: the same comparison after
    both molecules have had atoms far from the query substructure `qSubs`
    down-weighted via atomContrib (Tversky threshold `tverskyThresh`).
    The fraggle value is floored at the plain similarity.

    The modified-query fingerprint is memoized in the module-level
    `modified_query_fps` cache keyed by "<qSubs>_<qSmi>" (this function
    mutates that cache).

    Returns:
        (rdkit_sim, fraggle_sim); fraggle_sim is 0.0 when the modified
        input molecule cannot be fingerprinted.
    """
    qFP = Chem.RDKFingerprint(qMol, **rdkitFpParams)
    iFP = Chem.RDKFingerprint(inMol, **rdkitFpParams)
    rdkit_sim = DataStructs.TanimotoSimilarity(qFP, iFP)
    qm_key = "%s_%s" % (qSubs, qSmi)
    if qm_key in modified_query_fps:
        qmMolFp = modified_query_fps[qm_key]
    else:
        qmMol = atomContrib(qSubs, qMol, tverskyThresh)
        qmMolFp = Chem.RDKFingerprint(qmMol, **rdkitFpParams)
        modified_query_fps[qm_key] = qmMolFp
    rmMol = atomContrib(qSubs, inMol, tverskyThresh)
    # wrap in a try, catch: fingerprinting the modified mol can fail
    try:
        rmMolFp = Chem.RDKFingerprint(rmMol, **rdkitFpParams)
        # fraggle similarity never reports worse than the plain similarity
        fraggle_sim = max(DataStructs.FingerprintSimilarity(qmMolFp, rmMolFp),
                          rdkit_sim)
    except Exception:  # pragma: nocover
        sys.stderr.write("Can't generate fp for: %s\n" % (Chem.MolToSmiles(rmMol, True)))
        fraggle_sim = 0.0
    return rdkit_sim, fraggle_sim
def test10BulkOps3(self):
    """Bulk similarity ops over a numpy object array of bit vectors match
    the corresponding single-pair calls; Tversky(1,1) ~ Tanimoto and
    Tversky(.5,.5) ~ Dice."""
    nbits = 10000
    bvs = numpy.empty((10, ), DataStructs.ExplicitBitVect)
    for pos in range(10):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            bv.SetBit(random.randrange(0, nbits))
        bvs[pos] = bv

    metric_pairs = (
        (DataStructs.BulkTanimotoSimilarity, DataStructs.TanimotoSimilarity),
        (DataStructs.BulkDiceSimilarity, DataStructs.DiceSimilarity),
        (DataStructs.BulkAllBitSimilarity, DataStructs.AllBitSimilarity),
        (DataStructs.BulkOnBitSimilarity, DataStructs.OnBitSimilarity),
        (DataStructs.BulkRogotGoldbergSimilarity, DataStructs.RogotGoldbergSimilarity),
    )
    for bulk, single in metric_pairs:
        sims = bulk(bvs[0], bvs)
        for i in range(len(bvs)):
            self.assertTrue(feq(single(bvs[0], bvs[i]), sims[i]))

    # Tversky with alpha = beta = 1 reduces to Tanimoto.
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
    for i in range(len(bvs)):
        self.assertTrue(
            feq(DataStructs.TverskySimilarity(bvs[0], bvs[i], 1, 1), sims[i]))
        self.assertTrue(
            feq(DataStructs.TanimotoSimilarity(bvs[0], bvs[i]), sims[i]))

    # Tversky with alpha = beta = 0.5 reduces to Dice.
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, .5, .5)
    for i in range(len(bvs)):
        self.assertTrue(
            feq(DataStructs.TverskySimilarity(bvs[0], bvs[i], .5, .5), sims[i]))
        self.assertTrue(
            feq(DataStructs.DiceSimilarity(bvs[0], bvs[i]), sims[i]))