def _bulkTest(self, bvs):
    """Check each bulk similarity function against its single-pair twin.

    ``bvs[0]`` acts as the query and is compared with every entry of
    ``bvs`` (itself included).  For the Tanimoto, Dice, AllBit, OnBit and
    RogotGoldberg metrics the bulk result must be exactly equal
    (``assertEqual``) to the single-pair result, both when the second
    argument is the vector object and when it is ``vector.ToBinary()``.
    The same comparison is repeated with ``returnDistance=True``.

    Finally, Tversky with weights (1, 1) is checked against both the
    single-pair Tversky call and the Tanimoto call, again as similarity
    and with ``returnDistance=True``.
    """
    probe = bvs[0]

    for name in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
        bulk_fn = getattr(DataStructs, f'Bulk{name}Similarity')
        pair_fn = getattr(DataStructs, f'{name}Similarity')

        # Similarity form.
        bulk_sims = bulk_fn(probe, bvs)
        for idx, bv in enumerate(bvs):
            expected = pair_fn(probe, bv)
            self.assertEqual(expected, bulk_sims[idx])
            self.assertEqual(expected, pair_fn(probe, bv.ToBinary()))

        # Distance form.
        bulk_dists = bulk_fn(probe, bvs, returnDistance=True)
        for idx, bv in enumerate(bvs):
            expected = pair_fn(probe, bv, returnDistance=True)
            self.assertEqual(expected, bulk_dists[idx])
            self.assertEqual(
                expected,
                pair_fn(probe, bv.ToBinary(), returnDistance=True),
            )

    # Tversky with weights (1, 1) must agree with Tanimoto.
    tversky_sims = DataStructs.BulkTverskySimilarity(probe, bvs, 1, 1)
    for idx, bv in enumerate(bvs):
        self.assertEqual(
            DataStructs.TverskySimilarity(probe, bv, 1, 1),
            tversky_sims[idx],
        )
        self.assertEqual(
            DataStructs.TanimotoSimilarity(probe, bv),
            tversky_sims[idx],
        )

    tversky_dists = DataStructs.BulkTverskySimilarity(
        probe, bvs, 1, 1, returnDistance=True)
    for idx, bv in enumerate(bvs):
        self.assertEqual(
            DataStructs.TverskySimilarity(probe, bv, 1, 1,
                                          returnDistance=True),
            tversky_dists[idx],
        )
        self.assertEqual(
            DataStructs.TanimotoSimilarity(probe, bv, returnDistance=True),
            tversky_dists[idx],
        )
def test6BulkTversky(self):
    """Bulk Tversky on sparse int vectors agrees with the pairwise calls.

    Builds six random ``IntSparseIntVect`` objects of length 10 and uses
    the first as the query against the remaining five.  Checks, within
    ``feq`` tolerance, that:

    * bulk Tversky (0.5, 0.5) equals pairwise Tversky (0.5, 0.5) and
      pairwise Dice;
    * bulk Tversky (1.0, 1.0) equals pairwise Tanimoto and bulk Tanimoto.
    """
    import random

    sz = 10
    nToSet = 5
    nVs = 6

    def _random_vect():
        vect = ds.IntSparseIntVect(sz)
        for _ in range(nToSet):
            # Draw the value before the index: that is the order in which
            # ``v[randint(...)] = randint(...)`` consumes random numbers.
            value = random.randint(1, 10)
            vect[random.randint(0, sz - 1)] = value
        return vect

    vs = [_random_vect() for _ in range(nVs)]
    query = vs[0]
    others = vs[1:]

    # Weights (0.5, 0.5): Tversky must coincide with Dice.
    baseDs = [ds.TverskySimilarity(query, other, .5, .5) for other in others]
    bulkDs = ds.BulkTverskySimilarity(query, others, 0.5, 0.5)
    diceDs = [ds.DiceSimilarity(query, other) for other in others]
    for idx, base in enumerate(baseDs):
        self.assertTrue(feq(base, bulkDs[idx]))
        self.assertTrue(feq(base, diceDs[idx]))

    # Weights (1.0, 1.0): Tversky must coincide with Tanimoto.
    bulkDs = ds.BulkTverskySimilarity(query, others, 1.0, 1.0)
    taniDs = [ds.TanimotoSimilarity(query, other) for other in others]
    for idx, bulk in enumerate(bulkDs):
        self.assertTrue(feq(bulk, taniDs[idx]))

    taniDs = ds.BulkTanimotoSimilarity(query, others)
    for idx, bulk in enumerate(bulkDs):
        self.assertTrue(feq(bulk, taniDs[idx]))
def test10BulkOps2(self):
    """Bulk similarity functions accept a tuple of bit vectors.

    Builds ten 10000-bit ``ExplicitBitVect`` objects with randomly chosen
    bits set, packs them into a tuple, and checks (within ``feq``
    tolerance) that every bulk function returns the same values as the
    corresponding single-pair function, using the first vector as query.
    Tversky is additionally checked against Tanimoto at weights (1, 1)
    and against Dice at weights (0.5, 0.5).

    Uses ``assertTrue`` rather than the ``failUnless`` alias, which was
    removed from ``unittest.TestCase`` in Python 3.12.
    """
    nbits = 10000
    bvs = []
    for _ in range(10):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            bv.SetBit(random.randrange(0, nbits))
        bvs.append(bv)
    # The tuple container is the point of this test.
    bvs = tuple(bvs)
    query = bvs[0]

    for metric in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
        bulk = getattr(DataStructs, f'Bulk{metric}Similarity')
        single = getattr(DataStructs, f'{metric}Similarity')
        sims = bulk(query, bvs)
        for i, bv in enumerate(bvs):
            self.assertTrue(feq(single(query, bv), sims[i]))

    # (weight a, weight b, name of the metric Tversky reduces to)
    tversky_cases = ((1, 1, 'Tanimoto'), (.5, .5, 'Dice'))
    for a, b, metric in tversky_cases:
        equivalent = getattr(DataStructs, f'{metric}Similarity')
        sims = DataStructs.BulkTverskySimilarity(query, bvs, a, b)
        for i, bv in enumerate(bvs):
            sim = DataStructs.TverskySimilarity(query, bv, a, b)
            self.assertTrue(feq(sim, sims[i]))
            sim = equivalent(query, bv)
            self.assertTrue(feq(sim, sims[i]))
def test10BulkOps3(self):
    """Bulk similarity functions accept a numpy array of bit vectors.

    Fills a 10-element numpy array (created with ``ExplicitBitVect`` as
    the dtype argument) with 10000-bit vectors whose bits are set at
    random positions, then checks within ``feq`` tolerance that each bulk
    function matches its single-pair counterpart, with element 0 as the
    query.  Tversky is also compared with Tanimoto at weights (1, 1) and
    with Dice at weights (0.5, 0.5).
    """
    nbits = 10000
    bvs = numpy.empty((10, ), DataStructs.ExplicitBitVect)
    for slot in range(10):
        vect = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            vect.SetBit(random.randrange(0, nbits))
        bvs[slot] = vect

    plain_metrics = ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg')
    for name in plain_metrics:
        bulk_fn = getattr(DataStructs, 'Bulk' + name + 'Similarity')
        pair_fn = getattr(DataStructs, name + 'Similarity')
        bulk_vals = bulk_fn(bvs[0], bvs)
        for idx in range(len(bvs)):
            pair_val = pair_fn(bvs[0], bvs[idx])
            self.assertTrue(feq(pair_val, bulk_vals[idx]))

    # Each row: Tversky weights and the metric those weights reduce to.
    for wa, wb, twin_name in ((1, 1, 'Tanimoto'), (.5, .5, 'Dice')):
        bulk_vals = DataStructs.BulkTverskySimilarity(bvs[0], bvs, wa, wb)
        twin_fn = getattr(DataStructs, twin_name + 'Similarity')
        for idx in range(len(bvs)):
            tversky_val = DataStructs.TverskySimilarity(
                bvs[0], bvs[idx], wa, wb)
            self.assertTrue(feq(tversky_val, bulk_vals[idx]))
            twin_val = twin_fn(bvs[0], bvs[idx])
            self.assertTrue(feq(twin_val, bulk_vals[idx]))
def GetNeighborLists(probes, topN, pool, simMetric=DataStructs.DiceSimilarity,
                     simThresh=-1., silent=False, **kwargs):
    """Collect, for every probe, the pool entries that score above a threshold.

    Arguments:
      probes: sequence of items whose element ``[1]`` is the probe
        fingerprint, or ``None`` when no fingerprint is available.
      topN: size passed to each ``TopNContainer`` when ``simThresh <= 0``.
      pool: iterable of ``(name, fingerprint)`` pairs to search.
      simMetric: similarity function.  Dice, Tanimoto and Tversky from
        ``DataStructs`` are dispatched to their bulk variants; any other
        callable is invoked as ``simMetric(probeFp, poolFp)``.
      simThresh: only scores strictly greater than this are inserted.
        When it is positive the containers are created as
        ``TopNContainer(-1)`` instead of ``TopNContainer(topN)``
        (presumably "no size limit" -- confirm against TopNContainer).
      silent: when false, a progress message is logged every 1000 rows.
      **kwargs: ``tverskyA`` / ``tverskyB`` supply the Tversky weights
        (default 0.5 each); other keys are ignored here.

    Returns:
      A list with one ``TopNContainer`` per probe, in probe order.  Probes
      without a fingerprint keep an empty container.
    """
    from rdkit.DataStructs.TopNContainer import TopNContainer

    probeFps = [x[1] for x in probes]
    # Indices (into probes / nbrLists) of the probes that have a fingerprint.
    validProbes = [idx for idx, pfp in enumerate(probeFps) if pfp is not None]
    validFps = [probeFps[idx] for idx in validProbes]

    containerSize = topN if simThresh <= 0 else -1
    nbrLists = [TopNContainer(containerSize) for _ in probeFps]

    nDone = 0
    for nm, fp in pool:
        nDone += 1
        if not silent and not nDone % 1000:
            logger.info(' searched %d rows' % nDone)

        if simMetric == DataStructs.DiceSimilarity:
            scores = DataStructs.BulkDiceSimilarity(fp, validFps)
        elif simMetric == DataStructs.TanimotoSimilarity:
            scores = DataStructs.BulkTanimotoSimilarity(fp, validFps)
        elif simMetric == DataStructs.TverskySimilarity:
            av = float(kwargs.get('tverskyA', 0.5))
            bv = float(kwargs.get('tverskyB', 0.5))
            scores = DataStructs.BulkTverskySimilarity(fp, validFps, av, bv)
        else:
            # No bulk variant is known for a custom metric: score each valid
            # probe individually, keeping the (probe, pool) argument order.
            scores = [simMetric(pfp, fp) for pfp in validFps]

        # scores[k] belongs to probe validProbes[k].  The fallback branch
        # used to index validProbes with the raw probe index, which filed
        # hits under the wrong probe (or raised IndexError) whenever some
        # probe had no fingerprint.
        for probeIdx, score in zip(validProbes, scores):
            if score > simThresh:
                nbrLists[probeIdx].Insert(score, nm)
    return nbrLists
#generate fp of query_substructs qfp = Chem.RDKFingerprint(qmol, maxPath=5, fpSize=1024, nBitsPerHash=2) queries.append(qfp) query_info.append((info[0], info[1], info[2])) fragments = len(query_info) for line in sys.stdin: line = line.rstrip() smi, id = re.split('\s|,', line) #print smi,id mol = Chem.MolFromSmiles(smi) if mol is None: sys.stderr.write("Can't generate mol for: %s\n" % (smi)) continue mfp = Chem.RDKFingerprint(mol, maxPath=5, fpSize=1024, nBitsPerHash=2) #print smi res = DataStructs.BulkTverskySimilarity(mfp, queries, 0, 1, False) #query_frag_smiles,query_smiles,query_id,retrieved_smi,retrieved_id,tversky_sim for i in range(fragments): if (res[i] >= options.cutoff): print("%s,%s,%s,%s,%s,%s" % (query_info[i][2], query_info[i][0], query_info[i][1], smi, id, res[i]))
def similarity_metric(x, y):
    """Return ``DataStructs.BulkTverskySimilarity`` of ``x`` against ``y``.

    The two Tversky weights are fixed at 1.0 and 1.5 (passed positionally,
    in that order).  ``x`` and ``y`` are forwarded unchanged; presumably
    ``x`` is a single fingerprint and ``y`` a collection of fingerprints
    -- confirm against the caller.
    """
    first_weight, second_weight = 1.0, 1.5
    return DataStructs.BulkTverskySimilarity(x, y, first_weight, second_weight)
fp = gen_fp(mol) fps1.append(fp) smiles1.append(smi) t2 = time.time() print("Fingerprinting targets took:", (t2 - t1)) print("Number of targets:", len(fps1)) t3 = time.time() hits = [] best_scores = [] best_target = [] for mol, smi in read_file(p2, True): if mol: fp2 = gen_fp(mol) scores = DataStructs.BulkTverskySimilarity(fp2, fps1, alpha, 1.0 - alpha) best_score = 0.0 best_target = None for score, target in zip(scores, smiles1): # print(score) if score > best_score: best_score = score best_target = target if threshold1 <= best_score <= threshold2: print(smi, best_score) print(best_target) hits.append(smi) # for score in scores: # if threshold1 <= score <= threshold2: