def calculate_sims(fps1, fps2, simtype='tanimoto'):
    """Build the pairwise similarity matrix between two fingerprint collections.

    Args:
        fps1: sequence of fingerprints; one matrix row per entry.
        fps2: sequence of fingerprints; one matrix column per entry.
        simtype: ``'tanimoto'`` (default) or ``'dice'``; selects the
            ``DataStructs`` bulk similarity function used for each row.

    Returns:
        A ``numpy`` array created by ``np.zeros`` with shape
        ``(len(fps1), len(fps2))`` where entry ``[i, j]`` is the similarity
        of ``fps1[i]`` to ``fps2[j]``.

    Raises:
        ValueError: if ``simtype`` is not one of the supported names.
    """
    # Reject unknown metrics up front. Previously an unsupported name left
    # the per-row result unbound and failed with an UnboundLocalError on the
    # first row (or silently returned an empty matrix for empty input).
    if simtype not in ('tanimoto', 'dice'):
        raise ValueError(
            "unsupported simtype %r; expected 'tanimoto' or 'dice'" % (simtype,))

    sim_mat = np.zeros((len(fps1), len(fps2)))
    for i, fp_i in enumerate(fps1):
        if simtype == 'tanimoto':
            sim_mat[i, :] = DataStructs.BulkTanimotoSimilarity(fp_i, fps2)
        else:
            sim_mat[i, :] = DataStructs.BulkDiceSimilarity(fp_i, fps2)
    return sim_mat
def ClusterFps(Fp, cutoff):
    """Cluster fingerprints with ``Butina.ClusterData`` using Dice distances.

    Args:
        Fp: sequence of fingerprints (must support ``len`` and slicing).
        cutoff: distance cutoff forwarded unchanged to ``Butina.ClusterData``.

    Returns:
        Whatever ``Butina.ClusterData`` returns for the computed distances.
    """
    from rdkit import DataStructs
    from rdkit.ML.Cluster import Butina

    count = len(Fp)

    # Flattened lower triangle of the distance matrix: for each fingerprint
    # (from the second one on) the distances to all earlier fingerprints,
    # where distance = 1 - Dice similarity.
    distances = []
    for row in range(1, count):
        similarities = DataStructs.BulkDiceSimilarity(Fp[row], Fp[:row])
        distances.extend(1 - value for value in similarities)

    # The data is already a distance list, hence isDistData=True.
    return Butina.ClusterData(distances, count, cutoff, isDistData=True)
def test10BulkOps2(self):
    """Check that each bulk similarity call agrees with its pairwise version.

    Ten random 10000-bit ``ExplicitBitVect`` objects are stored in a tuple
    and the first one is compared against all of them, once through the
    ``Bulk*Similarity`` function and once pair by pair.

    Uses ``assertTrue`` rather than the ``failUnless`` alias, which was
    deprecated and no longer exists in Python 3.12+.
    """
    nbits = 10000
    bvs = []
    for _ in range(10):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            # Random positions may repeat, so not every draw sets a new bit.
            bv.SetBit(random.randrange(0, nbits))
        bvs.append(bv)
    # The bulk functions are exercised with a tuple here.
    bvs = tuple(bvs)

    sims = DataStructs.BulkTanimotoSimilarity(bvs[0], bvs)
    for i in range(len(bvs)):
        sim = DataStructs.TanimotoSimilarity(bvs[0], bvs[i])
        self.assertTrue(feq(sim, sims[i]))

    sims = DataStructs.BulkDiceSimilarity(bvs[0], bvs)
    for i in range(len(bvs)):
        sim = DataStructs.DiceSimilarity(bvs[0], bvs[i])
        self.assertTrue(feq(sim, sims[i]))

    sims = DataStructs.BulkAllBitSimilarity(bvs[0], bvs)
    for i in range(len(bvs)):
        sim = DataStructs.AllBitSimilarity(bvs[0], bvs[i])
        self.assertTrue(feq(sim, sims[i]))

    sims = DataStructs.BulkOnBitSimilarity(bvs[0], bvs)
    for i in range(len(bvs)):
        sim = DataStructs.OnBitSimilarity(bvs[0], bvs[i])
        self.assertTrue(feq(sim, sims[i]))

    sims = DataStructs.BulkRogotGoldbergSimilarity(bvs[0], bvs)
    for i in range(len(bvs)):
        sim = DataStructs.RogotGoldbergSimilarity(bvs[0], bvs[i])
        self.assertTrue(feq(sim, sims[i]))

    # Tversky with weights (1, 1) must also match the Tanimoto score.
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
    for i in range(len(bvs)):
        sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], 1, 1)
        self.assertTrue(feq(sim, sims[i]))
        sim = DataStructs.TanimotoSimilarity(bvs[0], bvs[i])
        self.assertTrue(feq(sim, sims[i]))

    # Tversky with weights (.5, .5) must also match the Dice score.
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, .5, .5)
    for i in range(len(bvs)):
        sim = DataStructs.TverskySimilarity(bvs[0], bvs[i], .5, .5)
        self.assertTrue(feq(sim, sims[i]))
        sim = DataStructs.DiceSimilarity(bvs[0], bvs[i])
        self.assertTrue(feq(sim, sims[i]))
def test10BulkOps3(self):
    """Bulk similarity functions must agree with the pairwise ones.

    Same checks as for a tuple of bit vectors, but here the ten random
    10000-bit vectors are held in a numpy array (created with the
    ``ExplicitBitVect`` class passed as the dtype argument).
    """
    nbits = 10000
    vectors = numpy.empty((10, ), DataStructs.ExplicitBitVect)
    for slot in range(10):
        vect = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            vect.SetBit(random.randrange(0, nbits))
        vectors[slot] = vect

    def expect_agreement(bulk_scores, *pairwise_metrics):
        # For every vector, each pairwise metric (first vector vs. that
        # vector) has to reproduce the corresponding bulk score.
        for idx in range(len(vectors)):
            for metric in pairwise_metrics:
                score = metric(vectors[0], vectors[idx])
                self.assertTrue(feq(score, bulk_scores[idx]))

    expect_agreement(
        DataStructs.BulkTanimotoSimilarity(vectors[0], vectors),
        DataStructs.TanimotoSimilarity,
    )
    expect_agreement(
        DataStructs.BulkDiceSimilarity(vectors[0], vectors),
        DataStructs.DiceSimilarity,
    )
    expect_agreement(
        DataStructs.BulkAllBitSimilarity(vectors[0], vectors),
        DataStructs.AllBitSimilarity,
    )
    expect_agreement(
        DataStructs.BulkOnBitSimilarity(vectors[0], vectors),
        DataStructs.OnBitSimilarity,
    )
    expect_agreement(
        DataStructs.BulkRogotGoldbergSimilarity(vectors[0], vectors),
        DataStructs.RogotGoldbergSimilarity,
    )
    # Weights (1, 1): the bulk Tversky scores are also checked against Tanimoto.
    expect_agreement(
        DataStructs.BulkTverskySimilarity(vectors[0], vectors, 1, 1),
        lambda a, b: DataStructs.TverskySimilarity(a, b, 1, 1),
        DataStructs.TanimotoSimilarity,
    )
    # Weights (.5, .5): the bulk Tversky scores are also checked against Dice.
    expect_agreement(
        DataStructs.BulkTverskySimilarity(vectors[0], vectors, .5, .5),
        lambda a, b: DataStructs.TverskySimilarity(a, b, .5, .5),
        DataStructs.DiceSimilarity,
    )
def GetNeighborLists(probes, topN, pool, simMetric=DataStructs.DiceSimilarity, simThresh=-1.,
                     silent=False, **kwargs):
    """Collect, for every probe, the best-scoring entries of ``pool``.

    Args:
        probes: sequence whose items carry the probe fingerprint at index 1;
            that fingerprint may be ``None``.
        topN: size handed to each ``TopNContainer`` when ``simThresh <= 0``.
        pool: iterable of ``(name, fingerprint)`` pairs to search.
        simMetric: similarity function. ``DataStructs.DiceSimilarity``,
            ``TanimotoSimilarity`` and ``TverskySimilarity`` are routed to
            their bulk counterparts; any other callable is invoked as
            ``simMetric(probeFp, poolFp)``.
        simThresh: only scores strictly greater than this are inserted. When
            it is positive the containers are built with size ``-1`` instead
            of ``topN`` (presumably "no size limit" - confirm against
            ``TopNContainer``).
        silent: when false, progress is logged every 1000 pool rows.
        **kwargs: ``tverskyA`` / ``tverskyB`` (default 0.5 each) are the
            weights used with the Tversky metric.

    Returns:
        A list with one ``TopNContainer`` per probe, in probe order. Probes
        whose fingerprint is ``None`` never receive an insertion.
    """
    from rdkit.DataStructs.TopNContainer import TopNContainer

    probeFps = [x[1] for x in probes]
    # Indices (into probes / nbrLists) of the probes that have a fingerprint.
    validProbes = [idx for idx, pfp in enumerate(probeFps) if pfp is not None]
    validFps = [probeFps[idx] for idx in validProbes]

    containerSize = topN if simThresh <= 0 else -1
    nbrLists = [TopNContainer(containerSize) for _ in probeFps]

    # Pick the bulk implementation once instead of re-testing per pool row.
    if simMetric == DataStructs.DiceSimilarity:
        bulkMetric, bulkArgs = DataStructs.BulkDiceSimilarity, ()
    elif simMetric == DataStructs.TanimotoSimilarity:
        bulkMetric, bulkArgs = DataStructs.BulkTanimotoSimilarity, ()
    elif simMetric == DataStructs.TverskySimilarity:
        bulkMetric = DataStructs.BulkTverskySimilarity
        bulkArgs = (float(kwargs.get('tverskyA', 0.5)), float(kwargs.get('tverskyB', 0.5)))
    else:
        bulkMetric, bulkArgs = None, ()

    for nDone, (nm, fp) in enumerate(pool, 1):
        if not silent and nDone % 1000 == 0:
            logger.info(' searched %d rows' % nDone)

        if bulkMetric is not None:
            # scores[k] belongs to validFps[k], i.e. to probe validProbes[k].
            scores = bulkMetric(fp, validFps, *bulkArgs)
            for probeIdx, score in zip(validProbes, scores):
                if score > simThresh:
                    nbrLists[probeIdx].Insert(score, nm)
        else:
            # Generic metric: score each valid probe individually and store
            # the hit in that probe's own list. (The previous code looked the
            # list up via validProbes[i] although i already was the probe
            # index, which picked the wrong list or raised IndexError as soon
            # as any probe fingerprint was None.)
            for probeIdx in validProbes:
                score = simMetric(probeFps[probeIdx], fp)
                if score > simThresh:
                    nbrLists[probeIdx].Insert(score, nm)
    return nbrLists
def test6BulkDice(self):
    """Bulk Dice similarity on sparse int vectors must match pairwise Dice."""
    import random

    size = 10
    entries_per_vect = 5
    n_vects = 6

    vects = []
    for _ in range(n_vects):
        vect = ds.IntSparseIntVect(size)
        for _ in range(entries_per_vect):
            # Draw the value first and the index second: this is the order in
            # which an inline ``v[randint(...)] = randint(...)`` evaluates.
            value = random.randint(1, 10)
            vect[random.randint(0, size - 1)] = value
        vects.append(vect)

    # Reference: first vector against each of the others, one pair at a time.
    expected = [ds.DiceSimilarity(vects[0], vects[k]) for k in range(1, n_vects)]
    # Same comparison through the bulk call.
    observed = ds.BulkDiceSimilarity(vects[0], vects[1:])
    for k in range(len(expected)):
        self.assertTrue(feq(expected[k], observed[k]))
'''Gets the fingerprints from the fingerprint library and stores them in a dictioanry''' fp_dict = {} for fp in fp_names: fp_dict[fp] = fingerprint_lib.CalculateFP(fp, smiles) return fp_dict def getFP(fp_name, smiles): '''Gets fingerprint from fingerprint library''' return fingerprint_lib.CalculateFP(fp_name, smiles) # dictionary for similarity measures simil_dict = {} simil_dict['Dice'] = lambda x, y: sorted(DataStructs.BulkDiceSimilarity(x, y), reverse=True) simil_dict['Tanimoto'] = lambda x, y: sorted( DataStructs.BulkTanimotoSimilarity(x, y), reverse=True) simil_dict['Cosine'] = lambda x, y: sorted( DataStructs.BulkCosineSimilarity(x, y), reverse=True) simil_dict['Russel'] = lambda x, y: sorted( DataStructs.BulkRusselSimilarity(x, y), reverse=True) simil_dict['Kulczynski'] = lambda x, y: sorted( DataStructs.BulkKulczynskiSimilarity(x, y), reverse=True) simil_dict['McConnaughey'] = lambda x, y: sorted( DataStructs.BulkMcConnaugheySimilarity(x, y), reverse=True) simil_dict['Manhattan'] = lambda x, y: sorted( DataStructs.BulkAllBitSimilarity(x, y), reverse=True) simil_dict['RogotGoldberg'] = lambda x, y: sorted( DataStructs.BulkRogotGoldbergSimilarity(x, y), reverse=True)