def _bulkTest(self, bvs):
    """Assert that the bulk similarity functions agree with the pairwise ones.

    ``bvs[0]`` is used as the query and is compared against every entry of
    ``bvs`` (itself included). For each of the Tanimoto, Dice, AllBit, OnBit
    and RogotGoldberg metrics the ``Bulk<metric>Similarity`` result must equal
    the ``<metric>Similarity`` result element by element, both for plain
    similarities and with ``returnDistance=True``. The pairwise function is
    also called with the ``ToBinary()`` form of the second vector and must
    return the same value.

    Finally, Tversky with weights (1, 1) is checked against both the pairwise
    Tversky value and the Tanimoto value, again with and without
    ``returnDistance=True``.
    """
    query = bvs[0]
    # An empty kwargs dict reproduces the plain call; the second entry
    # repeats the same checks in distance mode.
    call_modes = ({}, {'returnDistance': True})

    for name in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
        bulkFn = getattr(DataStructs, f'Bulk{name}Similarity')
        pairFn = getattr(DataStructs, f'{name}Similarity')
        for kwargs in call_modes:
            bulkVals = bulkFn(query, bvs, **kwargs)
            for idx, bv in enumerate(bvs):
                expected = pairFn(query, bv, **kwargs)
                self.assertEqual(expected, bulkVals[idx])
                self.assertEqual(expected, pairFn(query, bv.ToBinary(), **kwargs))

    for kwargs in call_modes:
        bulkVals = DataStructs.BulkTverskySimilarity(query, bvs, 1, 1, **kwargs)
        for idx, bv in enumerate(bvs):
            tversky = DataStructs.TverskySimilarity(query, bv, 1, 1, **kwargs)
            self.assertEqual(tversky, bulkVals[idx])
            tanimoto = DataStructs.TanimotoSimilarity(query, bv, **kwargs)
            self.assertEqual(tanimoto, bulkVals[idx])
def test10BulkOps2(self):
    """Check bulk similarity functions on a tuple of random bit vectors.

    Ten 10000-bit ``ExplicitBitVect`` objects are filled by setting randomly
    chosen bits, packed into a tuple, and the first one is compared against
    all of them. Every ``Bulk<metric>Similarity`` value must match (via
    ``feq``) the corresponding pairwise ``<metric>Similarity`` value.

    Tversky is checked with weights (1, 1) against Tanimoto and with weights
    (0.5, 0.5) against Dice.
    """
    nbits = 10000
    bvs = []
    for _ in range(10):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            bv.SetBit(random.randrange(0, nbits))
        bvs.append(bv)
    # The point of this test is that a tuple is accepted as the container.
    bvs = tuple(bvs)
    query = bvs[0]

    for metric in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
        bulk = getattr(DataStructs, 'Bulk%sSimilarity' % metric)
        single = getattr(DataStructs, '%sSimilarity' % metric)
        sims = bulk(query, bvs)
        for i, bv in enumerate(bvs):
            # assertTrue replaces the failUnless alias, which no longer
            # exists in unittest on Python 3.12+.
            self.assertTrue(feq(single(query, bv), sims[i]))

    # Tversky(1, 1) must agree with Tanimoto.
    sims = DataStructs.BulkTverskySimilarity(query, bvs, 1, 1)
    for i, bv in enumerate(bvs):
        sim = DataStructs.TverskySimilarity(query, bv, 1, 1)
        self.assertTrue(feq(sim, sims[i]))
        sim = DataStructs.TanimotoSimilarity(query, bv)
        self.assertTrue(feq(sim, sims[i]))

    # Tversky(0.5, 0.5) must agree with Dice.
    sims = DataStructs.BulkTverskySimilarity(query, bvs, .5, .5)
    for i, bv in enumerate(bvs):
        sim = DataStructs.TverskySimilarity(query, bv, .5, .5)
        self.assertTrue(feq(sim, sims[i]))
        sim = DataStructs.DiceSimilarity(query, bv)
        self.assertTrue(feq(sim, sims[i]))
def test6BulkTversky(self):
    """Check BulkTverskySimilarity on sparse integer vectors.

    Six ``IntSparseIntVect`` objects of size 10 are filled with random values
    and the first is compared with the remaining five:

    * bulk Tversky with weights (0.5, 0.5) must match pairwise Tversky with
      the same weights and pairwise Dice;
    * bulk Tversky with weights (1.0, 1.0) must match pairwise Tanimoto and
      bulk Tanimoto.
    """
    import random

    sz = 10
    nToSet = 5
    nVs = 6

    vs = []
    for _ in range(nVs):
        vect = ds.IntSparseIntVect(sz)
        for _ in range(nToSet):
            vect[random.randint(0, sz - 1)] = random.randint(1, 10)
        vs.append(vect)

    query = vs[0]
    others = vs[1:]

    pairTversky = [ds.TverskySimilarity(query, other, .5, .5) for other in others]
    bulkTversky = ds.BulkTverskySimilarity(query, others, 0.5, 0.5)
    pairDice = [ds.DiceSimilarity(query, other) for other in others]
    for idx, expected in enumerate(pairTversky):
        self.assertTrue(feq(expected, bulkTversky[idx]))
        self.assertTrue(feq(expected, pairDice[idx]))

    bulkTversky = ds.BulkTverskySimilarity(query, others, 1.0, 1.0)
    pairTanimoto = [ds.TanimotoSimilarity(query, other) for other in others]
    for idx, value in enumerate(bulkTversky):
        self.assertTrue(feq(value, pairTanimoto[idx]))

    bulkTanimoto = ds.BulkTanimotoSimilarity(query, others)
    for idx, value in enumerate(bulkTversky):
        self.assertTrue(feq(value, bulkTanimoto[idx]))
def getSimilarity(self, reference, method='tanimoto', alpha=None, beta=None):
    """Return the similarity between ``reference.IFPvector`` and ``self.IFPvector``.

    Args:
        reference: object exposing an ``IFPvector`` attribute; it is passed as
            the first argument of the similarity function.
        method: ``'tanimoto'``, ``'dice'`` or ``'tversky'``.
        alpha: first Tversky weight; only used when ``method == 'tversky'``.
        beta: second Tversky weight; only used when ``method == 'tversky'``.

    Returns:
        The value returned by the matching ``DataStructs`` function, or
        ``None`` when ``method`` is not one of the three names above.
    """
    if method == 'tanimoto':
        metric = DataStructs.TanimotoSimilarity
    elif method == 'dice':
        metric = DataStructs.DiceSimilarity
    elif method == 'tversky':
        # Tversky is the only metric that takes the extra weights.
        return DataStructs.TverskySimilarity(reference.IFPvector, self.IFPvector,
                                             alpha, beta)
    else:
        return None
    return metric(reference.IFPvector, self.IFPvector)
def test10BulkOps3(self):
    """Check bulk similarity functions on a numpy object array of bit vectors.

    Ten 10000-bit ``ExplicitBitVect`` objects with randomly set bits are
    stored in a numpy array, and the first is compared against all of them.
    Each ``Bulk<metric>Similarity`` value must match (via ``feq``) the
    pairwise ``<metric>Similarity`` value. Tversky is checked with weights
    (1, 1) against Tanimoto and with weights (0.5, 0.5) against Dice.
    """
    nbits = 10000
    bvs = numpy.empty((10, ), DataStructs.ExplicitBitVect)
    for slot in range(10):
        vect = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            vect.SetBit(random.randrange(0, nbits))
        bvs[slot] = vect

    query = bvs[0]

    for metric in ('Tanimoto', 'Dice', 'AllBit', 'OnBit', 'RogotGoldberg'):
        bulkFn = getattr(DataStructs, 'Bulk' + metric + 'Similarity')
        pairFn = getattr(DataStructs, metric + 'Similarity')
        bulkVals = bulkFn(query, bvs)
        for idx, other in enumerate(bvs):
            self.assertTrue(feq(pairFn(query, other), bulkVals[idx]))

    # Each Tversky weight is paired with the metric it should reduce to.
    for weight, equivalent in ((1, 'TanimotoSimilarity'), (.5, 'DiceSimilarity')):
        bulkVals = DataStructs.BulkTverskySimilarity(query, bvs, weight, weight)
        for idx, other in enumerate(bvs):
            tversky = DataStructs.TverskySimilarity(query, other, weight, weight)
            self.assertTrue(feq(tversky, bulkVals[idx]))
            reference = getattr(DataStructs, equivalent)(query, other)
            self.assertTrue(feq(reference, bulkVals[idx]))
def partialFP(atomID, tverskyThresh):
    """Flag atom ``atomID`` when its Tversky similarity falls below a threshold.

    A 1024-bit vector is built from the bit indices in ``aBits[atomID]`` and
    compared with ``subsFp`` using Tversky weights (0, 1). When the result is
    strictly less than ``tverskyThresh``, ``marked[atomID]`` is set to 1.

    ``aBits``, ``subsFp`` and ``marked`` are taken from the enclosing scope,
    which is not visible here. Nothing is returned.
    """
    atomFp = DataStructs.ExplicitBitVect(1024)  # starts with no bits set
    atomFp.SetBitsFromList(aBits[atomID])
    if DataStructs.TverskySimilarity(subsFp, atomFp, 0, 1) < tverskyThresh:
        marked[atomID] = 1
def partialSimilarity(atomID):
    """Return the Tversky similarity between ``subsFp`` and one atom's bits.

    A 1024-bit vector is populated from ``aBits[atomID]`` and compared with
    ``subsFp`` using Tversky weights (0, 1). ``aBits`` and ``subsFp`` come
    from the enclosing scope, which is not visible here.
    """
    atomFp = DataStructs.ExplicitBitVect(1024)  # starts with no bits set
    atomFp.SetBitsFromList(aBits[atomID])
    similarity = DataStructs.TverskySimilarity(subsFp, atomFp, 0, 1)
    return similarity
# 'path':OEFPType_Path} fpcodes = { 'rdkit': FingerprintMols.GetRDKFingerprint, 'maccs': MACCSkeys.GenMACCSKeys, 'morgan': lambda mol: AllChem.GetMorganFingerprintAsBitVect(mol, morganradius), 'atompairs': lambda mol: Chem.GetAtomPairFingerPrint(mol, 2) } # tversky(F1, F2)= F1@F2 / ( a*sum(F1) + b*sum(F2) - (1-a-b)* F1@F2 ) dmetrics = { 'tanimoto': DataStructs.TanimotoSimilarity, # a, b = 1, 1 'dice': DataStructs.DiceSimilarity, # a, b = 0.5, 0.5 'cosine': DataStructs.CosineSimilarity, 'tversky': lambda m1, m2: DataStructs.TverskySimilarity(m1, m2, 0.5, 0.5), 'sokal': DataStructs.SokalSimilarity } ######################### # Module initialization # ######################### def Init(): global dmetric, fpcode #check requested fingerprint existance if not fpcodes.has_key(fp): print 'Unrecognized fingerprint (mprms.fp): ' + fp
count = 0 for i in range(fpA.GetNumBits()): if fpA.GetBit(i): count += 1 print(count, size) if fper == GenMACCSKeys: onbits = tuple(fpA.GetOnBits()) print(str(onbits)) for i in onbits: print(Chem.MACCSkeys.smartsPatts[i][0]) fpB = fper(molB) fptxtB = DataStructs.BitVectToText(fpB) print(fptxtB) size = fpB.GetNumBits() count = 0 for i in range(fpB.GetNumBits()): if fpB.GetBit(i): count += 1 print(count, size) if fper == GenMACCSKeys: onbits = tuple(fpB.GetOnBits()) print(str(onbits)) for i in onbits: print(Chem.MACCSkeys.smartsPatts[i][0]) sim = DataStructs.TanimotoSimilarity(fpA, fpB) print(sim) sim = DataStructs.TverskySimilarity(fpA, fpB, 0.9, 0.1) print(sim) bcom = DataStructs.NumBitsInCommon(fpA, fpB) print(bcom)
def Similarities(fps1, fps2, fingerprint, measure='tanimoto'):
    """Compute the pairwise similarity matrix between two fingerprint lists.

    Entry ``[i][j]`` of the result is the similarity between ``fps1[i]`` and
    ``fps2[j]``. The backend is selected by testing ``fingerprint`` and
    ``measure`` against the module-level registries, in this order:

    1. ``indigofps`` / ``indigosims``: ``indigo.similarity``.
    2. ``rdkitfps`` / ``rdkitsims``: ``DataStructs.FingerprintSimilarity``
       with the matching metric (``'asymmetric2'`` swaps the two arguments),
       or ``DataStructs.TverskySimilarity`` with ``a=0.5, b=0.5``.
    3. ``rdkitnonbitfps`` / ``rdkitnonbitsims``: ``DataStructs.DiceSimilarity``
       or ``DataStructs.TanimotoSimilarity`` called directly.
    4. ``rdkitestatefps`` / ``rdkitestatesims``: Pearson correlation through
       ``np.corrcoef``.

    A registry match whose ``measure`` has no handler in that group falls
    through to the following groups.

    Returns:
        A numpy array of similarities, or ``None`` when no group handles the
        combination or when the Pearson matrix contains NaN values.
    """

    def pairwise(score):
        # Rows follow fps1, columns follow fps2.
        return np.array([[score(fp1, fp2) for fp2 in fps2] for fp1 in fps1])

    # Indigo fingerprints
    if fingerprint in indigofps and measure in indigosims:
        return pairwise(lambda fp1, fp2: indigo.similarity(fp1, fp2, measure))

    # RDKit bit-vector fingerprints
    if fingerprint in rdkitfps and measure in rdkitsims:
        # measure -> (DataStructs metric attribute, swap the two arguments?)
        bit_metrics = {
            'allbit': ('AllBitSimilarity', False),
            'asymmetric': ('AsymmetricSimilarity', False),
            'asymmetric2': ('AsymmetricSimilarity', True),
            'braunblanquet': ('BraunBlanquetSimilarity', False),
            'cosine': ('CosineSimilarity', False),
            'dice': ('DiceSimilarity', False),
            'kulczynski': ('KulczynskiSimilarity', False),
            'mcconnaughey': ('McConnaugheySimilarity', False),
            'rogotgoldberg': ('RogotGoldbergSimilarity', False),
            'russel': ('RusselSimilarity', False),
            'sokal': ('SokalSimilarity', False),
            'tanimoto': ('TanimotoSimilarity', False),
        }
        if measure in bit_metrics:
            metric_name, swapped = bit_metrics[measure]

            def bit_score(fp1, fp2):
                # The metric is looked up per pair so nothing is touched
                # when there are no pairs to score.
                metric = getattr(DataStructs, metric_name)
                if swapped:
                    return DataStructs.FingerprintSimilarity(fp2, fp1, metric=metric)
                return DataStructs.FingerprintSimilarity(fp1, fp2, metric=metric)

            return pairwise(bit_score)
        if measure == 'tversky':
            return pairwise(
                lambda fp1, fp2: DataStructs.TverskySimilarity(fp1, fp2, a=0.5, b=0.5))

    # RDKit non-bit (integer or float) fingerprints
    if fingerprint in rdkitnonbitfps and measure in rdkitnonbitsims:
        if measure == 'dice':
            return pairwise(lambda fp1, fp2: DataStructs.DiceSimilarity(fp1, fp2))
        if measure == 'tanimoto':
            return pairwise(lambda fp1, fp2: DataStructs.TanimotoSimilarity(fp1, fp2))

    # E-state fingerprints
    if fingerprint in rdkitestatefps and measure in rdkitestatesims:
        if measure == 'pearson':
            sims = pairwise(lambda fp1, fp2: np.corrcoef(fp1, fp2)[0][1])
            # A single NaN anywhere invalidates the whole matrix.
            nan_count = sum([sum(np.isnan(row)) for row in sims])
            if nan_count == 0:
                return sims
            return None

    # Unknown fingerprint and/or similarity measure
    return None