def GetMorganFingerprint(mol, atomId=-1, radius=2, fpType='bv', nBits=2048, useFeatures=False):
    """Calculates the Morgan fingerprint with the counts of atomId removed.

    Parameters:
      mol -- the molecule of interest
      radius -- the maximum radius
      fpType -- the type of Morgan fingerprint: 'count' or 'bv'
      atomId -- the atom to remove the counts for (if -1, no count is removed)
      nBits -- the size of the bit vector (only for fpType = 'bv')
      useFeatures -- if false: ConnectivityMorgan, if true: FeatureMorgan

    The full fingerprint and the per-atom bit map are cached on the molecule
    as mol._fpInfo so repeated calls with different atomIds are cheap.
    """
    if fpType not in ['bv', 'count']:
        raise ValueError("Unknown Morgan fingerprint type")
    if not hasattr(mol, '_fpInfo'):
        info = {}
        # get the fingerprint (bitInfo collects (atom, radius) pairs per bit)
        if fpType == 'bv':
            molFp = rdMD.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits,
                                                       useFeatures=useFeatures, bitInfo=info)
        else:
            molFp = rdMD.GetMorganFingerprint(mol, radius, useFeatures=useFeatures,
                                              bitInfo=info)
        # construct the bit map: for each atom, the bits its environments set
        if fpType == 'bv':
            bitmap = [DataStructs.ExplicitBitVect(nBits) for _ in range(mol.GetNumAtoms())]
        else:
            bitmap = [[] for _ in range(mol.GetNumAtoms())]
        # BUG FIX: was info.iteritems(), which only exists on Python 2
        for bit, es in info.items():
            for at1, rad in es:
                if rad == 0:  # for radius 0 only the central atom carries the bit
                    if fpType == 'bv':
                        bitmap[at1][bit] = 1
                    else:
                        bitmap[at1].append(bit)
                else:  # for radii > 0 every atom in the environment carries the bit
                    env = Chem.FindAtomEnvironmentOfRadiusN(mol, rad, at1)
                    amap = {}
                    submol = Chem.PathToSubmol(mol, env, atomMap=amap)
                    for at2 in amap.keys():
                        if fpType == 'bv':
                            bitmap[at2][bit] = 1
                        else:
                            bitmap[at2].append(bit)
        mol._fpInfo = (molFp, bitmap)

    if atomId < 0:
        # no atom to remove: return the cached full fingerprint
        return mol._fpInfo[0]
    else:  # remove the bits of atomId
        if atomId >= mol.GetNumAtoms():
            raise ValueError("atom index greater than number of atoms")
        if len(mol._fpInfo) != 2:
            raise ValueError("_fpInfo not set")
        if fpType == 'bv':
            molFp = mol._fpInfo[0] ^ mol._fpInfo[1][atomId]  # xor clears that atom's bits
        else:  # count
            molFp = copy.deepcopy(mol._fpInfo[0])
            # delete the bits with atomId
            for bit in mol._fpInfo[1][atomId]:
                molFp[bit] -= 1
        return molFp
def test10BulkOps2(self):
    """Bulk similarity functions on a tuple of bit vectors must agree with
    the corresponding per-pair functions.

    Uses the modern `assertTrue` instead of the deprecated `failUnless`
    (removed from unittest in Python 3.12), matching the sibling tests.
    """
    nbits = 10000
    bvs = []
    for _ in range(10):
        bv = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            bv.SetBit(random.randrange(0, nbits))
        bvs.append(bv)
    bvs = tuple(bvs)
    # (bulk function, equivalent single-pair function)
    pairs = (
        (DataStructs.BulkTanimotoSimilarity, DataStructs.TanimotoSimilarity),
        (DataStructs.BulkDiceSimilarity, DataStructs.DiceSimilarity),
        (DataStructs.BulkAllBitSimilarity, DataStructs.AllBitSimilarity),
        (DataStructs.BulkOnBitSimilarity, DataStructs.OnBitSimilarity),
        (DataStructs.BulkRogotGoldbergSimilarity, DataStructs.RogotGoldbergSimilarity),
    )
    for bulkFn, singleFn in pairs:
        sims = bulkFn(bvs[0], bvs)
        for i in range(len(bvs)):
            self.assertTrue(feq(singleFn(bvs[0], bvs[i]), sims[i]))
    # Tversky with alpha=beta=1 reduces to Tanimoto
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
    for i in range(len(bvs)):
        self.assertTrue(feq(DataStructs.TverskySimilarity(bvs[0], bvs[i], 1, 1), sims[i]))
        self.assertTrue(feq(DataStructs.TanimotoSimilarity(bvs[0], bvs[i]), sims[i]))
    # Tversky with alpha=beta=0.5 reduces to Dice
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, .5, .5)
    for i in range(len(bvs)):
        self.assertTrue(feq(DataStructs.TverskySimilarity(bvs[0], bvs[i], .5, .5), sims[i]))
        self.assertTrue(feq(DataStructs.DiceSimilarity(bvs[0], bvs[i]), sims[i]))
def test7FPS(self):
    """Round-trip an ExplicitBitVect through the FPS hex-text format."""
    bv = DataStructs.ExplicitBitVect(32)
    for bit in (0, 1, 17, 23, 31):
        bv.SetBit(bit)
    self.assertEqual(DataStructs.BitVectToFPSText(bv), "03008280")
    roundTripped = DataStructs.CreateFromFPSText("03008280")
    self.assertEqual(bv, roundTripped)
    # an odd-length hex string is not valid FPS text
    self.assertRaises(ValueError, lambda: DataStructs.CreateFromFPSText("030082801"))
    # the empty string yields an empty (zero-bit) vector
    empty = DataStructs.CreateFromFPSText("")
    self.assertEqual(empty.GetNumBits(), 0)
def test10BulkOps3(self):
    """Bulk similarity functions must also accept a numpy object array of
    bit vectors and agree with the per-pair functions."""
    nbits = 10000
    bvs = numpy.empty((10, ), DataStructs.ExplicitBitVect)
    for idx in range(10):
        vect = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            vect.SetBit(random.randrange(0, nbits))
        bvs[idx] = vect
    # (bulk function, equivalent single-pair function)
    checks = (
        (DataStructs.BulkTanimotoSimilarity, DataStructs.TanimotoSimilarity),
        (DataStructs.BulkDiceSimilarity, DataStructs.DiceSimilarity),
        (DataStructs.BulkAllBitSimilarity, DataStructs.AllBitSimilarity),
        (DataStructs.BulkOnBitSimilarity, DataStructs.OnBitSimilarity),
        (DataStructs.BulkRogotGoldbergSimilarity, DataStructs.RogotGoldbergSimilarity),
    )
    for bulkFn, singleFn in checks:
        sims = bulkFn(bvs[0], bvs)
        for i in range(len(bvs)):
            self.assertTrue(feq(singleFn(bvs[0], bvs[i]), sims[i]))
    # Tversky with alpha=beta=1 reduces to Tanimoto
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, 1, 1)
    for i in range(len(bvs)):
        self.assertTrue(feq(DataStructs.TverskySimilarity(bvs[0], bvs[i], 1, 1), sims[i]))
        self.assertTrue(feq(DataStructs.TanimotoSimilarity(bvs[0], bvs[i]), sims[i]))
    # Tversky with alpha=beta=0.5 reduces to Dice
    sims = DataStructs.BulkTverskySimilarity(bvs[0], bvs, .5, .5)
    for i in range(len(bvs)):
        self.assertTrue(feq(DataStructs.TverskySimilarity(bvs[0], bvs[i], .5, .5), sims[i]))
        self.assertTrue(feq(DataStructs.DiceSimilarity(bvs[0], bvs[i]), sims[i]))
def testNonUniqueCrash(self):
    """Exercise the diversity pickers on an input where every bit vector
    appears twice (exact duplicates must not crash the pickers)."""
    from rdkit import DataStructs
    sz = 300
    nbits = 40
    nBitsToSet = int(nbits * .3)
    N = 8
    vs = []
    for i in range(sz):
        bv = DataStructs.ExplicitBitVect(nbits)
        for j in range(nBitsToSet):
            val = int(nbits * random.random())
            bv.SetBit(val)
        # append the same object twice so the pool contains exact duplicates
        vs.append(bv)
        vs.append(bv)

    def taniFunc(i, j, bvs=vs):
        # distance = 1 - similarity
        d = 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
        return d

    picker = rdSimDivPickers.MaxMinPicker()
    mm1 = picker.LazyPick(taniFunc, len(vs), N)
    self.assertEqual(len(mm1), N)
    picker = None
    picker = rdSimDivPickers.MaxMinPicker()
    mm2 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm2), N)
    picker = rdSimDivPickers.MaxMinPicker()
    mm3 = picker.LazyBitVectorPick(vs, len(vs), N)
    self.assertEqual(len(mm3), N)
    # we get the occasional dupe randomly,
    # make sure we don't get three dupes in a row
    # BUG FIX: the original closed assertTrue() after the first comparison, so
    # the `or (...)` part was a free-standing no-op expression; both
    # comparisons now live inside the assertion.
    self.assertTrue((tuple(mm2) != tuple(mm1)) or (tuple(mm3) != tuple(mm1)))
    picker = None
    # build the full pairwise distance matrix for the hierarchical picker
    ds = []
    nvs = len(vs)
    for i in range(nvs):
        for j in range(i + 1, nvs):
            d = taniFunc(i, j)
            ds.append(d)
    m = numpy.array(ds)
    picker = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    p1 = list(picker.Pick(m, nvs, N))
def process_one_target(file, targets, top_n_matches, outfile=None):
    """Score every fingerprint in a pickled file against each target.

    Parameters:
      file -- path to a pickle holding an iterable of (smiles, identifier,
              base64-encoded fingerprint) tuples
      targets -- dict mapping a target SMILES string to its target bit vector
      top_n_matches -- how many of the best-scoring entries to keep per target
      outfile -- optional path; if given, the results dict is pickled there

    Returns outfile when one was written, otherwise a dict mapping each
    target SMILES to a list of (smiles, score, identifier) tuples sorted
    ascending by score (best matches last).
    """
    import base64
    import pickle
    from operator import itemgetter
    from rdkit import DataStructs

    target_results = {}
    # BUG FIX: pickle.load(open(...)) leaked the file handle
    with open(file, 'rb') as fh:
        read = pickle.load(fh)
    fingerprint_set = []
    for sm, identifier, fp in read:
        # some entries do not convert; keep them with a None fingerprint
        try:
            bv = DataStructs.ExplicitBitVect(base64.b64decode(fp))
        except Exception:
            bv = None
        fingerprint_set.append((sm, bv, identifier))
    for smile_target in targets:
        bit_target = targets[smile_target]
        # Find scores for non-None fingerprints in the fingerprint set
        scores = []
        for (smile, fingerprint, identifier) in fingerprint_set:
            try:
                score = DataStructs.TanimotoSimilarity(fingerprint, bit_target)
                scores.append((smile, score, identifier))
            except Exception:
                # None fingerprints (failed conversions) are skipped
                pass
        sorted_scores = sorted(scores, key=itemgetter(1))
        # keep the top_n_matches highest scores (still sorted ascending)
        target_results[smile_target] = sorted_scores[-top_n_matches:]
    if outfile:
        with open(outfile, 'wb') as f:
            pickle.dump(target_results, f)
        return outfile
    return target_results
def generateIFP(self, ligand, protein):
    """Generates the complete IFP from each residue's bitstring.

    Residues are visited in residue-number order; the concatenated
    bitstrings form both the textual IFP and the bit vector, which are
    stored on the ligand before the vector is returned.
    """
    totalBits = len(self.interactions) * len(protein.residues)
    IFPvector = DataStructs.ExplicitBitVect(totalBits)
    IFP = ''
    bitIndex = 0
    for residue in sorted(protein.residues, key=get_resnumber):
        bitstring = self.generateBitstring(ligand, protein.residues[residue])
        for ch in bitstring:
            if ch == '1':
                IFPvector.SetBit(bitIndex)
            bitIndex += 1
        IFP += bitstring
    ligand.setIFP(IFP, IFPvector)
    return IFPvector
def MapToClusterFP(self, fp):
    """Map the fingerprint to a smaller sized (= number of clusters) fingerprint.

    Each cluster gets one bit in the new fingerprint; that bit is turned on
    if any of the cluster's member bits are on in the original fingerprint.
    """
    ebv = DataStructs.ExplicitBitVect(self._nClusters)
    for clusterIdx, cluster in enumerate(self._clusters):
        # set the cluster bit as soon as any member bit is on
        if any(fp[bitId] for bitId in cluster):
            ebv.SetBit(clusterIdx)
    return ebv
def _BuildFp(self,data):
    """Build a fingerprint object from one database row (Python 2 variant).

    The column at self.fpCol holds either a cPickle string or raw
    ExplicitBitVect bytes (selected by self._usePickles); the remaining
    columns are attached to the fingerprint as _fieldsFromDb.
    Returns the fingerprint, or None if deserialization fails.
    """
    data = list(data)
    pkl = str(data[self.fpCol])
    del data[self.fpCol]
    self._numProcessed += 1
    try:
        if self._usePickles:
            newFp = cPickle.loads(pkl)
        else:
            newFp = DataStructs.ExplicitBitVect(pkl)
    # BUG FIX: bare `except:` also swallowed KeyboardInterrupt/SystemExit
    except Exception:
        import traceback
        traceback.print_exc()
        newFp = None
    if newFp:
        # keep the non-fingerprint columns alongside the fingerprint
        newFp._fieldsFromDb = data
    return newFp
def _BuildFp(self, data):
    """Build a fingerprint object from one database row (Python 3 variant).

    The column at self.fpCol holds either a pickle or raw ExplicitBitVect
    bytes (selected by self._usePickles); the remaining columns are
    attached to the fingerprint as _fieldsFromDb. Returns None on failure.
    """
    fields = list(data)
    # Latin1 maps bytes 0-255 one-to-one, recovering the raw payload
    pkl = bytes(fields[self.fpCol], encoding='Latin1')
    del fields[self.fpCol]
    self._numProcessed += 1
    try:
        newFp = (pickle.loads(pkl, encoding='bytes')
                 if self._usePickles
                 else DataStructs.ExplicitBitVect(pkl))
    except Exception:
        import traceback
        traceback.print_exc()
        newFp = None
    if newFp:
        # keep the non-fingerprint columns alongside the fingerprint
        newFp._fieldsFromDb = fields
    return newFp
def test4ebv(self):
    """Tanimoto distance and similarity matrices must sum to 1 pairwise."""
    nVects = 30
    nBits = 2048
    nToSet = 800
    vects = []
    for _ in range(nVects):
        v = DataStructs.ExplicitBitVect(nBits)
        for _ in range(nToSet):
            v.SetBit(random.randrange(0, nBits))
        vects.append(v)
    dMat = rdmmc.GetTanimotoDistMat(vects)
    sMat = rdmmc.GetTanimotoSimMat(vects)
    # both matrices are flattened upper triangles of the same pair order
    for idx in range(nVects * (nVects - 1) // 2):
        assert feq(sMat[idx] + dMat[idx], 1.0)
def CalculateFP3Fingerprint(mol: Chem.Mol, rtype: str = 'bitstring') -> Tuple[str, dict, Any]:
    """Calculate FP3 fingerprints (55 bits).

    :param rtype: Type of output, may either be:
                  bitstring (default), returns a binary string
                  rdkit, return the native rdkit DataStructs
                  dict, for a dict of bits turned on
    """
    obMol = pybel.readstring('smi', Chem.MolToSmiles(mol))
    onBits = obMol.calcfp('FP3').bits
    if rtype == 'dict':
        return {f'FP3_{i}': 1 for i in onBits}
    bv = DataStructs.ExplicitBitVect(55)
    # OpenBabel bit indices are 1-based; shift to 0-based for RDKit
    bv.SetBitsFromList([b - 1 for b in onBits])
    return bv if rtype == 'rdkit' else bv.ToBitString()
def __setstate__(self, pkl):
    """Restore the object from its pickled byte string (Python 2 variant).

    Payload layout (little-endian): an unsigned record count, then for each
    record an unsigned key, an unsigned byte length, and that many raw
    ExplicitBitVect bytes.
    """
    self.__vects = {}
    self.__orVect = None
    self.__numBits = -1
    self.__needReset = True
    szI = struct.calcsize('I')
    offset = 0
    # number of (key, vector) records that follow
    nToRead = struct.unpack('<I', pkl[offset:offset + szI])[0]
    offset += szI
    for i in range(nToRead):
        # record key
        k = struct.unpack('<I', pkl[offset:offset + szI])[0]
        offset += szI
        # byte length of the serialized vector
        l = struct.unpack('<I', pkl[offset:offset + szI])[0]
        offset += szI
        sz = struct.calcsize('%ds' % l)
        # rebuild the bit vector from its raw serialized bytes
        bv = DataStructs.ExplicitBitVect(
            struct.unpack('%ds' % l, pkl[offset:offset + sz])[0])
        offset += sz
        self.AddVect(k, bv)
def __setstate__(self, pkl):
    """Restore the object from its pickled byte string (Python 3 aware).

    Payload layout (little-endian): an unsigned record count, then for each
    record an unsigned key, an unsigned byte length, and that many raw
    ExplicitBitVect bytes.
    """
    if six.PY3 and isinstance(pkl, str):
        # Latin1 maps code points 0-255 one-to-one back to raw bytes
        pkl = bytes(pkl, encoding='Latin1')
    self.__vects = {}
    self.__orVect = None
    self.__numBits = -1
    self.__needReset = True
    uintSize = struct.calcsize('I')
    pos = 0

    def readUInt(p):
        # read one little-endian unsigned int, return (value, next offset)
        return struct.unpack('<I', pkl[p:p + uintSize])[0], p + uintSize

    count, pos = readUInt(pos)
    for _ in range(count):
        key, pos = readUInt(pos)
        nBytes, pos = readUInt(pos)
        recSize = struct.calcsize('%ds' % nBytes)
        raw = struct.unpack('%ds' % nBytes, pkl[pos:pos + recSize])[0]
        pos += recSize
        self.AddVect(key, DataStructs.ExplicitBitVect(raw))
def testNonUniqueCrash(self):
    """Exercise the pickers on a pool where every vector appears twice and
    more picks are requested than there are unique vectors.

    Uses `assertTrue`/`fail` instead of the deprecated `failUnless`
    (removed from unittest in Python 3.12) and avoids the bare `except:`
    that also caught KeyboardInterrupt/SystemExit. The original also left
    `mm` unbound when an exception occurred; the restructure fixes that.
    """
    from rdkit import DataStructs
    sz = 10
    nbits = 20
    nBitsToSet = int(nbits * .3)
    N = 12
    vs = []
    for i in range(sz):
        bv = DataStructs.ExplicitBitVect(nbits)
        for j in range(nBitsToSet):
            val = int(nbits * random.random())
            bv.SetBit(val)
        # append the same object twice so the pool contains exact duplicates
        vs.append(bv)
        vs.append(bv)

    def taniFunc(i, j, bvs=vs):
        # distance = 1 - similarity
        d = 1 - DataStructs.FingerprintSimilarity(bvs[i], bvs[j])
        return d

    picker = rdSimDivPickers.MaxMinPicker()
    try:
        mm = picker.LazyPick(taniFunc, len(vs), N)
    except Exception:
        self.fail("LazyPick raised on a pool with duplicate vectors")
    self.assertEqual(len(mm), N)
    picker = None
    # build the full pairwise distance matrix for the hierarchical picker
    ds = []
    nvs = len(vs)
    for i in range(nvs):
        for j in range(i + 1, nvs):
            d = taniFunc(i, j)
            ds.append(d)
    m = numpy.array(ds)
    picker = rdSimDivPickers.HierarchicalClusterPicker(
        rdSimDivPickers.ClusterMethod.WARD)
    p1 = list(picker.Pick(m, nvs, N))
def test11BulkNeighbors(self):
    """For every metric, the *SimilarityNeighbors helper must return the
    same (index, similarity) best match as a manual scan of the bulk
    similarities."""
    nbits = 2048
    pool = []
    for _ in range(1000):
        vect = DataStructs.ExplicitBitVect(nbits)
        for _ in range(nbits):
            vect.SetBit(random.randrange(0, nbits))
        pool.append(vect)
    queries = pool[:10]
    db = pool[10:]
    metrics = ('Tanimoto', 'Cosine', 'Kulczynski', 'Dice', 'Sokal',
               'McConnaughey', 'Asymmetric', 'BraunBlanquet', 'Russel',
               'RogotGoldberg')
    for metric in metrics:
        bulkSim = getattr(DataStructs, f'Bulk{metric}Similarity')
        nbrSim = getattr(DataStructs, f'{metric}SimilarityNeighbors')
        expected = []
        for q in queries:
            sims = bulkSim(q, db)
            # highest similarity wins; ties break toward the lowest index
            # (hence the negated index inside the max key)
            bestSim, negIdx = max((s, -i) for i, s in enumerate(sims))
            expected.append((-negIdx, bestSim))
        self.assertEqual(expected, nbrSim(queries, db))
def test1Cluster(self):
    """Cluster correlated bits and check the cluster scores/fingerprint
    mapping (legacy Python 2 variant: uses true-division sizes and
    range(...) + range(...) concatenation, which only work on Python 2)."""
    # BitClusterer is an optional dependency; skip silently when absent
    if BitClusterer is None:
        return
    cmg = rdInfoTheory.BitCorrMatGenerator()
    cmg.SetBitList(self.blist)
    for fp in self.fps:
        cmg.CollectVotes(fp)
    corrMat = cmg.GetCorrMatrix()
    bcl = BitClusterer.BitClusterer(self.blist, self.nbits / 2)
    bcl.ClusterBits(corrMat)
    cls = bcl.GetClusters()
    # the fixture is built so each cluster pairs bit i with bit i + nbits/2
    for cl in cls:
        assert len(cl) == 2
        assert (cl[0] + self.nbits / 2) == cl[1]
    tfp = DataStructs.ExplicitBitVect(self.nbits)
    # turn on the first quarter of each half of the bit range
    obits = range(0, self.nbits / 4) + range(self.nbits / 2, 3 * self.nbits / 4)
    tfp.SetBitsFromList(obits)
    rvc = bcl.MapToClusterScores(tfp)
    assert len(rvc) == self.nbits / 2
    # both members of the first quarter's clusters are on -> score 2
    for i in range(self.nbits / 2):
        if i < self.nbits / 4:
            assert rvc[i] == 2
        else:
            assert rvc[i] == 0
    nfp = bcl.MapToClusterFP(tfp)
    assert len(nfp) == self.nbits / 2
    for i in range(self.nbits / 2):
        if i < self.nbits / 4:
            assert nfp[i]
        else:
            assert not nfp[i]
def test1Cluster(self):
    """Cluster correlated bits and verify cluster membership, the cluster
    score mapping, and the cluster fingerprint mapping."""
    cmg = rdInfoTheory.BitCorrMatGenerator()
    cmg.SetBitList(self.blist)
    for fp in self.fps:
        cmg.CollectVotes(fp)
    corrMat = cmg.GetCorrMatrix()
    half = self.nbits // 2
    quarter = self.nbits // 4
    bcl = BitClusterer.BitClusterer(self.blist, half)
    bcl.ClusterBits(corrMat)
    clusters = bcl.GetClusters()
    # the fixture pairs bit i with bit i + nbits//2
    for cluster in clusters:
        self.assertEqual(len(cluster), 2)
        self.assertEqual(cluster[0] + half, cluster[1])
    bcl.SetClusters(clusters)
    # a cluster list of the wrong length must be rejected
    self.assertRaises(AssertionError, bcl.SetClusters, clusters[:-1])
    tfp = DataStructs.ExplicitBitVect(self.nbits)
    # turn on the first quarter of each half of the bit range
    onBits = list(range(0, quarter)) + list(range(half, 3 * self.nbits // 4))
    tfp.SetBitsFromList(onBits)
    scores = bcl.MapToClusterScores(tfp)
    self.assertEqual(len(scores), half)
    # both members of each first-quarter cluster are on -> score 2, else 0
    for i in range(half):
        self.assertEqual(scores[i], 2 if i < quarter else 0)
    clusterFp = bcl.MapToClusterFP(tfp)
    self.assertEqual(len(clusterFp), half)
    for i in range(half):
        if i < quarter:
            self.assertTrue(clusterFp[i])
        else:
            self.assertFalse(clusterFp[i])
def partialSimilarity(atomID):
    """ Determine similarity for the atoms set by atomID """
    # build a fingerprint holding only the bits contributed by this atom
    maskedFP = DataStructs.ExplicitBitVect(1024)
    maskedFP.SetBitsFromList(aBits[atomID])
    # alpha=0, beta=1: asymmetric Tversky weighting toward the masked FP
    return DataStructs.TverskySimilarity(subsFp, maskedFP, 0, 1)
fpq = [ Chem.RDKFingerprint(x, fpSize=2048, minPath=7, useBondOrder=False, nBitsPerHash=1, minSize=2048, useHs=False) for x in MyMolList ] #fpq = [Chem.PatternFingerprint(x,fpSize=2048) for x in MyMolList] AllQBits = [list(fpqe.GetOnBits()) for fpqe in fpq] AllQBits = list(set([j for i in AllQBits for j in i])) if len(AllQBits) == 0: AllQBits = [2047] bv = DataStructs.ExplicitBitVect(2048) bv.SetBitsFromList(AllQBits) print " the number of compounds generated from the compound used to query: ", len( MyMolList) print len(AllQBits) print Chem.MolToSmiles(MyMol) print len(MyMolFp.GetOnBits()) ################################ End Query Ensemble Creation ################################################################ ############################ Find the AntiSmash compounds with the most overlap with the query ensemble fingerprint ######### print " Culling the AntiSmash compounds " print " examining fingerprints : ", len(fps) MySearchList = [] similarity = 0.17
def test3Bounds(self):
    """Indexing inside the vector succeeds; past-the-end raises IndexError."""
    nbits = 10
    bv1 = DataStructs.ExplicitBitVect(nbits)
    bv1[0]  # in-range read; result unused, it just must not raise
    with self.assertRaisesRegex(IndexError, ""):
        bv1[11]  # well past the end of the 10-bit vector
def test01BVWithAllOnes(self):
    """An ExplicitBitVect constructed with all=True has every bit set."""
    bv1 = DataStructs.ExplicitBitVect(10, True)
    for i in range(10):
        # bare `assert` is stripped when Python runs with -O; use the
        # unittest assertion so the check always executes and reports well
        self.assertEqual(bv1.GetBit(i), 1)