def test3FPgenerator(self):
    """Check fingerprints generated from a freshly built fragment catalog.

    Builds a catalog from every molecule supplied by ``self.smiName``,
    verifies that it holds 21 entries / 21 fingerprint bits, and then
    compares each molecule's fingerprint with the hard-coded expected
    on-bit counts (``obits``) and on-bit index tuples (``obls``).
    Molecules beyond the end of the expectation lists are fingerprinted
    but not checked.
    """
    # presumably (minPath, maxPath, functional-group file) -- confirm
    # against the FragCatParams signature.
    fparams = FragmentCatalog.FragCatParams(1, 6, self.fName)
    fcat = FragmentCatalog.FragCatalog(fparams)
    fgen = FragmentCatalog.FragCatGenerator()
    suppl = Chem.SmilesMolSupplier(self.smiName, " ", 0, 1, 0)

    # Populate the catalog and remember the SMILES written back for each
    # molecule so the fingerprints can be regenerated from them below.
    smiles = []
    for mol in suppl:
        fgen.AddFragsFromMol(mol, fcat)
        smiles.append(Chem.MolToSmiles(mol))
    assert fcat.GetNumEntries() == 21
    assert fcat.GetFPLength() == 21, fcat.GetFPLength()

    fpgen = FragmentCatalog.FragFPGenerator()
    # Expected number of on bits, one entry per molecule.
    obits = [3, 2, 3, 3, 2, 3, 5, 5, 5, 4, 5, 6]
    # Expected on-bit indices, one tuple per molecule.
    obls = [(0, 1, 2), (1, 3), (1, 4, 5), (1, 6, 7), (0, 8), (0, 6, 9),
            (0, 1, 2, 3, 10), (0, 1, 2, 8, 11), (1, 3, 4, 5, 12),
            (1, 4, 5, 13), (1, 3, 6, 7, 14), (0, 1, 6, 7, 9, 15)]
    for i, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        fp = fpgen.GetFPForMol(mol, fcat)
        if i < len(obits):
            assert fp.GetNumOnBits() == obits[i], '%s: %s' % (
                smi, str(fp.GetOnBits()))
        obl = fp.GetOnBits()
        if i < len(obls):
            assert tuple(obl) == obls[i], '%s: %s' % (smi, obl)
def test4Serialize(self):
    """Check that a fragment catalog survives a pickle round trip.

    Builds a catalog from the molecules supplied by ``self.smiName``,
    pickles and unpickles it, and asserts that the copy reports the same
    size (21 entries / 21 bits) and yields the same fingerprint (on-bit
    count and on-bit indices) as the original for every molecule.
    """
    # presumably (minPath, maxPath, functional-group file) -- confirm
    # against the FragCatParams signature.
    fparams = FragmentCatalog.FragCatParams(1, 6, self.fName)
    fcat = FragmentCatalog.FragCatalog(fparams)
    fgen = FragmentCatalog.FragCatGenerator()
    suppl = Chem.SmilesMolSupplier(self.smiName, " ", 0, 1, 0)

    smiles = []
    for mol in suppl:
        fgen.AddFragsFromMol(mol, fcat)
        smiles.append(Chem.MolToSmiles(mol))
    assert fcat.GetNumEntries() == 21
    assert fcat.GetFPLength() == 21, fcat.GetFPLength()

    # Round-trip the catalog through pickle.
    pkl = cPickle.dumps(fcat)
    fcat2 = cPickle.loads(pkl)
    assert fcat2.GetNumEntries() == 21
    assert fcat2.GetFPLength() == 21, fcat2.GetFPLength()

    # Both catalogs must produce identical fingerprints.
    fpgen = FragmentCatalog.FragFPGenerator()
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        fp1 = fpgen.GetFPForMol(mol, fcat)
        fp2 = fpgen.GetFPForMol(mol, fcat2)
        assert fp1.GetNumOnBits() == fp2.GetNumOnBits()
        obl1 = fp1.GetOnBits()
        obl2 = fp2.GetOnBits()
        assert tuple(obl1) == tuple(obl2)
def ScoreMolecules(suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2, reportFreq=10):
    """ scores the compounds in a supplier using a catalog

      **Arguments**

        - suppl: a mol supplier

        - catalog: the FragmentCatalog

        - maxPts: (optional) the maximum number of molecules to be
          considered.  NOTE: this argument is accepted for interface
          compatibility but is not currently read by the implementation.

        - actName: (optional) the name of the molecule's activity property.
          If this is not provided, the molecule's last property will be used.

        - acts: (optional) a sequence of activity values (integers).
          If not provided, the activities will be read from the molecules.

        - nActs: (optional) number of possible activity values

        - reportFreq: (optional) how often to display status information

      **Returns**

        a 2-tuple:

          1) the results table (a 3D array of ints nBits x 2 x nActs)

          2) a list containing the on bit lists for each molecule
             (an empty list for entries the supplier could not parse)

    """
    nBits = catalog.GetFPLength()
    # The ``numpy.int`` alias was removed from NumPy; the builtin ``int``
    # selects the same default integer dtype.
    resTbl = numpy.zeros((nBits, 2, nActs), int)
    obls = []

    if not actName and not acts:
        # Fall back to the last property of the first molecule.
        actName = suppl[0].GetPropNames()[-1]

    fpgen = FragmentCatalog.FragFPGenerator()
    suppl.reset()
    # The counter is 1-based, so ``acts[i - 1]`` is the current molecule's
    # activity and the progress message fires after every reportFreq mols.
    for i, mol in enumerate(suppl, 1):
        if not i % reportFreq:
            message('Done %d.\n' % (i))
        if mol:
            if not acts:
                act = int(mol.GetProp(actName))
            else:
                act = acts[i - 1]
            fp = fpgen.GetFPForMol(mol, catalog)
            onBits = list(fp.GetOnBits())
            obls.append(onBits)
            # Start by counting the molecule as "bit off" for every bit ...
            resTbl[:, 0, act] += 1
            # ... then move it to the "bit on" column for each set bit.
            # NOTE(review): the table is indexed with ``id_ - 1``, so an on
            # bit with id 0 wraps around to the last row -- confirm that
            # this offset is intended.
            for id_ in onBits:
                resTbl[id_ - 1, 0, act] -= 1
                resTbl[id_ - 1, 1, act] += 1
        else:
            obls.append([])
    return resTbl, obls
def CalcGains(suppl, catalog, topN=-1, actName='', acts=None,
              nActs=2, reportFreq=10, biasList=None, collectFps=0):
    """ calculates info gains by constructing fingerprints

      **Arguments**

        - suppl: a mol supplier

        - catalog: the FragmentCatalog used to generate the fingerprints

        - topN: (optional) the number of top-ranked bits to request from
          the ranker.  A negative value requests all bits in the catalog.

        - actName: (optional) the name of the molecule's activity property.
          If neither this nor acts is provided, the last property of the
          first molecule is used.

        - acts: (optional) a sequence of activity values, indexed in
          supplier order.  If not provided, the activities are read from
          the molecules.

        - nActs: (optional) number of possible activity values

        - reportFreq: (optional) how often to display status information

        - biasList: (optional) if provided, a biased-entropy ranker is
          used and this list is passed to its SetBiasList()

        - collectFps: (optional) if true, every generated fingerprint is
          kept and returned

      **Returns**

        a 2-tuple:

          1) gains matrix (the result of the ranker's GetTopN())

          2) list of fingerprints (empty unless collectFps is set)

      **Raises**

        KeyError if a molecule lacks the activity property

    """
    nBits = catalog.GetFPLength()
    if topN < 0:
        topN = nBits
    if not actName and not acts:
        actName = suppl[0].GetPropNames()[-1]

    # Only some suppliers know their length; -1 selects the short message.
    if hasattr(suppl, '__len__'):
        nMols = len(suppl)
    else:
        nMols = -1

    fpgen = FragmentCatalog.FragFPGenerator()
    if biasList:
        ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY)
        ranker.SetBiasList(biasList)
    else:
        ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY)

    fps = []
    for i, mol in enumerate(suppl):
        if not acts:
            try:
                act = int(mol.GetProp(actName))
            except KeyError:
                message('ERROR: Molecule has no property: %s\n' % (actName))
                message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames())))
                raise KeyError(actName)
        else:
            act = acts[i]
        if i and not i % reportFreq:
            if nMols > 0:
                message('Done %d of %d.\n' % (i, nMols))
            else:
                message('Done %d.\n' % (i))
        fp = fpgen.GetFPForMol(mol, catalog)
        ranker.AccumulateVotes(fp, act)
        if collectFps:
            fps.append(fp)
    gains = ranker.GetTopN(topN)
    return gains, fps
def _test5MoreComplex(self):
    """Check that each molecule's fingerprint sets the bits it contributed.

    While the molecules in ``self.smiList`` are added to ``self.fragCat``,
    the span of entry indices produced by each molecule is recorded
    (assuming new entries are numbered consecutively).  A second pass then
    asserts that every index in a molecule's span is on in its fingerprint.
    """
    smilesText = '\n'.join(self.smiList)
    suppl = Chem.SmilesMolSupplierFromText(smilesText, ',', 0, -1, 0)

    # molecule position -> range of catalog indices attributed to it
    ranges = {}
    nextIdx = 0
    for pos, mol in enumerate(suppl):
        nAdded = self.fgen.AddFragsFromMol(mol, self.fragCat)
        stop = nextIdx + nAdded
        ranges[pos] = range(nextIdx, stop)
        nextIdx = stop

    # now make sure that those bits are contained in the signatures:
    fpgen = FragmentCatalog.FragFPGenerator()
    for pos, mol in enumerate(suppl):
        fp = fpgen.GetFPForMol(mol, self.fragCat)
        for bit in ranges[pos]:
            assert fp[bit], '%s: %s' % (Chem.MolToSmiles(mol), str(bit))
def _testBits(self, fragCat):
    """Compare fingerprints of ``self.smiList2`` with the expected bits.

    For each molecule, a fingerprint is generated against ``fragCat`` and
    checked against the expected on-bit count (``obits``) and the expected
    on-bit index tuple (``self.list2Obls``).  Molecules beyond the end of
    either expectation list skip the corresponding check.

    Arguments:
      - fragCat: the fragment catalog used to generate the fingerprints
    """
    fpgen = FragmentCatalog.FragFPGenerator()
    # Expected number of on bits, one entry per molecule.
    obits = [3, 2, 3, 3, 2, 3, 5, 5, 5, 4, 5, 6]
    obls = self.list2Obls
    suppl = Chem.SmilesMolSupplierFromText('\n'.join(self.smiList2), ',', 0, -1, 0)
    for i, mol in enumerate(suppl):
        fp = fpgen.GetFPForMol(mol, fragCat)
        # Computed for every molecule: both failure messages below use it,
        # and the two expectation lists are not guaranteed to be the same
        # length.
        smi = Chem.MolToSmiles(mol)
        if i < len(obits):
            assert fp.GetNumOnBits() == obits[i], '%s: %s' % (smi, str(fp.GetOnBits()))
        obl = fp.GetOnBits()
        if i < len(obls):
            assert tuple(obl) == obls[i], '%s: %s' % (smi, obl)
def test9Issue116(self):
    """Regression check (issue 116) using BuildFragmentCatalog.BuildCatalog.

    A catalog built from toluene with minPath=maxPath=2 must have two
    bits, the first described as 'ccC'.  Toluene must set both bits;
    biphenyl must set only the second.
    """
    source = '\n'.join(['Cc1ccccc1'])
    suppl = Chem.SmilesMolSupplierFromText(source, ',', 0, -1, 0)
    cat = BuildFragmentCatalog.BuildCatalog(suppl, minPath=2, maxPath=2)
    assert cat.GetFPLength() == 2
    assert cat.GetBitDescription(0) == 'ccC'

    fpgen = FragmentCatalog.FragFPGenerator()
    # (SMILES, whether bit 0 is expected to be on); bit 1 is always on.
    cases = (
        ('Cc1ccccc1', True),
        ('c1ccccc1-c1ccccc1', False),
    )
    for smi, firstBitOn in cases:
        fp = fpgen.GetFPForMol(Chem.MolFromSmiles(smi), cat)
        assert bool(fp[0]) is firstBitOn
        assert fp[1]
def test7Issue116(self):
    """Regression check (issue 116) using a hand-built fragment catalog.

    Fragments of toluene are added to a catalog created from
    ``FragCatParams(2, 2, self.fName, 1.0e-8)``.  The catalog must have
    two bits, the first described as 'ccC'.  Toluene must set both bits;
    biphenyl must set only the second.
    """
    source = '\n'.join(['Cc1ccccc1'])
    suppl = Chem.SmilesMolSupplierFromText(source, ',', 0, -1, 0)
    params = FragmentCatalog.FragCatParams(2, 2, self.fName, 1.0e-8)
    cat = FragmentCatalog.FragCatalog(params)
    catGen = FragmentCatalog.FragCatGenerator()
    for mol in suppl:
        catGen.AddFragsFromMol(mol, cat)
    self.assertEqual(cat.GetFPLength(), 2)
    self.assertEqual(cat.GetBitDescription(0), 'ccC')

    fpgen = FragmentCatalog.FragFPGenerator()
    # (SMILES, expected values of bits 0 and 1)
    cases = (
        ('Cc1ccccc1', (1, 1)),
        ('c1ccccc1-c1ccccc1', (0, 1)),
    )
    for smi, wanted in cases:
        fp = fpgen.GetFPForMol(Chem.MolFromSmiles(smi), cat)
        for bitIdx, value in enumerate(wanted):
            self.assertEqual(fp[bitIdx], value)
ms = [Chem.MolFromSmiles('OCC(NC1CC1)CCC'), Chem.MolFromSmiles('OCC=CC(=O)O')] # 片段存储器 fcat = FragmentCatalog.FragCatalog(fparams) # 片段生成器 for m in ms: fcgen.AddFragsFromMol(m, fcat) # 查看分子片段数量 num_entries = fcat.GetNumEntries() print(num_entries) # 17 # 存储器收集完所有片段后 , 再用它来生成分子指纹 # 创建一个片段指纹生成器:FragFPGenerator() fpgen = FragmentCatalog.FragFPGenerator() # 传入分子和存储器用于生成指纹:GetFPForMol(mol,fcat) fp1 = fpgen.GetFPForMol(ms[1], fcat) # 以字符串形式查看指纹:ToBitString() print(fp1.ToBitString()) # 10000000000000011 # 查看指纹中哪些位是有效的:GetOnBits() print(list(fp1.GetOnBits())) # [0, 15, 16] # 可以用处理一般分子指纹的方法来处理片段分子指纹,例如寻找相同的片段 # 先对分子指纹做“&”位运算,两个指纹结果都为1时,结果为1,否则为0 # 获取两个指纹中都出现的片段:GetOnBits() # 查看片段信息:GetEnteyDescription() fp0 = fpgen.GetFPForMol(ms[0], fcat)