def test3FPgenerator(self):
    """Check fingerprints generated against a catalog built from ``self.smiName``.

    Every molecule in the SMILES file is added to a catalog built with
    fragment lengths 1-6, the catalog is expected to end up with 21
    entries/bits, and the first twelve molecules are compared against
    hard-coded on-bit counts and on-bit lists.
    """
    # The contents are never used below; the read is kept so an unreadable
    # file still fails here as before.  The context manager closes the handle
    # instead of leaving it to the garbage collector.
    with open(self.smiName, 'r') as smiF:
        smiF.readlines()

    fparams = FragmentCatalog.FragCatParams(1, 6, self.fName)
    fcat = FragmentCatalog.FragCatalog(fparams)
    fgen = FragmentCatalog.FragCatGenerator()
    suppl = Chem.SmilesMolSupplier(self.smiName, " ", 0, 1, 0)

    # Populate the catalog and remember each molecule's SMILES for the
    # fingerprinting pass below.
    smiles = []
    for mol in suppl:
        fgen.AddFragsFromMol(mol, fcat)
        smiles.append(Chem.MolToSmiles(mol))
    assert fcat.GetNumEntries() == 21
    assert fcat.GetFPLength() == 21, fcat.GetFPLength()

    fpgen = FragmentCatalog.FragFPGenerator()
    # Expected number of on bits / expected on-bit ids, per molecule.
    obits = [3, 2, 3, 3, 2, 3, 5, 5, 5, 4, 5, 6]
    obls = [(0, 1, 2), (1, 3), (1, 4, 5), (1, 6, 7), (0, 8), (0, 6, 9),
            (0, 1, 2, 3, 10), (0, 1, 2, 8, 11), (1, 3, 4, 5, 12),
            (1, 4, 5, 13), (1, 3, 6, 7, 14), (0, 1, 6, 7, 9, 15)]
    for i, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        fp = fpgen.GetFPForMol(mol, fcat)
        # Molecules beyond the reference tables are fingerprinted but not checked.
        if i < len(obits):
            assert fp.GetNumOnBits() == obits[i], '%s: %s' % (smi, str(fp.GetOnBits()))
        obl = fp.GetOnBits()
        if i < len(obls):
            assert tuple(obl) == obls[i], '%s: %s' % (smi, obl)
def setUp(self):
    """Create the SMILES fixtures and a fresh fragment catalog for each test.

    Sets ``smiList``, ``smiList2``, ``list2Acts`` and ``list2Obls`` (reference
    data), plus ``catParams``, ``fragCat`` and ``fgen`` built from RDKit's
    ``FunctionalGroups.txt`` with fragment lengths 1-6.
    """
    # Catalog machinery shared by the tests.
    groupFile = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
    self.catParams = FragmentCatalog.FragCatParams(1, 6, groupFile)
    self.fragCat = FragmentCatalog.FragCatalog(self.catParams)
    self.fgen = FragmentCatalog.FragCatGenerator()

    # Larger, drug-like molecules.
    self.smiList = [
        "S(SC1=NC2=CC=CC=C2S1)C3=NC4=C(S3)C=CC=C4",
        "CC1=CC(=O)C=CC1=O",
        "OC1=C(Cl)C=C(C=C1[N+]([O-])=O)[N+]([O-])=O",
        "[O-][N+](=O)C1=CNC(=N)S1",
        "NC1=CC2=C(C=C1)C(=O)C3=C(C=CC=C3)C2=O",
        "OC(=O)C1=C(C=CC=C1)C2=C3C=CC(=O)C(=C3OC4=C2C=CC(=C4Br)O)Br",
        "CN(C)C1=C(Cl)C(=O)C2=C(C=CC=C2)C1=O",
        "CC1=C(C2=C(C=C1)C(=O)C3=CC=CC=C3C2=O)[N+]([O-])=O",
        "CC(=NO)C(C)=NO",
    ]

    # Small molecules, one activity value per entry in ``list2Acts``.
    self.smiList2 = [
        'OCCC', 'CCC', 'C=CC', 'OC=CC',
        'CC(O)C', 'C=C(O)C', 'OCCCC', 'CC(O)CC',
        'C=CCC', 'CC=CC', 'OC=CCC', 'CC=C(O)C',
        'OCC=CC', 'C=C(O)CC', 'C=CC(O)C', 'C=CCCO',
    ]
    self.list2Acts = [1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1]

    # Reference on-bit lists for the first twelve molecules of ``smiList2``.
    self.list2Obls = [
        (0, 1, 2),
        (1, 3),
        (1, 4, 5),
        (1, 6, 7),
        (0, 8),
        (0, 6, 9),
        (0, 1, 2, 3, 10),
        (0, 1, 2, 8, 11),
        (1, 3, 4, 5, 12),
        (1, 4, 5, 13),
        (1, 3, 6, 7, 14),
        (0, 1, 6, 7, 9, 15),
    ]
def test4Serialize(self):
    """Check that a pickled-and-restored catalog behaves like the original.

    A catalog is built from ``self.smiName``, round-tripped through
    ``cPickle``, and both copies must report 21 entries/bits and produce
    identical fingerprints for every molecule.
    """
    # The contents are never used below; the read is kept so an unreadable
    # file still fails here as before.  The context manager closes the handle
    # instead of leaving it to the garbage collector.
    with open(self.smiName, 'r') as smiF:
        smiF.readlines()

    fparams = FragmentCatalog.FragCatParams(1, 6, self.fName)
    fcat = FragmentCatalog.FragCatalog(fparams)
    fgen = FragmentCatalog.FragCatGenerator()
    suppl = Chem.SmilesMolSupplier(self.smiName, " ", 0, 1, 0)

    smiles = []
    for mol in suppl:
        fgen.AddFragsFromMol(mol, fcat)
        smiles.append(Chem.MolToSmiles(mol))
    assert fcat.GetNumEntries() == 21
    assert fcat.GetFPLength() == 21, fcat.GetFPLength()

    # Round-trip the populated catalog through pickle.
    pkl = cPickle.dumps(fcat)
    fcat2 = cPickle.loads(pkl)
    assert fcat2.GetNumEntries() == 21
    assert fcat2.GetFPLength() == 21, fcat2.GetFPLength()

    # Fingerprints from the two catalogs must agree bit for bit.
    fpgen = FragmentCatalog.FragFPGenerator()
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        fp1 = fpgen.GetFPForMol(mol, fcat)
        fp2 = fpgen.GetFPForMol(mol, fcat2)
        assert fp1.GetNumOnBits() == fp2.GetNumOnBits()
        obl1 = fp1.GetOnBits()
        obl2 = fp2.GetOnBits()
        assert tuple(obl1) == tuple(obl2)
def calculate_fragments(smiles):
    """
    Objective: Break one SMILES string into RDKit fragment-catalog entries.

    Intent: Follows the "Molecular Fragments" recipe from
    https://www.rdkit.org/docs/GettingStartedInPython.html. A new catalog is
    built on every call, and the count returned by AddFragsFromMol is used as
    the number of entries to read back from it. The fragment length bounds
    (0, 4) control how many fragments are produced and still need tuning.

    :param smiles: SMILES string of the molecule to fragment
    :return: list with one entry description per fragment added to the catalog
    """
    group_file = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
    # Lower / upper fragment length; candidates for further tuning.
    params = FragmentCatalog.FragCatParams(0, 4, group_file)
    # The generated fragments are stored as entries of this catalog.
    catalog = FragmentCatalog.FragCatalog(params)
    generator = FragmentCatalog.FragCatGenerator()
    n_added = generator.AddFragsFromMol(MolFromSmiles(smiles), catalog)
    return [catalog.GetEntryDescription(entry) for entry in range(n_added)]
def generate_geneset():
    """Build the GeneSet from a fixed atom list and RDKit fragment objects.

    Returns a ``GeneSet`` constructed from (atoms, fragment-catalog
    parameters, fragment catalog).  The catalog is populated with the
    fragments of butane (``'CCCC'``) before it is handed over.
    """
    groupFile = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
    rdkitFrags = FragmentCatalog.FragCatParams(1, 5, groupFile)
    customFrags = FragmentCatalog.FragCatalog(rdkitFrags)

    # Seed the catalog from a single molecule.
    generator = FragmentCatalog.FragCatGenerator()
    generator.AddFragsFromMol(Chem.MolFromSmiles('CCCC'), customFrags)

    # presumably atomic numbers (C, N, O, F, B, P, S, Cl) - confirm against GeneSet
    return GeneSet([6, 7, 8, 9, 5, 15, 16, 17], rdkitFrags, customFrags)
def test1Catalog(self):
    """A newly created catalog is empty and reports the parameters it was given."""
    params = FragmentCatalog.FragCatParams(1, 6, self.fName, 1.0e-8)
    catalog = FragmentCatalog.FragCatalog(params)

    # Nothing has been added yet.
    assert catalog.GetNumEntries() == 0
    assert catalog.GetFPLength() == 0

    # The parameters read back from the catalog carry the same length bounds.
    echoed = catalog.GetCatalogParams()
    assert echoed.GetLowerFragLength() == 1
    assert echoed.GetUpperFragLength() == 6
def test1Catalog(self):
    """A newly created catalog is empty and reports the parameters it was given."""
    params = FragmentCatalog.FragCatParams(1, 6, self.fName, 1.0e-8)
    catalog = FragmentCatalog.FragCatalog(params)

    # Nothing has been added yet.
    self.assertEqual(catalog.GetNumEntries(), 0)
    self.assertEqual(catalog.GetFPLength(), 0)

    # The parameters read back from the catalog carry the same length bounds.
    echoed = catalog.GetCatalogParams()
    self.assertEqual(echoed.GetLowerFragLength(), 1)
    self.assertEqual(echoed.GetUpperFragLength(), 6)
def test8Issue118(self):
    """Regression test for issue 118.

    With fragment lengths 2-4, ``CCN(C(N)=O)N=O`` must yield exactly one
    catalog bit with a fixed description.
    """
    groupFile = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
    smilesBlock = '\n'.join(['CCN(C(N)=O)N=O'])
    molecules = Chem.SmilesMolSupplierFromText(smilesBlock, ',', 0, -1, 0)

    params = FragmentCatalog.FragCatParams(2, 4, groupFile, 1.0e-8)
    catalog = FragmentCatalog.FragCatalog(params)
    generator = FragmentCatalog.FragCatGenerator()
    for mol in molecules:
        generator.AddFragsFromMol(mol, catalog)

    self.assertEqual(catalog.GetFPLength(), 1)
    self.assertEqual(catalog.GetBitDescription(0), 'CCN(<-C(=O)N>)<-N=O>')
def test6DownEntries(self):
    """Check the down-entry ids of the first two entries of the test catalog."""
    params = FragmentCatalog.FragCatParams(1, 6, self.fName, 1.0e-8)
    catalog = FragmentCatalog.FragCatalog(params)
    generator = FragmentCatalog.FragCatGenerator()

    # Fill the catalog from the SMILES file.
    for mol in Chem.SmilesMolSupplier(self.smiName, " ", 0, 1, 0):
        generator.AddFragsFromMol(mol, catalog)

    assert catalog.GetNumEntries() == 21
    assert catalog.GetFPLength() == 21

    # Reference down ids for entries 0 and 1.
    assert tuple(catalog.GetEntryDownIds(0)) == (2, 8, 9, 16)
    assert tuple(catalog.GetEntryDownIds(1)) == (2, 3, 5, 7)
def BuildCatalog(suppl, maxPts=-1, groupFileName=None, minPath=2, maxPath=6, reportFreq=10):
    """ builds a fragment catalog from the molecules delivered by a supplier

      **Arguments**

        - suppl: a mol supplier

        - maxPts: (optional) if positive, an upper bound on the number of
          molecules taken from the supplier

        - groupFileName: (optional) name of the file containing functional
          group information; defaults to FunctionalGroups.txt in the RDKit
          data directory

        - minPath, maxPath: (optional) the minimum and maximum path lengths
          to be considered

        - reportFreq: (optional) a status message is emitted every
          reportFreq molecules

      **Returns**

        a FragmentCatalog

    """
    if groupFileName is None:
        groupFileName = os.path.join(RDConfig.RDDataDir, "FunctionalGroups.txt")

    params = FragmentCatalog.FragCatParams(minPath, maxPath, groupFileName)
    catalog = FragmentCatalog.FragCatalog(params)
    generator = FragmentCatalog.FragCatGenerator()

    # Number of molecules to process: explicit cap, else the supplier's
    # length when it has one, else -1 meaning "unknown / no cap".
    if maxPts > 0:
        limit = maxPts
    elif hasattr(suppl, '__len__'):
        limit = len(suppl)
    else:
        limit = -1

    for count, mol in enumerate(suppl):
        if count == limit:
            break
        # Progress report before handling molecule number `count`.
        if count and count % reportFreq == 0:
            if limit > -1:
                message('Done %d of %d, %d paths\n' % (count, limit, catalog.GetFPLength()))
            else:
                message('Done %d, %d paths\n' % (count, catalog.GetFPLength()))
        generator.AddFragsFromMol(mol, catalog)
    return catalog
def test5FPsize(self):
    """Check that a 6-6 length catalog of ``C1CCCOC1O`` holds a single bit.

    The one bit must have order 6, a fixed description, and functional
    group ids ``(1,)``.
    """
    # The contents are never used below; the read is kept so an unreadable
    # file still fails here as before.  The context manager closes the handle
    # instead of leaving it to the garbage collector.
    with open(self.smiName, 'r') as smiF:
        smiF.readlines()

    fparams = FragmentCatalog.FragCatParams(6, 6, self.fName)
    fcat = FragmentCatalog.FragCatalog(fparams)
    fgen = FragmentCatalog.FragCatGenerator()
    suppl = [Chem.MolFromSmiles('C1CCCOC1O')]
    for mol in suppl:
        fgen.AddFragsFromMol(mol, fcat)

    assert fcat.GetFPLength() == 1
    for i in range(fcat.GetFPLength()):
        assert fcat.GetBitOrder(i) == 6
        assert fcat.GetBitDescription(i) == "C1CCOC<-O>C1", fcat.GetBitDescription(i)
        assert tuple(fcat.GetBitFuncGroupIds(i)) == (1, )
def test5FPsize(self):
    """Check that a 6-6 length catalog of ``C1CCCOC1O`` holds a single bit.

    The one bit must have order 6, a fixed description, and functional
    group ids ``(1,)``.
    """
    # The file is read but its contents are not used afterwards.
    with open(self.smiName, 'r') as handle:
        handle.readlines()

    params = FragmentCatalog.FragCatParams(6, 6, self.fName)
    catalog = FragmentCatalog.FragCatalog(params)
    generator = FragmentCatalog.FragCatGenerator()
    for mol in [Chem.MolFromSmiles('C1CCCOC1O')]:
        generator.AddFragsFromMol(mol, catalog)

    self.assertEqual(catalog.GetFPLength(), 1)
    for bitIdx in range(catalog.GetFPLength()):
        self.assertEqual(catalog.GetBitOrder(bitIdx), 6)
        self.assertEqual(catalog.GetBitDescription(bitIdx), "C1CC<-O>OCC1")
        self.assertEqual(tuple(catalog.GetBitFuncGroupIds(bitIdx)), (1, ))
def test2Generator(self):
    """Check that entry-based and bit-based catalog accessors agree.

    After filling the catalog from ``self.smiName`` (21 entries expected),
    every entry id must map to the same bit id, and order, description and
    functional group ids must match between the two accessor families.
    """
    params = FragmentCatalog.FragCatParams(1, 6, self.fName, 1.0e-8)
    catalog = FragmentCatalog.FragCatalog(params)
    generator = FragmentCatalog.FragCatGenerator()
    for mol in Chem.SmilesMolSupplier(self.smiName, " ", 0, 1, 0):
        generator.AddFragsFromMol(mol, catalog)

    self.assertEqual(catalog.GetNumEntries(), 21)
    self.assertEqual(catalog.GetFPLength(), 21)

    for entryId in range(catalog.GetNumEntries()):
        self.assertEqual(catalog.GetEntryBitId(entryId), entryId)
        self.assertEqual(catalog.GetEntryOrder(entryId), catalog.GetBitOrder(entryId))
        self.assertEqual(catalog.GetEntryDescription(entryId),
                         catalog.GetBitDescription(entryId))
        self.assertEqual(tuple(catalog.GetEntryFuncGroupIds(entryId)),
                         tuple(catalog.GetBitFuncGroupIds(entryId)))
def ScoreMolecules(suppl, catalog, maxPts=-1, actName='', acts=None, nActs=2, reportFreq=10):
    """ scores the compounds in a supplier using a catalog

      **Arguments**

        - suppl: a mol supplier (must support indexing and reset())

        - catalog: the FragmentCatalog

        - maxPts: (optional) the maximum number of molecules to be
          considered.  NOTE(review): currently accepted but not used by
          the implementation.

        - actName: (optional) the name of the molecule's activity property.
          If this is not provided, the molecule's last property will be used.

        - acts: (optional) a sequence of activity values (integers).
          If not provided, the activities will be read from the molecules.

        - nActs: (optional) number of possible activity values

        - reportFreq: (optional) how often to display status information

      **Returns**

        a 2-tuple:

          1) the results table (a 3D array of ints nBits x 2 x nActs)

          2) a list containing the on bit lists for each molecule
             (an empty list for molecules that failed to load)

    """
    nBits = catalog.GetFPLength()
    # `numpy.int` was only an alias for the builtin `int` and was removed in
    # NumPy 1.24; using `int` directly gives the same dtype on every version.
    resTbl = numpy.zeros((nBits, 2, nActs), int)
    obls = []

    if not actName and not acts:
        # Fall back to the last property of the first molecule.
        actName = suppl[0].GetPropNames()[-1]

    fpgen = FragmentCatalog.FragFPGenerator()
    suppl.reset()
    i = 1  # 1-based molecule counter
    for mol in suppl:
        if i and not i % reportFreq:
            message('Done %d.\n' % (i))
        if mol:
            if not acts:
                act = int(mol.GetProp(actName))
            else:
                act = acts[i - 1]
            fp = fpgen.GetFPForMol(mol, catalog)
            obls.append([x for x in fp.GetOnBits()])
            # Count every bit as "off" for this activity first, then move the
            # bits that are actually on from column 0 to column 1.
            for j in range(nBits):
                resTbl[j, 0, act] += 1
            for id_ in obls[i - 1]:
                # NOTE(review): bit ids are shifted by -1 here, so on-bit 0
                # lands on the last row - confirm this offset is intended.
                resTbl[id_ - 1, 0, act] -= 1
                resTbl[id_ - 1, 1, act] += 1
        else:
            # Keep obls aligned with the supplier for unreadable molecules.
            obls.append([])
        i += 1
    return resTbl, obls
def test7Issue116(self):
    """Regression test for issue 116.

    A 2-2 length catalog built from toluene has two bits; toluene sets
    both, while biphenyl sets only the second.
    """
    smilesBlock = '\n'.join(['Cc1ccccc1'])
    molecules = Chem.SmilesMolSupplierFromText(smilesBlock, ',', 0, -1, 0)

    params = FragmentCatalog.FragCatParams(2, 2, self.fName, 1.0e-8)
    catalog = FragmentCatalog.FragCatalog(params)
    generator = FragmentCatalog.FragCatGenerator()
    for mol in molecules:
        generator.AddFragsFromMol(mol, catalog)
    self.assertEqual(catalog.GetFPLength(), 2)
    self.assertEqual(catalog.GetBitDescription(0), 'ccC')

    fingerprinter = FragmentCatalog.FragFPGenerator()

    # Toluene: both bits on.
    toluene = Chem.MolFromSmiles('Cc1ccccc1')
    fp = fingerprinter.GetFPForMol(toluene, catalog)
    self.assertEqual(fp[0], 1)
    self.assertEqual(fp[1], 1)

    # Biphenyl: bit 0 off, bit 1 on.
    biphenyl = Chem.MolFromSmiles('c1ccccc1-c1ccccc1')
    fp = fingerprinter.GetFPForMol(biphenyl, catalog)
    self.assertEqual(fp[0], 0)
    self.assertEqual(fp[1], 1)
def test0Params(self):
    """Check the basic accessors of a FragCatParams built from ``self.fName``."""
    params = FragmentCatalog.FragCatParams(1, 6, self.fName, 1.0e-8)

    typeString = params.GetTypeString()
    assert typeString == "Fragment Catalog Parameters"
    assert params.GetLowerFragLength() == 1
    assert params.GetUpperFragLength() == 6

    nGroups = params.GetNumFuncGroups()
    assert nGroups == 15
    # Fetch every functional group; the call itself is the check.
    for groupIdx in range(nGroups):
        params.GetFuncGroup(groupIdx)
def fragment_database(self):
    """Create the fragment catalog and the BRICS fragment set for this object.

    Sets ``self.fcat`` to an empty fragment catalog (fragment lengths 1-6)
    and ``self.synthetic_frag_database`` to the de-duplicated list of BRICS
    fragments of every molecule listed in ``data/smiles_database.csv``
    (second comma-separated column, header row skipped), which is looked up
    relative to ``self.directory``.
    """
    # NOTE(review): hard-coded, machine-specific location of the RDKit data
    # file; consider deriving it from the installed RDKit instead.
    fName = 'C:/RDKit_2017_03_2/Data/FunctionalGroups.txt'
    fparams = FragmentCatalog.FragCatParams(1, 6, fName)
    self.fcat = FragmentCatalog.FragCatalog(fparams)

    ## macrocycle_file = 'macrocycles_IDs.csv'
    ## suppl = [i.split(',')[0] for i in open(self.directory+name,'r').read().splitlines()][1:] # read all the macrocycle smiles from file
    ## ms = [Chem.MolFromSmiles(i) for i in suppl] # mols of macrocycles

    zinc_file = 'data/smiles_database.csv'
    # Read inside a context manager so the file handle is closed promptly.
    with open(self.directory + zinc_file, 'r') as handle:
        rows = handle.read().splitlines()
    # Drop the header row first, then take the SMILES column.
    zinc_suppl = [row.split(',')[1] for row in rows[1:]]
    # NOTE(review): MolFromSmiles results are passed on unchecked - confirm
    # the input file only contains parseable SMILES.
    zinc_ms = [Chem.MolFromSmiles(smi) for smi in zinc_suppl]

    pre_synthetic_frag_database = [BRICS.BRICSDecompose(mol) for mol in zinc_ms]
    # Flatten the per-molecule fragment collections and remove duplicates.
    self.synthetic_frag_database = list(
        set(chain.from_iterable(pre_synthetic_frag_database)))
def test0Params(self):
    """Check the basic accessors of a FragCatParams built from ``self.fName``."""
    params = FragmentCatalog.FragCatParams(1, 6, self.fName, 1.0e-8)

    typeString = params.GetTypeString()
    self.assertEqual(typeString, "Fragment Catalog Parameters")
    self.assertEqual(params.GetLowerFragLength(), 1)
    self.assertEqual(params.GetUpperFragLength(), 6)

    nGroups = params.GetNumFuncGroups()
    self.assertEqual(nGroups, 15)
    # Fetch every functional group; the call itself is the check.
    for groupIdx in range(nGroups):
        params.GetFuncGroup(groupIdx)
def CalcGains(suppl, catalog, topN=-1, actName='', acts=None,
              nActs=2, reportFreq=10, biasList=None, collectFps=0):
    """ ranks the catalog's bits with an InfoBitRanker over the supplied molecules

      **Arguments**

        - suppl: a mol supplier

        - catalog: the FragmentCatalog used to fingerprint each molecule

        - topN: (optional) number of bits to request from the ranker;
          negative values mean "all bits in the catalog"

        - actName: (optional) name of the activity property; if neither this
          nor acts is given, the last property of the first molecule is used

        - acts: (optional) a sequence of activity values, indexed by
          molecule position

        - nActs: (optional) number of possible activity values

        - reportFreq: (optional) a status message is emitted every
          reportFreq molecules

        - biasList: (optional) if provided, a BIASENTROPY ranker is used and
          given this bias list; otherwise an ENTROPY ranker is used

        - collectFps: (optional) if true, the fingerprints are kept

      **Returns**

        a 2-tuple:

          1) the result of the ranker's GetTopN(topN)

          2) list of fingerprints (empty unless collectFps is set)

    """
    nBits = catalog.GetFPLength()
    if topN < 0:
        topN = nBits
    if not actName and not acts:
        actName = suppl[0].GetPropNames()[-1]

    nMols = len(suppl) if hasattr(suppl, '__len__') else -1
    fpgen = FragmentCatalog.FragFPGenerator()

    if biasList:
        ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.BIASENTROPY)
        ranker.SetBiasList(biasList)
    else:
        ranker = InfoTheory.InfoBitRanker(nBits, nActs, InfoTheory.InfoType.ENTROPY)

    fps = []
    for molIdx, mol in enumerate(suppl):
        # Activity comes from the explicit list when given, else from the molecule.
        if acts:
            act = acts[molIdx]
        else:
            try:
                act = int(mol.GetProp(actName))
            except KeyError:
                message('ERROR: Molecule has no property: %s\n' % (actName))
                message('\tAvailable properties are: %s\n' % (str(mol.GetPropNames())))
                raise KeyError(actName)

        if molIdx and molIdx % reportFreq == 0:
            if nMols > 0:
                message('Done %d of %d.\n' % (molIdx, nMols))
            else:
                message('Done %d.\n' % (molIdx))

        fp = fpgen.GetFPForMol(mol, catalog)
        ranker.AccumulateVotes(fp, act)
        if collectFps:
            fps.append(fp)

    return ranker.GetTopN(topN), fps
def _test5MoreComplex(self):
    """Check that each molecule's fingerprint sets the bits it added to the catalog.

    The ids in the range ``[entries before, entries after)`` recorded while
    adding a molecule from ``self.smiList`` must all be set in that
    molecule's fingerprint.
    """
    suppl = Chem.SmilesMolSupplierFromText('\n'.join(self.smiList), ',', 0, -1, 0)

    # First pass: record which id range each molecule contributed.
    addedBits = {}
    nextIdx = 0
    for molIdx, mol in enumerate(suppl):
        nAdded = self.fgen.AddFragsFromMol(mol, self.fragCat)
        addedBits[molIdx] = range(nextIdx, nextIdx + nAdded)
        nextIdx += nAdded

    # Second pass: those bits must be contained in the signatures.
    fingerprinter = FragmentCatalog.FragFPGenerator()
    for molIdx, mol in enumerate(suppl):
        fp = fingerprinter.GetFPForMol(mol, self.fragCat)
        for bit in addedBits[molIdx]:
            assert fp[bit], '%s: %s' % (Chem.MolToSmiles(mol), str(bit))
def test9Issue116(self):
    """Regression test for issue 116 through ``BuildCatalog``.

    A 2-2 path catalog built from toluene has two bits; toluene sets both,
    while biphenyl sets only the second.
    """
    smilesBlock = '\n'.join(['Cc1ccccc1'])
    molecules = Chem.SmilesMolSupplierFromText(smilesBlock, ',', 0, -1, 0)
    catalog = BuildFragmentCatalog.BuildCatalog(molecules, minPath=2, maxPath=2)
    assert catalog.GetFPLength() == 2
    assert catalog.GetBitDescription(0) == 'ccC'

    fingerprinter = FragmentCatalog.FragFPGenerator()

    # Toluene: both bits on.
    toluene = Chem.MolFromSmiles('Cc1ccccc1')
    fp = fingerprinter.GetFPForMol(toluene, catalog)
    assert fp[0]
    assert fp[1]

    # Biphenyl: bit 0 off, bit 1 on.
    biphenyl = Chem.MolFromSmiles('c1ccccc1-c1ccccc1')
    fp = fingerprinter.GetFPForMol(biphenyl, catalog)
    assert not fp[0]
    assert fp[1]
def _testBits(self, fragCat):
    """Compare fingerprints of ``self.smiList2`` against the reference bit data.

    For each molecule that has reference data, the number of on bits must
    match the hard-coded count and the on-bit ids must match
    ``self.list2Obls``.

    - fragCat: the fragment catalog used to generate the fingerprints
    """
    expectedCounts = [3, 2, 3, 3, 2, 3, 5, 5, 5, 4, 5, 6]
    expectedBits = self.list2Obls
    fingerprinter = FragmentCatalog.FragFPGenerator()
    suppl = Chem.SmilesMolSupplierFromText('\n'.join(self.smiList2), ',', 0, -1, 0)

    for molIdx, mol in enumerate(suppl):
        fp = fingerprinter.GetFPForMol(mol, fragCat)
        if molIdx < len(expectedCounts):
            # smi is only refreshed here; it is reused in the message below.
            smi = Chem.MolToSmiles(mol)
            assert fp.GetNumOnBits() == expectedCounts[molIdx], '%s: %s' % (
                smi, str(fp.GetOnBits()))
        onBits = fp.GetOnBits()
        if molIdx < len(expectedBits):
            assert tuple(onBits) == expectedBits[molIdx], '%s: %s' % (smi, onBits)
def generate_geneset():
    """
    Build the GeneSet of atoms and fragments used by the engine.

    The atom list and the seed molecule are hardcoded here.

    Parameters
    ----------
    None

    Returns
    ----------
    GeneSet : object
        instance of the GeneSet class constructed from the atom list, the
        RDKit fragment-catalog parameters, and a fragment catalog populated
        from the molecule 'CCCC'
    """
    group_file = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
    frag_params = FragmentCatalog.FragCatParams(1, 5, group_file)
    frag_catalog = FragmentCatalog.FragCatalog(frag_params)

    # Populate the catalog with the fragments of a single seed molecule.
    generator = FragmentCatalog.FragCatGenerator()
    seed_mol = Chem.MolFromSmiles('CCCC')
    generator.AddFragsFromMol(seed_mol, frag_catalog)

    # presumably atomic numbers (C, N, O, F, B, P, S, Cl) - confirm against GeneSet
    atom_list = [6, 7, 8, 9, 5, 15, 16, 17]
    return GeneSet(atom_list, frag_params, frag_catalog)
import os

import rdkit
from rdkit import Chem
from rdkit import RDConfig
from rdkit.Chem import FragmentCatalog

# Load the training set: each data row of train.csv is expected to have
# exactly three comma-separated fields, with the SMILES in the second and
# an integer label in the third.
labels = []
all_molecules = []
with open('train.csv') as f:
    f.readline()  # skip the header row
    for line in f:
        # Lines read from a file keep their newline, so comparing the raw
        # line with '' never detects a blank line; strip before testing.
        line = line.strip()
        if not line:
            continue
        fields = line.split(',')
        if len(fields) != 3:
            raise ValueError('expected 3 comma-separated fields, got %d: %r'
                             % (len(fields), line))
        smiles = fields[1]
        labels.append(int(fields[2]))
        # NOTE(review): MolFromSmiles results are stored unchecked - confirm
        # downstream code copes with SMILES that fail to parse.
        mol = Chem.MolFromSmiles(smiles)
        all_molecules.append(mol)

# Functional-group definitions shipped with RDKit (previously pasted here as
# interactive-prompt lines, which are not valid in a script).
fName = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
fparams = FragmentCatalog.FragCatParams(1, 6, fName)
num_func_groups = fparams.GetNumFuncGroups()
#!/usr/bin/env python
import os
from rdkit import Chem
from rdkit import RDConfig
from rdkit.Chem import FragmentCatalog

# Catalog parameters: fragment lengths 1-6, functional groups from RDKit's data dir.
fName = os.path.join(RDConfig.RDDataDir, 'FunctionalGroups.txt')
fparams = FragmentCatalog.FragCatParams(1, 6, fName)
print('found %d functional groups in catalog' % (fparams.GetNumFuncGroups()))

fcat = FragmentCatalog.FragCatalog(fparams)
fcgen = FragmentCatalog.FragCatGenerator()

# Fragment a single example molecule.
smiles = 'OCC=CC(=O)O'
m = Chem.MolFromSmiles(smiles)
print('examining molecule: ' + smiles)

frag_count = fcgen.AddFragsFromMol(m, fcat)
print('identified %d fragments' % (frag_count))

# Print the description of every entry that was added.
for entry_idx in range(frag_count):
    print(fcat.GetEntryDescription(entry_idx))