def __testBuildSearchIndexCacheFiles(self, **kwargs):
    """Construct a chemical component search index provider and check its cache.

    Args:
        **kwargs: optional settings. The keys read here are molLimit, useCache,
            logSizes, limitPerceptions, numProc, maxChunkSize, ccFileNamePrefix,
            quietFlag, ccUrlTarget and birdUrlTarget.

    Returns:
        ChemCompSearchIndexProvider: the provider instance whose cache was tested
    """
    getOpt = kwargs.get
    molLimit = getOpt("molLimit", None)
    # Collect the provider configuration in one place; the cache location always
    # comes from the enclosing object rather than from the keyword options.
    providerArgs = {
        "ccUrlTarget": getOpt("ccUrlTarget", None),
        "birdUrlTarget": getOpt("birdUrlTarget", None),
        "cachePath": self.__cachePath,
        "useCache": getOpt("useCache", False),
        "molLimit": molLimit,
        "ccFileNamePrefix": getOpt("ccFileNamePrefix", "cc"),
        "limitPerceptions": getOpt("limitPerceptions", False),
        "numProc": getOpt("numProc", 1),
        "maxChunkSize": getOpt("maxChunkSize", 5),
        "quietFlag": getOpt("quietFlag", True),
    }
    ccsiP = ChemCompSearchIndexProvider(**providerArgs)
    # molLimit doubles as the minimum count passed to the cache test
    cacheOk = ccsiP.testCache(minCount=molLimit, logSizes=getOpt("logSizes", False))
    self.assertTrue(cacheOk)
    logger.info(" ******* Completed operation ******** ")
    return ccsiP
def testSubStructureSearchFromIndexSelected(self):
    """Run index-prefiltered substructure searches for a fixed set of component ids.

    For each of "BNZ" and "ALA" the query molecule is fetched from the OE search
    molecule provider, the search index is prefiltered, and the substructure
    search must succeed and include the query component itself in its results.
    """
    matchOpts = self.__myKwargs.get("matchOpts", "sub-struct-graph-relaxed")
    numProc = self.__numProcSearch
    #
    molProvider = OeSearchMoleculeProvider(**self.__myKwargs)
    self.assertTrue(molProvider.testCache())
    searchUtils = OeSubStructSearchUtils(molProvider)
    #
    indexProvider = ChemCompSearchIndexProvider(**self.__myKwargs)
    self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
    searchIndex = indexProvider.getIndex()
    # Fetch the molecule for the first index key (the result is not used further)
    molProvider.getMol(next(iter(searchIndex)))
    #
    for ccId in ("BNZ", "ALA"):
        # A single timer covers both the prefilter and the search log lines
        began = time.time()
        queryMol = molProvider.getMol(ccId)
        #
        candidateIdL = searchUtils.prefilterIndex(queryMol, indexProvider, matchOpts=matchOpts)
        logger.info("%s search length %d in (%.4f seconds)", ccId, len(candidateIdL), time.time() - began)
        #
        status, matchL = searchUtils.searchSubStructure(queryMol, ccIdList=candidateIdL, matchOpts=matchOpts, numProc=numProc)
        logger.info("%s status %r result length %d in (%.4f seconds)", ccId, status, len(matchL), time.time() - began)
        self.assertTrue(status)
        self.assertTrue(self.__resultContains(ccId, matchL))
def __buildChemCompSearchIndex(self, numProc, **kwargs):
    """Create a chemical component search index provider and test its cache.

    Args:
        numProc (int): number of processes handed to the provider
        **kwargs: optional settings. The keys read here are cachePath (defaults to
            this object's cache path), molLimit, rebuildChemIndices (True disables
            cache reuse), logSizes, limitPerceptions, chunkSize, ccFileNamePrefix,
            quietFlag, ccUrlTarget and birdUrlTarget.

    Returns:
        (bool, obj): cache test status and the provider instance, or None in place
            of the provider when the status is not truthy. (False, None) is
            returned if any exception is raised.
    """
    try:
        opt = kwargs.get
        molLimit = opt("molLimit", None)
        ccsiP = ChemCompSearchIndexProvider(
            ccUrlTarget=opt("ccUrlTarget", None),
            birdUrlTarget=opt("birdUrlTarget", None),
            cachePath=opt("cachePath", self.__cachePath),
            # A rebuild request is expressed as "do not use the cache"
            useCache=not opt("rebuildChemIndices", False),
            molLimit=molLimit,
            ccFileNamePrefix=opt("ccFileNamePrefix", "cc-full"),
            limitPerceptions=opt("limitPerceptions", False),
            numProc=numProc,
            maxChunkSize=opt("chunkSize", 5),
            quietFlag=opt("quietFlag", True),
        )
        status = ccsiP.testCache(minCount=molLimit, logSizes=opt("logSizes", False))
        if status:
            return status, ccsiP
        return status, None
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return False, None
def updateSearchIndex(self, useCache=False):
    """Rebuild or reload the search index and refresh this object's copy of it.

    The provider is configured from a deep copy of the "ccsiKwargs" entry of the
    instance configuration, so the stored configuration is never modified. When
    that entry is absent or empty nothing is done and False is returned.

    Resource note carried over from the original documentation: 771 secs with
    6 proc on a macbook pro, 7GB memory.

    Args:
        useCache (bool): False to rebuild search index and True to reload

    Returns:
        bool: True for success or false otherwise
    """
    status = False
    try:
        if "ccsiKwargs" not in self.__configD:
            return status
        providerArgs = copy.deepcopy(self.__configD["ccsiKwargs"])
        if not providerArgs:
            return status
        providerArgs["useCache"] = useCache
        provider = ChemCompSearchIndexProvider(**providerArgs)
        status = provider.testCache()
        self.__siIdxP = provider if provider else None
        # Only expose the index when the cache test succeeded
        if provider and status:
            self.__siIdx = provider.getIndex()
        else:
            self.__siIdx = {}
        indexLength = len(self.__siIdx) if self.__siIdx else 0
        logger.info("Search index status %r index len %d", status, indexLength)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return status
class OeSubStructSearchCompareTests(unittest.TestCase):
    """Substructure and match search tests with optional PDF alignment output.

    The search tests are skipped unless the class attribute ``useFull`` is True.
    """

    # True selects the "cc-full"/"oe-full" cache files; False selects the abbreviated test data
    useFull = False

    def setUp(self):
        """Configure paths and options, then load and verify every provider cache."""
        self.__workPath = os.path.join(HERE, "test-output")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__ccUrlTarget = os.path.join(self.__dataPath, "components-abbrev.cif")
        self.__birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif")
        self.__doDisplay = True
        self.__numProcPrep = 6
        self.__numProcSearch = 6
        self.__minCount = None
        self.__startTime = time.time()
        #
        # Options shared by both data set modes
        commonKwargs = {
            "cachePath": self.__cachePath,
            "useCache": True,
            "molBuildType": "model-xyz",
            "limitPerceptions": False,
            "screenTypeList": None,
            "numProc": self.__numProcPrep,
            "suppressHydrogens": True,
            "matchOpts": "sub-struct-graph-relaxed",
            "fpTypeCuttoffD": {"TREE": 0.6, "MACCS": 0.9},
            "maxFpResults": 50,
        }
        if OeSubStructSearchCompareTests.useFull:
            # Full mode gives no source targets - only the file name prefixes differ
            modeKwargs = {"ccFileNamePrefix": "cc-full", "oeFileNamePrefix": "oe-full"}
        else:
            modeKwargs = {
                "ccUrlTarget": self.__ccUrlTarget,
                "birdUrlTarget": self.__birdUrlTarget,
                "ccFileNamePrefix": "cc-abbrev",
                "oeFileNamePrefix": "oe-abbrev",
            }
        self.__myKwargs = dict(commonKwargs, **modeKwargs)
        #
        self.__oesmP = OeSearchMoleculeProvider(**self.__myKwargs)
        self.assertTrue(self.__oesmP.testCache())
        #
        self.__ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
        self.__ccmP.testCache()
        #
        self.__ccsidxP = ChemCompSearchIndexProvider(**self.__myKwargs)
        self.assertTrue(self.__ccsidxP.testCache(minCount=self.__minCount))
        self.__oessU = OeSubStructSearchUtils(self.__oesmP)
        self.assertTrue(self.__oessU.testCache())
        #
        fpTypeList = list(self.__myKwargs.get("fpTypeCuttoffD", {}))
        self.__oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
        self.assertTrue(self.__oesU.testCache())
        #
        logger.debug("Running tests on version %s", __version__)
        logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log the elapsed time for the completed test."""
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchDescriptor(self):
        """Strict substructure search for a SMILES descriptor query."""
        self.__runQuery("n1ccccc1", "query-smiles", "oe-iso-smiles", ["sub-struct-graph-strict"])

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchSelected(self):
        """Substructure search with each match option for a selected component."""
        self.__runQuery("STI", "STI", "CC", ["sub-struct-graph-relaxed", "sub-struct-graph-relaxed-stereo", "sub-struct-graph-strict"])

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchAll(self):
        """Substructure search with each match option for every loaded component."""
        ccD = self.__ccmP.getMolD()
        for ccId in ccD:
            if ccId in ["UNX", "UNL", "UNK", "DUM"]:
                continue
            self.__runQuery(
                ccId,
                ccId,
                "CC",
                ["sub-struct-graph-relaxed", "sub-struct-graph-relaxed-stereo", "sub-struct-graph-strict"],
                defaultNumProc=5,
                minAtoms=3,
            )

    @unittest.skipIf(not useFull, "Requires full data set")
    def testMatchSearchSelected(self):
        """Fingerprint and graph match searches for a selected component."""
        self.__runQuery("STI", "STI", "CC", ["fingerprint-similarity", "graph-relaxed", "graph-relaxed-stereo", "graph-strict"])

    def __runQuery(self, query, queryId, queryType, matchOptsList, defaultNumProc=4, minAtoms=0):
        """Search with each match option, assert success and optionally depict results.

        Args:
            query (str): component identifier or descriptor string
            queryId (str): identifier used for titles, logging and output file names
            queryType (str): "CC" for a component identifier, otherwise a descriptor type
            matchOptsList (list): match option names to run in order
            defaultNumProc (int, optional): process count used when "numProc" is not configured
            minAtoms (int, optional): when nonzero, skip query molecules with fewer atoms
        """
        limitPerceptions = self.__myKwargs.get("limitPerceptions", False)
        suppressHydrogens = self.__myKwargs.get("suppressHydrogens", True)
        numProc = self.__myKwargs.get("numProc", defaultNumProc)
        for matchOpts in matchOptsList:
            oeMol = self.__getMol(query, queryType, queryId, limitPerceptions=limitPerceptions, suppressHydrogens=suppressHydrogens)
            if minAtoms and oeMol.NumAtoms() < minAtoms:
                continue
            #
            startTime = time.time()
            retStatus, mL = self.__search(oeMol, matchOpts, numProc)
            logger.info("%s status (%r) matchOpts %s result %d in (%.4f seconds)", queryId, retStatus, matchOpts, len(mL), time.time() - startTime)
            self.assertTrue(retStatus)
            # Only a component query can be expected to find itself in the results
            if queryType == "CC":
                self.assertTrue(self.__resultContains(queryId, mL))
            #
            if self.__doDisplay:
                self.__display(mL, query, queryId, queryType, matchOpts)

    def __search(self, oeMol, matchOpts, numProc):
        """Dispatch to the substructure or match search and return (status, resultList)."""
        if matchOpts.startswith("sub-struct-"):
            return self.__subStructureSearch(oeMol, matchOpts=matchOpts, numProc=numProc)
        retStatus, mL, fpL = self.__matchSearch(oeMol, matchOpts=matchOpts)
        # Fingerprint similarity reports the fingerprint list; all other modes report the graph matches
        return retStatus, (fpL if matchOpts == "fingerprint-similarity" else mL)

    def __subStructureSearch(self, oeMol, matchOpts, numProc):
        """Prefilter the search index then run the substructure search on the survivors."""
        ccIdL = self.__oessU.prefilterIndex(oeMol, self.__ccsidxP, matchOpts=matchOpts, skipFeatures=False)
        retStatus, mL = self.__oessU.searchSubStructure(oeMol, ccIdList=ccIdL, matchOpts=matchOpts, numProc=numProc)
        return retStatus, mL

    def __matchSearch(self, oeMol, matchOpts="graph-relaxed"):
        """Run the combined substructure and fingerprint search.

        Returns:
            (bool, list, list): status, substructure matches and fingerprint matches.
                (False, [], []) is returned when the search raises.
        """
        # Initialise every returned name up front so that a failure inside the try
        # block yields a clean failure status instead of an UnboundLocalError.
        retStatus = False
        ssL, fpL = [], []
        try:
            fpTypeCuttoffD = self.__myKwargs.get("fpTypeCuttoffD", {})
            maxFpResults = self.__myKwargs.get("maxFpResults", 50)
            # At most the first two configured fingerprint types are passed on
            retStatus, ssL, fpL = self.__oesU.searchSubStructureAndFingerPrint(oeMol, list(fpTypeCuttoffD.items())[:2], maxFpResults, matchOpts=matchOpts)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return retStatus, ssL, fpL

    def __getMol(self, query, queryType, queryId, limitPerceptions=False, suppressHydrogens=True):
        """Return the query OE molecule, from the provider for "CC" or built from a descriptor."""
        oeioU = OeIoUtils()
        if queryType == "CC":
            oeMol = self.__oesmP.getMol(query)
        else:
            oeMol = oeioU.descriptorToMol(query, queryType, limitPerceptions=limitPerceptions, messageTag=queryId)
        #
        if suppressHydrogens:
            oeMol = oeioU.suppressHydrogens(oeMol)
        oeMol.SetTitle(queryId)
        return oeMol

    def __resultContains(self, ccId, matchResultList):
        """Return True if any match result carries exactly the input identifier."""
        return any(matchResult.ccId == ccId for matchResult in matchResultList)

    def __display(self, mL, query, queryId, queryType, matchOpts):
        """Reduce the results to one entry per parent component and write the PDF depiction."""
        smL = sorted(mL, key=lambda kv: kv.fpScore, reverse=True)
        # Group results by the identifier text preceding any "|" separator
        tD = {}
        for sm in smL:
            tD.setdefault(sm.ccId.split("|")[0], []).append(sm)
        # Prefer the entry named exactly as the parent, otherwise the best scoring entry
        dL = [next((tt for tt in ttL if tt.ccId == ccId), ttL[0]) for ccId, ttL in tD.items()]
        #
        pdfImagePath = os.path.join(self.__workPath, queryId + "-" + matchOpts + ".pdf")
        self.__displayPaginatedAlignments(pdfImagePath, query, queryType, queryId, dL, matchOpts=matchOpts)

    def __displayPaginatedAlignments(self, pdfImagePath, query, queryType, queryId, matchResultList, matchOpts="relaxed-stereo", alignMode="SS"):
        """Pair the query molecule with each match (best score first) and depict the pairs."""
        refId = queryId
        oeMolRef = self.__getMol(query, queryType, queryId, limitPerceptions=False, suppressHydrogens=True)
        pairList = []
        for mr in sorted(matchResultList, key=lambda kv: kv.fpScore, reverse=True):
            fitId = mr.ccId.split("|")[0]
            # Variant entries carry a "|" suffix. Testing the separator rather than the
            # identifier length keeps long parent identifiers from being labelled as variants.
            if "|" in mr.ccId:
                fitId = fitId + " (tautomer/protomer)"
            oeMolFit = self.__oesmP.getMol(mr.ccId)
            pairList.append((refId, oeMolRef, fitId, oeMolFit))
        #
        self.__depictFitList(pdfImagePath, pairList, matchOpts=matchOpts, alignMode=alignMode)

    def __pairDepictPage(self, imagePath, refId, refTitle, refMol, fitId, fitTitle, fitMol, matchOpts="strict"):
        """Depict pairwise alignment of the input reference and fit molecules.

        Args:
            imagePath (str): path to image (format by path extension)
            refId (str): reference molecule identifier
            refTitle (str): reference molecule title
            refMol (obj): reference OE molecule object
            fitId (str): fit molecule identifier
            fitTitle (str): fit molecule title
            fitMol (obj): fit OE molecule object
            matchOpts (str, optional): alignment criteria (relaxed|relaxed-stereo|strict). Defaults to "strict".

        Returns:
            (list): atom mapping in all aligned figures
                [(reference component Id, reference atom name, fit chemical component Id, fit atom name)]
        """
        aML = []
        try:
            oed = OeDepictMCSAlignPage()
            oed.setSearchType(sType=matchOpts)
            oed.setRefMol(refMol, refId, title=refTitle)
            oed.setFitMol(fitMol, fitId, title=fitTitle)
            oed.setDisplayOptions(
                imageSizeX=2000,
                imageSizeY=1000,
                labelAtomName=True,
                labelAtomCIPStereo=True,
                labelAtomIndex=False,
                labelBondIndex=False,
                highlightStyleFit="ballAndStickInverse",
                bondDisplayWidth=0.5,
                highLightMatchColorRef="green",
                highLightNotMatchColorRef="pink",
            )
            aML = oed.alignPair(imagePath=imagePath)
            if aML:
                for (rCC, rAt, tCC, tAt) in aML:
                    logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return aML

    def __depictFitList(self, pdfImagePath, pairList, matchOpts="exact", alignMode="SS"):
        """Depict pairwise alignments with multi-page layout in PDF format.

        Args:
            pdfImagePath (str): PDF image path
            pairList (list): [(refId, refOeMol, fitId, fitOeMol)]
            matchOpts (str, optional): search type passed to the depiction object. Defaults to "exact".
            alignMode (str, optional): "MCSS" selects the MCS multi-page depiction; any other
                value selects the substructure multi-page depiction. Defaults to "SS".

        Returns:
            (list): atom mapping in all aligned figures
                [(reference component Id, reference atom name, fit chemical component Id, fit atom name)]
        """
        aML = []
        try:
            if alignMode == "MCSS":
                oed = OeDepictMCSAlignMultiPage()
            else:
                oed = OeDepictSubStructureAlignMultiPage()
            oed.setSearchType(sType=matchOpts)
            oed.setPairMolList(pairList)
            oed.setDisplayOptions(
                labelAtomName=True,
                labelAtomCIPStereo=True,
                labelAtomIndex=False,
                labelBondIndex=False,
                highlightStyleFit="ballAndStickInverse",
                pageOrientation="portrait",
                gridRows=4,
                bondDisplayWidth=0.5,
                highLightMatchColorRef="green",
                highLightNotMatchColorRef="pink",
            )
            aML = oed.alignPairListMulti(imagePath=pdfImagePath)
            if aML:
                for (rCC, rAt, tCC, tAt) in aML:
                    logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return aML
class CODModelBuild(object):
    """Build COD-derived models for chemical components and maintain the model index."""

    def __init__(self, cachePath, prefix=None, **kwargs):
        """Load the molecule, search index and OE molecule providers from the cache.

        Args:
            cachePath (str): top-level cache directory
            prefix (str, optional): qualifier inserted in cache file and directory names
            **kwargs: optional settings. The keys read here are timeOut, ccUrlTarget,
                birdUrlTarget, molLimit, quietFlag, fpTypeList, screenTypeList,
                limitPerceptions and numProc.
        """
        self.__cachePath = cachePath
        self.__prefix = prefix
        startTime = time.time()
        useCache = True
        self.__timeOut = kwargs.get("timeOut", None)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc"
        oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe"
        #
        self.__startTime = time.time()
        self.__ccmP = ChemCompMoleculeProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=cachePath,
            useCache=useCache,
            ccFileNamePrefix=ccFileNamePrefix,
        )
        ok1 = self.__ccmP.testCache()
        self.__ccSIdxP = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=useCache, ccFileNamePrefix=ccFileNamePrefix)
        ok2 = self.__ccSIdxP.testCache()
        molLimit = kwargs.get("molLimit", None)
        quietFlag = kwargs.get("quietFlag", True)
        fpTypeList = kwargs.get("fpTypeList", [])
        screenTypeList = kwargs.get("screenTypeList", [])
        limitPerceptions = kwargs.get("limitPerceptions", False)
        numProc = kwargs.get("numProc", 4)
        self.__oesmP = OeSearchMoleculeProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            ccFileNamePrefix=ccFileNamePrefix,
            oeFileNamePrefix=oeFileNamePrefix,
            useCache=useCache,
            quietFlag=quietFlag,
            fpTypeList=fpTypeList,
            screenTypeList=screenTypeList,
            numProc=numProc,
            molLimit=molLimit,
            limitPerceptions=limitPerceptions,
        )
        ok3 = self.__oesmP.testCache()
        self.__oesmP.getOeMolD()
        # Logical (not bitwise) conjunction so that a non-boolean status cannot raise here
        logger.info("Completed chemical component search index load %r (%.4f seconds)", bool(ok1 and ok2 and ok3), time.time() - startTime)
        logUsage("main", "Setup completed", self.__startTime)

    def getModelDirFilePath(self):
        """Return the directory path holding the model files."""
        dN = "cod-%s-model-files" % self.__prefix if self.__prefix else "cod-model-files"
        return os.path.join(self.__cachePath, dN)

    def getModelImageDirFilePath(self):
        """Return the directory path holding the model images.

        The prefixed name ends in "-image" while the default ends in "-images";
        both spellings are kept so that existing directories continue to resolve.
        """
        dN = "cod-%s-model-image" % self.__prefix if self.__prefix else "cod-model-images"
        return os.path.join(self.__cachePath, dN)

    def __getModelIndexPath(self):
        """Return the path of the combined model index file."""
        return os.path.join(self.getModelDirFilePath(), "cod-model-index.json")

    def fetchModelIndex(self):
        """Return the stored model index, or an empty dictionary if the import raises."""
        mD = {}
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            fp = self.__getModelIndexPath()
            mD = mU.doImport(fp, fmt="json")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mD

    def storeModelIndex(self, mD):
        """Write the model index as JSON and return the export status (False on exception)."""
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            fp = self.__getModelIndexPath()
            ok = mU.doExport(fp, mD, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def build(self, alignType="relaxed-stereo", numProc=4, chunkSize=10, verbose=False, doFigures=True):
        """Run the model build step in the chemical component model workflow.

        Args:
            alignType (str): "relaxed"|"strict"|"relaxed-stereo". Default: relaxed-stereo
            numProc (int, optional): number of processes to invoke. Defaults to 4.
            chunkSize (int, optional): work chunksize. Defaults to 10.
            verbose (bool, optional): verbose logging. Defaults to False.
            doFigures (bool, optional): flag passed to the build worker options. Defaults to True.

        Returns:
            (dict): {searchId: [{"targetId": , "modelId": , "modelPath": ,"matchId": , "parentId": , "rFactor": , }]
        """
        retD = {}
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            ccms = CODModelSearch(self.__cachePath, prefix=self.__prefix)
            modelDirPath = self.getModelDirFilePath()
            imageDirPath = self.getModelImageDirFilePath()
            #
            resultIndexD = ccms.getResultIndex()
            # Make parent index - merge the results of every variant under its parent identifier
            idxIdD = {}
            for idxId, iDL in resultIndexD.items():
                idxIdD.setdefault(idxId.split("|")[0], []).extend(iDL)
            #
            idxIdL = list(idxIdD.keys())
            # Select parents lacking a model index or holding only a trivially small
            # (<= 10 byte) one; parents with a populated index are not rebuilt.
            midxIdL = []
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                if mU.exists(fp) and os.stat(fp).st_size > 10:
                    continue
                midxIdL.append(pId)
            #
            logger.info("Starting COD model build using (%d) from a total of results length (%d)", len(midxIdL), len(idxIdD))
            #
            cmbw = CODModelBuildWorker(self.__cachePath, verbose=verbose, timeOut=self.__timeOut)
            mpu = MultiProcUtil(verbose=True)
            mpu.setWorkingDir(modelDirPath)
            mpu.setOptions(
                optionsD={
                    "modelDirPath": modelDirPath,
                    "imageDirPath": imageDirPath,
                    "alignType": alignType,
                    "ccSIdxP": self.__ccSIdxP,
                    "idxIdD": idxIdD,
                    "oesmP": self.__oesmP,
                    "ccmP": self.__ccmP,
                    "doFigures": doFigures,
                }
            )
            #
            mpu.set(workerObj=cmbw, workerMethod="build")
            ok, failList, resultList, _ = mpu.runMulti(dataList=midxIdL, numProc=numProc, numResults=1, chunkSize=chunkSize)
            # Tolerate a missing result slot so that the full index below is still rebuilt
            successList = copy.copy(resultList[0]) if resultList else []
            logger.info("Run ended with status %r success count %d failures %r", ok, len(successList), len(failList))
            #
            if successList:
                logger.info("Completed build with %d models ", len(successList))
            else:
                logger.info("No models built")
            #
            # Build full index -
            #
            logger.info("Building full model index")
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                if not mU.exists(fp):
                    continue
                # An unreadable or empty per-parent index contributes nothing rather than aborting the merge
                for modelD in mU.doImport(fp, fmt="json") or []:
                    retD.setdefault(modelD["parentId"], []).append(modelD)
            #
            retD = dict(sorted(retD.items()))
            logger.info("Storing models for %d parent components", len(retD))
            if not self.storeModelIndex(retD):
                logger.warning("Model index could not be stored for %d parent components", len(retD))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retD
def __reload(self, **kwargs):
    """Create any missing OE molecule cache artifacts for the chemical component search index.

    Artifacts are produced in sequence: the OE binary search molecule file, the
    indexed OE molecule database and, when screen types are configured, the
    screened substructure search databases. An artifact is regenerated when
    useCache is False or when its file does not exist.

    Args:
        useCache (bool, optional): flag to use cached files. Defaults to True.
        cachePath (str): path to the top cache directory. Defaults to '.'.
        numProc (int): number of processors used for substructure screen database generation. Defaults to 2.
        molLimit (int): passed to the search index provider and used as its minimum count. Defaults to None.
        fpTypeList (list): fingerprint types. Defaults to TREE, PATH, MACCS, CIRCULAR and LINGO.
        screenTypeList (list): substructure screen types. Defaults to None (no screen databases).
        limitPerceptions (bool): passed to the OE molecule build. Defaults to False.
        suppressHydrogens (bool): passed to the OE molecule build. Defaults to False.
        quietFlag (bool): passed to the OE utilities. Defaults to True.
        logSizes (bool): passed to the search index cache test. Defaults to False.

    Returns:
        (bool): True when all steps ran without raising, False if an exception occurred
    """
    try:
        useCache = kwargs.get("useCache", True)
        cachePath = kwargs.get("cachePath", ".")
        numProc = kwargs.get("numProc", 2)
        molLimit = kwargs.get("molLimit", None)
        fpTypeList = kwargs.get("fpTypeList", ["TREE", "PATH", "MACCS", "CIRCULAR", "LINGO"])
        screenTypeList = kwargs.get("screenTypeList", None)
        limitPerceptions = kwargs.get("limitPerceptions", False)
        suppressHydrogens = kwargs.get("suppressHydrogens", False)
        quietFlag = kwargs.get("quietFlag", True)
        logSizes = kwargs.get("logSizes", False)
        # Fixed build choices: standard fingerprint handling and screened databases enabled
        fpDbType = "STANDARD"
        buildScreenedDb = True
        #
        oeCount = 0
        errCount = 0
        failIdList = []
        oeIo = OeIoUtils(quietFlag=quietFlag)
        #
        # -- OE binary search molecule file --
        oeSearchMolFilePath = os.path.join(self.__dirPath, self.__getOeSearchMolFileName())
        if not (useCache and self.__mU.exists(oeSearchMolFilePath)):
            # Forward the remaining options; these three are supplied explicitly below
            explicitKeys = ("cachePath", "useCache", "molLimit")
            forwardD = {}
            for key, value in kwargs.items():
                if key not in explicitKeys:
                    forwardD[key] = value
            ccsiP = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=True, molLimit=molLimit, **forwardD)
            ok = ccsiP.testCache(minCount=molLimit, logSizes=logSizes)
            # An index that fails its cache test is treated as empty
            ccIdxD = ccsiP.getIndex() if ok else {}
            idxCount = len(ccIdxD)
            #
            startTime = time.time()
            oeCount, errCount, failIdList = oeIo.buildOeBinaryMolCacheFromIndex(
                oeSearchMolFilePath,
                ccIdxD,
                quietFlag=quietFlag,
                fpTypeList=fpTypeList,
                limitPerceptions=limitPerceptions,
                suppressHydrogens=suppressHydrogens,
            )
            if failIdList:
                logger.info("failures %r", failIdList)
            endTime = time.time()
            logger.info("Constructed %d/%d cached oeMols (unconverted %d) (%.4f seconds)", oeCount, idxCount, errCount, endTime - startTime)
        #
        # -- Indexed OE molecule database --
        oeMolDbFilePath = os.path.join(self.__dirPath, self.__getOeMolDbFileName())
        if not (useCache and self.__mU.exists(oeMolDbFilePath)):
            startTime = time.time()
            molCount = oeIo.createOeBinaryDatabaseAndIndex(oeSearchMolFilePath, oeMolDbFilePath)
            endTime = time.time()
            logger.info("Created and stored %d indexed oeMols in OE database format (%.4f seconds)", molCount, endTime - startTime)
        #
        # -- Fast fingerprint databases (not reached while fpDbType is "STANDARD") --
        if fpDbType == "FAST":
            for fpType in fpTypeList:
                startTime = time.time()
                fpPath = os.path.join(self.__dirPath, self.__getFastFpDbFileName(fpType))
                if not (useCache and self.__mU.exists(fpPath)):
                    ok = oeIo.createOeFingerPrintDatabase(oeMolDbFilePath, fpPath, fpType=fpType)
                    endTime = time.time()
                    logger.info("Created and stored %s fingerprint database (%.4f seconds)", fpType, endTime - startTime)
        #
        # -- Screened substructure search databases --
        if buildScreenedDb and screenTypeList:
            for screenType in screenTypeList:
                startTime = time.time()
                fp = os.path.join(self.__dirPath, self.__getSubSearchFileName(screenType))
                if not (useCache and self.__mU.exists(fp)):
                    ok = oeIo.createOeSubSearchDatabase(oeSearchMolFilePath, fp, screenType=screenType, numProc=numProc)
                    endTime = time.time()
                    logger.info("Constructed screened substructure database (status %r) with screenType %s (%.4f seconds)", ok, screenType, endTime - startTime)
                    # Reload the new database and compare its size with the converted molecule
                    # count; the comparison result is computed but not acted on
                    ssDb = oeIo.loadOeSubSearchDatabase(fp, screenType=screenType, numProc=numProc)
                    ok = ssDb.NumMolecules() == oeCount
        #
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return False
def buildSearchFiles(self, **kwargs):
    """Build cif, sdf (optional), and mol2 (optional) files for components in the chemical component search index.

    Entries failing the CIF or OE molecule checks (self.__checkCif and
    self.__checkOeMol) are skipped and not counted.

    Args:
        ccUrlTarget (str): locator for source chemical component dictionary (default: None)
        birdUrlTarget (str): locator for source BIRD dictionary (default: None)
        limitPerceptions (bool): restrict automatic perceptions in OE molecular build operations (default: False)
        numProc (int): number of processors (default: 2)
        useCache (bool): use existing resource files where possible (default: True)
        molLimit (int): limit on the number of ingested chemical components (default: None)
        quietFlag (bool): suppress output in OE library operations (default: True)
        fpTypeList (list): fingerprint types passed to the OE molecule provider (default: [])
        screenTypeList (list): screen types passed to the OE molecule provider (default: [])
        minCount (int): minimum count for the molecule provider cache test (default: 0)
        useSdf (bool): write an SDF file for each search index entry (default: True)
        useMol2 (bool): write a MOL2 file for each search index entry (default: False)

    Returns:
        (int): number of search index entries that passed all checks
    """
    cachePath = self.__cachePath
    ccUrlTarget = kwargs.get("ccUrlTarget", None)
    birdUrlTarget = kwargs.get("birdUrlTarget", None)
    molLimit = kwargs.get("molLimit", None)
    quietFlag = kwargs.get("quietFlag", True)
    fpTypeList = kwargs.get("fpTypeList", [])
    screenTypeList = kwargs.get("screenTypeList", [])
    ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
    oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe-cc-full"
    numProc = kwargs.get("numProc", 2)
    minCount = kwargs.get("minCount", 0)
    useCache = kwargs.get("useCache", True)
    useSdf = kwargs.get("useSdf", True)
    useMol2 = kwargs.get("useMol2", False)
    limitPerceptions = kwargs.get("limitPerceptions", False)
    logSizes = False
    #
    startTime = time.time()
    ccmP = ChemCompMoleculeProvider(
        cachePath=cachePath, useCache=useCache, ccFileNamePrefix=ccFileNamePrefix, ccUrlTarget=ccUrlTarget, birdUrlTarget=birdUrlTarget, molLimit=molLimit
    )
    ok = ccmP.testCache(minCount=minCount, logSizes=logSizes)
    logger.info("Completed chemical component provider load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    startTime = time.time()
    oesmp = OeSearchMoleculeProvider(
        ccUrlTarget=ccUrlTarget,
        birdUrlTarget=birdUrlTarget,
        cachePath=cachePath,
        ccFileNamePrefix=ccFileNamePrefix,
        oeFileNamePrefix=oeFileNamePrefix,
        useCache=useCache,
        quietFlag=quietFlag,
        fpTypeList=fpTypeList,
        screenTypeList=screenTypeList,
        numProc=numProc,
        molLimit=molLimit,
        limitPerceptions=limitPerceptions,
    )
    ok = oesmp.testCache()
    logger.info("Completed OE molecule provider load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    startTime = time.time()
    ccSIdxP = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=useCache, ccFileNamePrefix=ccFileNamePrefix, limitPerceptions=limitPerceptions, numProc=numProc)
    ok = ccSIdxP.testCache()
    logger.info("Completed chemical component search index load %r (%.4f seconds)", ok, time.time() - startTime)
    #
    ccSIdx = ccSIdxP.getIndex() if ccSIdxP and ok else {}
    logger.info("Search index status %r index length %d", ok, len(ccSIdx))
    #
    # Parent components whose standard CIF file is already known to be in place,
    # so that variants of the same parent do not repeat the check or the export.
    readyCcIds = set()
    mU = MarshalUtil()
    oeU = OeIoUtils(dirPath=cachePath)
    numMols = 0
    searchFileDirPath = self.getSearchDirFilePath()
    pathTupList = []
    for sId in ccSIdx:
        ccId = sId.split("|")[0]
        # Files for a parent and its variants share the directory <first character>/<ccId>
        ccDirPath = os.path.join(searchFileDirPath, ccId[0], ccId)
        # standard CIF definition
        if ccId not in readyCcIds:
            cifPath = os.path.join(ccDirPath, ccId + ".cif")
            if not (useCache and mU.exists(cifPath)):
                ccMol = ccmP.getMol(ccId)
                if not self.__checkCif(ccMol):
                    continue
                mU.doExport(cifPath, [ccMol], fmt="mmcif")
            readyCcIds.add(ccId)
        #
        # Sanity checks on the generated OE molecule
        oeMol = oesmp.getMol(sId)
        if not self.__checkOeMol(oeMol):
            continue
        #
        # Variant entries (search id differs from the parent id) get their own CIF file
        cifPath = os.path.join(ccDirPath, sId + ".cif")
        if sId != ccId and not (useCache and mU.exists(cifPath)):
            oeccU = OeChemCompUtils()
            if oeccU.addOeMol(sId, oeMol, missingModelXyz=True, writeIdealXyz=False):
                oeccU.write(cifPath)
        #
        if useSdf:
            molFilePath = os.path.join(ccDirPath, sId + ".sdf")
            if not (useCache and mU.exists(molFilePath)):
                if oeU.write(molFilePath, oeMol, constantMol=False, addSdTags=True):
                    pathTupList.append((sId, molFilePath, "sdf"))
        #
        if useMol2:
            mol2FilePath = os.path.join(ccDirPath, sId + ".mol2")
            if not (useCache and mU.exists(mol2FilePath)):
                # Record the path only when this write succeeds (a stale status from an
                # earlier step previously decided this)
                if oeU.write(mol2FilePath, oeMol, constantMol=False, addSdTags=True):
                    pathTupList.append((sId, mol2FilePath, "mol2"))
        numMols += 1
    #
    self.__storePathList(pathTupList)
    return numMols
class ChemCompModelBuild(object):
    """Build chemical component models from search results and maintain the model index."""

    def __init__(self, cachePath, prefix=None):
        """Load the chemical component search index from the cache.

        Args:
            cachePath (str): top-level cache directory
            prefix (str, optional): qualifier inserted in cache file and directory names
        """
        self.__cachePath = cachePath
        self.__prefix = prefix
        loadStart = time.time()
        if self.__prefix:
            ccFileNamePrefix = "cc-%s" % self.__prefix
        else:
            ccFileNamePrefix = "cc"
        self.__ccSIdxP = ChemCompSearchIndexProvider(cachePath=cachePath, useCache=True, ccFileNamePrefix=ccFileNamePrefix)
        loadOk = self.__ccSIdxP.testCache()
        logger.info("Completed chemical component search index load %r (%.4f seconds)", loadOk, time.time() - loadStart)
        #
        self.__startTime = time.time()
        logger.info("Starting model build (%s) at %s", __version__, time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def getModelDirFilePath(self):
        """Return the directory path holding the model files."""
        if self.__prefix:
            dirName = "cc-%s-model-files" % self.__prefix
        else:
            dirName = "cc-model-files"
        return os.path.join(self.__cachePath, dirName)

    def getModelImageDirFilePath(self):
        """Return the directory path holding the model images."""
        if self.__prefix:
            dirName = "cc-%s-model-image" % self.__prefix
        else:
            dirName = "cc-model-images"
        return os.path.join(self.__cachePath, dirName)

    def fetchModelIndex(self):
        """Return the stored model index, or an empty dictionary if the import raises."""
        indexD = {}
        try:
            marshal = MarshalUtil(workPath=self.__cachePath)
            indexD = marshal.doImport(self.__getModelIndexPath(), fmt="json")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return indexD

    def storeModelIndex(self, mD):
        """Write the model index as JSON and return the export status (False on exception)."""
        try:
            marshal = MarshalUtil(workPath=self.__cachePath)
            return marshal.doExport(self.__getModelIndexPath(), mD, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __getModelIndexPath(self):
        """Return the path of the model index file inside the model directory."""
        return os.path.join(self.getModelDirFilePath(), "model-index.json")

    def build(self, alignType="relaxed-stereo", numProc=4, chunkSize=10, verbose=False):
        """Run the model build step in the chemical component model workflow.

        Args:
            alignType (str): "relaxed"|"strict"|"relaxed-stereo". Default: relaxed-stereo
            numProc (int, optional): number of processes to invoke. Defaults to 4.
            chunkSize (int, optional): work chunksize. Defaults to 10.
            verbose (bool, optional): verbose logging. Defaults to False.

        Returns:
            (dict): {searchId: [{"targetId": , "modelId": , "modelPath": ,"matchId": , "parentId": , "rFactor": , }]
        """
        retD = {}
        try:
            searcher = ChemCompModelSearch(self.__cachePath, None, None, prefix=self.__prefix)
            modelDirPath = self.getModelDirFilePath()
            imageDirPath = self.getModelImageDirFilePath()
            #
            idxPathD = searcher.getResultIndex()
            workL = list(idxPathD.values())
            # Parent coverage: distinct identifiers preceding any "|" separator
            parentIds = set()
            for sId in idxPathD:
                parentIds.add(sId.split("|")[0])
            logger.info("Using search result index length ridxD (%d) parent coverage (%d)", len(idxPathD), len(parentIds))
            #
            worker = ChemCompModelBuildWorker(self.__cachePath, verbose=verbose)
            workerOptions = {
                "modelDirPath": modelDirPath,
                "imageDirPath": imageDirPath,
                "alignType": alignType,
                "ccSIdxP": self.__ccSIdxP,
            }
            mpu = MultiProcUtil(verbose=True)
            mpu.setWorkingDir(modelDirPath)
            mpu.setOptions(optionsD=workerOptions)
            mpu.set(workerObj=worker, workerMethod="build")
            #
            ok, failList, resultList, _ = mpu.runMulti(dataList=workL, numProc=numProc, numResults=1, chunkSize=chunkSize)
            logger.info("Run ended with status %r success count %d failures %r", ok, len(resultList[0]), len(failList))
            # Group the successful model records by parent component
            for modelD in copy.copy(resultList[0]):
                parentId = modelD["parentId"]
                if parentId not in retD:
                    retD[parentId] = []
                retD[parentId].append(modelD)
            #
            if not retD:
                logger.info("No models built")
            else:
                logger.info("Completed build with models for %d parent chemical definitions", len(retD))
            ok = self.storeModelIndex(retD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retD