def testSubStructureSearchScreened(self):
    """Run screened substructure searches for the leading index entries.

    The "oe-smiles" descriptor of each of the first 20 indexed components is
    converted to a query molecule and searched. Components whose own
    identifier is missing from the result list are collected and logged;
    they do not fail the test.
    """
    ioUtil = OeIoUtils()
    molProvider = OeMoleculeProvider(**self.__myKwargs)
    # The molecule cache is exercised here but its status is not asserted.
    molProvider.testCache()
    idxProvider = ChemCompIndexProvider(**self.__myKwargs)
    ccIdxD = idxProvider.getIndex()
    self.assertTrue(idxProvider.testCache(minCount=self.__minCount))
    searcher = OeSearchUtils(molProvider, screenType=self.__screenType, numProc=self.__numProc)
    sampleSize = 20
    missedIdL = []
    for ccId, ccD in list(ccIdxD.items())[:sampleSize]:
        beginTime = time.time()
        if "oe-smiles" not in ccD:
            continue
        smiles = ccD["oe-smiles"]
        logger.info("Search %s %r", ccId, smiles)
        queryMol = ioUtil.smartsToQmol(smiles)
        status, hitL = searcher.searchSubStructureScreened(queryMol, maxMatches=100)
        if status:
            logger.info(
                "%s (status=%r) match length %d in (%.4f seconds)",
                ccId, status, len(hitL), time.time() - beginTime)
        if not self.__resultContains(ccId, hitL):
            missedIdL.append(ccId)
    logger.info("Missed searches (%d) %r", len(missedIdL), missedIdL)
def reloadSearchDatabase(self):
    """Reload the in-memory search databases from the OE molecule provider.

    Resource requirements: ~90sec load time 0.35 GB memory

    Returns:
        bool: True for success or False otherwise
    """
    # Bind both status flags up front so the final return is well defined
    # even when an exception interrupts the reload part way through.
    ok1 = ok2 = False
    try:
        if not self.updateSearchMoleculeProvider(useCache=True):
            return False
        oesmpKwargs = self.__configD["oesmpKwargs"]
        fpTypeCuttoffD = oesmpKwargs["fpTypeCuttoffD"] if "fpTypeCuttoffD" in oesmpKwargs else {}
        fpTypeList = list(fpTypeCuttoffD.keys())
        #
        oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
        ok1 = oesU.testCache()
        # Only publish a search utility whose cache test passed.
        self.__oesU = oesU if ok1 else None
        #
        oesubsU = OeSubStructSearchUtils(self.__oesmP)
        ok2 = oesubsU.testCache()
        self.__oesubsU = oesubsU if ok2 else None
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return ok1 and ok2
def testSubStructureSearchWithFingerPrint(self):
    """Search the leading index entries with every configured fingerprint type.

    Each of the first 20 components is fetched from the molecule provider
    and searched with a fingerprint prefilter; every search must succeed
    and must return the query component itself.
    """
    molProvider = OeMoleculeProvider(**self.__myKwargs)
    # The molecule cache is exercised here but its status is not asserted.
    molProvider.testCache()
    idxProvider = ChemCompIndexProvider(**self.__myKwargs)
    ccIdxD = idxProvider.getIndex()
    self.assertTrue(idxProvider.testCache(minCount=self.__minCount))
    scoreCutoff = 0.40
    resultLimit = 50
    sampleSize = 20
    searchMode = "graph-relaxed"
    searcher = OeSearchUtils(molProvider, fpTypeList=self.__fpTypeList)
    beginTime = time.time()
    for ccId in list(ccIdxD.keys())[:sampleSize]:
        for fpType in self.__fpTypeList:
            queryMol = molProvider.getMol(ccId)
            status, hitL = searcher.searchSubStructureWithFingerPrint(
                queryMol, fpType, scoreCutoff, resultLimit, matchOpts=searchMode)
            self.assertTrue(status)
            self.assertTrue(self.__resultContains(ccId, hitL))
    logger.info(
        "%s fingerprints search on %d in (%.4f seconds)",
        len(self.__fpTypeList), sampleSize, time.time() - beginTime)
def testFingerPrintSearch(self):
    """Fingerprint-search the leading index entries with each fingerprint type.

    Each of the first 50 components is searched with every configured
    fingerprint type; every search must succeed and return the query
    component itself.
    """
    molProvider = OeMoleculeProvider(**self.__myKwargs)
    # This will reload the oe binary cache.
    probeMol = molProvider.getMol("004")
    self.assertGreaterEqual(len(list(probeMol.GetAtoms())), 12)
    # The molecule cache is exercised here but its status is not asserted.
    molProvider.testCache()
    idxProvider = ChemCompIndexProvider(**self.__myKwargs)
    ccIdxD = idxProvider.getIndex()
    self.assertTrue(idxProvider.testCache(minCount=self.__minCount))
    scoreCutoff = 0.50
    resultLimit = 50
    sampleSize = 50
    searcher = OeSearchUtils(molProvider, fpTypeList=self.__fpTypeList)
    beginTime = time.time()
    for ccId in list(ccIdxD.keys())[:sampleSize]:
        for fpType in self.__fpTypeList:
            queryMol = molProvider.getMol(ccId)
            status, hitL = searcher.searchFingerPrints(
                queryMol, fpType=fpType, minFpScore=scoreCutoff, maxFpResults=resultLimit)
            self.assertTrue(status)
            self.assertTrue(self.__resultContains(ccId, hitL))
    logger.info(
        "%s fingerprints search on %d in (%.4f seconds)",
        len(self.__fpTypeList), sampleSize, time.time() - beginTime)
def __fingerPrintSearch(self, numMols, **kwargs):
    """Fingerprint-search indexed components rebuilt from their descriptors.

    Args:
        numMols (int): upper bound on the number of index entries searched;
            a falsy value selects the whole index
        **kwargs: optional "maxFpResults", "limitPerceptions",
            "fpTypeCuttoffList" and "buildTypeList" settings; all keywords
            are also passed on to the search data provider lookup

    Returns:
        bool: always True; an unsuccessful search fails through assertTrue
    """
    resultLimit = kwargs.get("maxFpResults", 50)
    perceptionLimit = kwargs.get("limitPerceptions", False)
    fpCutoffL = kwargs.get("fpTypeCuttoffList", [("TREE", 0.6)])
    descriptorTypeL = kwargs.get("buildTypeList", ["oe-iso-smiles"])
    molProvider, ccIdxD = self.__getSearchDataProviders(**kwargs)
    searcher = OeSearchUtils(molProvider, fpTypeList=[pair[0] for pair in fpCutoffL])
    ioUtil = OeIoUtils()
    # This will reload the oe binary cache.
    probeMol = molProvider.getMol("004")
    self.assertGreaterEqual(len(list(probeMol.GetAtoms())), 12)

    fpMissD = {}
    buildMissD = {}
    numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
    logger.info("Begin finger print search on %d molecules", numMols)
    beginTime = time.time()
    for ccId, ccD in list(ccIdxD.items())[:numMols]:
        for buildType in descriptorTypeL:
            if buildType not in ccD:
                continue
            queryMol = ioUtil.descriptorToMol(
                ccD[buildType], buildType,
                limitPerceptions=perceptionLimit,
                messageTag=ccId + ":" + buildType)
            if not queryMol:
                continue
            foundSelf = False
            for fpType, scoreCutoff in fpCutoffL:
                status, hitL = searcher.searchFingerPrints(
                    queryMol, fpType=fpType, minFpScore=scoreCutoff, maxFpResults=resultLimit)
                self.assertTrue(status)
                if self.__resultContains(ccId, hitL):
                    foundSelf = True
                else:
                    fpMissD.setdefault(ccId, []).append((buildType, fpType, len(hitL)))
            # The descriptor counts as missed only when no fingerprint type found it.
            if not foundSelf:
                buildMissD.setdefault(ccId, []).append(buildType)

    for ccId, missedTypeL in buildMissD.items():
        logger.info("%s missed all fptypes: buildtype list %r", ccId, missedTypeL)
        if ccId in fpMissD:
            logger.info("%s unmatched by fpTypes %r", ccId, fpMissD[ccId])
    logger.info(
        "%s fingerprints search on %d in (%.4f seconds)",
        len(fpCutoffL), numMols, time.time() - beginTime)
    return True
def __subStructureSearchScreened(self, numMols, **kwargs):
    """Run screened substructure searches for each requested screen type.

    Args:
        numMols (int): upper bound on the number of index entries searched;
            a falsy value selects the whole index
        **kwargs: optional "buildTypeList" and "screenTypeList" settings;
            all keywords are also passed on to the search data provider lookup

    Returns:
        bool: always True; self-match misses are logged, not asserted
    """
    descriptorTypeL = kwargs.get("buildTypeList", ["oe-iso-smiles"])
    screenTypeL = kwargs.get("screenTypeList", ["SMARTS"])
    molProvider, ccIdxD = self.__getSearchDataProviders(**kwargs)
    for screenType in screenTypeL:
        searcher = OeSearchUtils(molProvider, screenType=screenType, numProc=self.__numProc)
        ioUtil = OeIoUtils()
        missedL = []
        numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
        for idx, ccId in enumerate(list(ccIdxD.keys())[:numMols]):
            ccD = ccIdxD[ccId]
            for buildType in descriptorTypeL:
                if buildType not in ccD:
                    continue
                tag = ccId + ":" + buildType
                if screenType == "SMARTS":
                    # The descriptor is first rewritten as SMILES, then read as SMARTS.
                    smiles = ioUtil.descriptorToSmiles(ccD[buildType], buildType, messageTag=tag)
                    queryMol = ioUtil.descriptorToMol(smiles, "SMARTS", messageTag=tag)
                else:
                    queryMol = ioUtil.descriptorToQMol(ccD[buildType], "SMARTS", messageTag=tag)
                if not queryMol:
                    logger.debug("%s build failed for %s - skipping", ccId, buildType)
                    continue
                beginTime = time.time()
                status, hitL = searcher.searchSubStructureScreened(queryMol, maxMatches=100)
                if status:
                    logger.debug(
                        "%s - %s - %s (status=%r) match length %d in (%.4f seconds)",
                        ccId, buildType, screenType, status, len(hitL),
                        time.time() - beginTime)
                if not self.__resultContains(ccId, hitL):
                    missedL.append((ccId, buildType, screenType))
            # Progress report on every hundredth component.
            if idx % 100 == 0:
                logger.info("Completed %d of %d missed count %d", idx, numMols, len(missedL))
        logger.info("Screen %r missed searches (%d) %r", screenType, len(missedL), missedL)
    return True
def __exhaustiveSubStructureSearch(self, numMols, **kwargs):
    """Exhaustive substructure search.

    Each selected component is rebuilt from every available descriptor in
    the build type list and searched with "graph-strict" matching. Every
    search must succeed and must return the query component itself.

    Args:
        numMols (int): number of leading index entries to search
        **kwargs: optional "limitPerceptions" and "buildTypeList" settings;
            all keywords are also passed on to the search data provider lookup

    Returns:
        bool: True when every search passes; any error or failed assertion
            is logged and reported through self.fail()
    """
    try:
        limitPerceptions = kwargs.get("limitPerceptions", False)
        buildTypeList = kwargs.get("buildTypeList", ["oe-iso-smiles"])
        oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
        oesU = OeSearchUtils(oesmP, fpTypeList=[])
        oeioU = OeIoUtils()
        #
        for ccId, ccD in list(ccIdxD.items())[:numMols]:
            matchCount = 0
            mtS = set()
            for buildType in buildTypeList:
                if buildType not in ccD:
                    continue
                oeMol = oeioU.descriptorToMol(
                    ccD[buildType], buildType,
                    limitPerceptions=limitPerceptions,
                    messageTag=ccId + ":" + buildType)
                if not oeMol:
                    logger.error("%s %s build query molecule build fails (skipping)", ccId, buildType)
                    continue
                # ----
                startTime = time.time()
                retStatus, mL = oesU.searchSubStructure(oeMol, matchOpts="graph-strict")
                # Evaluate the self-match once, and only for a successful
                # search, so the logging branches and the assertions agree.
                matchedSelf = bool(retStatus) and self.__resultContains(ccId, mL)
                if not retStatus:
                    logger.info("%s match fails for build type %s", ccId, buildType)
                elif not matchedSelf:
                    logger.info(
                        "%s failed match length %d build type %s in (%.4f seconds)",
                        ccId, len(mL), buildType, time.time() - startTime)
                else:
                    mtS.update(m.ccId for m in mL)
                    matchCount += 1
                self.assertTrue(retStatus)
                self.assertTrue(matchedSelf)
            if matchCount:
                logger.info("%s MATCHES %d: %r", ccId, matchCount, mtS)
            else:
                logger.info("%s NO MATCHES", ccId)
        # ----
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        # self.fail() always raises, so nothing after it can execute; pass
        # the cause along so the test report shows why the search failed.
        self.fail(str(e))
def testSubStructureSearch(self):
    """Substructure-search the first 10 indexed components with "relaxed" matching.

    Every search must succeed and return the query component itself.
    """
    molProvider = OeMoleculeProvider(**self.__myKwargs)
    # The molecule cache is exercised here but its status is not asserted.
    molProvider.testCache()
    idxProvider = ChemCompIndexProvider(**self.__myKwargs)
    ccIdxD = idxProvider.getIndex()
    self.assertTrue(idxProvider.testCache(minCount=self.__minCount))
    searcher = OeSearchUtils(molProvider, fpTypeList=self.__fpTypeList)
    sampleSize = 10
    for ccId in list(ccIdxD.keys())[:sampleSize]:
        beginTime = time.time()
        queryMol = molProvider.getMol(ccId)
        status, hitL = searcher.searchSubStructure(queryMol, matchOpts="relaxed")
        elapsed = time.time() - beginTime
        logger.info("%s match length %d in (%.4f seconds)", ccId, len(hitL), elapsed)
        self.assertTrue(status)
        self.assertTrue(self.__resultContains(ccId, hitL))
def testSubStructureSearchScreenedFiltered(self):
    """Run screened substructure searches against the filtered caches.

    Up to 5000 index entries from the "cc-filtered"/"oe-filtered" caches are
    searched by their "oe-smiles" descriptor. Components missing from their
    own result list are collected and logged; they do not fail the test.
    """
    filteredKwargs = dict(
        cachePath=self.__cachePath,
        useCache=True,
        fpTypeList=self.__fpTypeList,
        ccFileNamePrefix="cc-filtered",
        oeFileNamePrefix="oe-filtered",
        molBuildType="oe-iso-smiles",
        limitPerceptions=False,
    )
    ioUtil = OeIoUtils()
    molProvider = OeMoleculeProvider(**filteredKwargs)
    # The molecule cache is exercised here but its status is not asserted.
    molProvider.testCache()
    idxProvider = ChemCompIndexProvider(**filteredKwargs)
    ccIdxD = idxProvider.getIndex()
    self.assertTrue(idxProvider.testCache(minCount=self.__minCount))
    searcher = OeSearchUtils(molProvider, screenType=self.__screenType, numProc=self.__numProc)
    sampleSize = 5000
    missedIdL = []
    for ccId, ccD in list(ccIdxD.items())[:sampleSize]:
        beginTime = time.time()
        if "oe-smiles" not in ccD:
            continue
        smiles = ccD["oe-smiles"]
        logger.info("Search %s %r", ccId, smiles)
        queryMol = ioUtil.smartsToQmol(smiles)
        status, hitL = searcher.searchSubStructureScreened(queryMol, maxMatches=100)
        logger.info(
            "%s (status=%r)match length %d in (%.4f seconds)",
            ccId, status, len(hitL), time.time() - beginTime)
        if not self.__resultContains(ccId, hitL):
            missedIdL.append(ccId)
    logger.info("Missed searches (%d) %r", len(missedIdL), missedIdL)
def __exhaustiveSubStructureSearch(self, numMols, **kwargs):
    """Exhaustive substructure search.

    Rebuilds each selected component from every available descriptor in the
    build type list and searches it with "graph-strict" matching; each
    search must succeed and return the query component itself.

    Args:
        numMols (int): number of leading index entries to search
        **kwargs: optional "limitPerceptions" and "buildTypeList" settings;
            all keywords are also passed on to the search data provider lookup

    Returns:
        bool: True when every search passes; errors are logged and reported
            through self.fail()
    """
    try:
        perceptionLimit = kwargs.get("limitPerceptions", False)
        descriptorTypeL = kwargs.get("buildTypeList", ["oe-iso-smiles"])
        molProvider, indexD = self.__getSearchDataProviders(**kwargs)
        searcher = OeSearchUtils(molProvider, fpTypeList=[])
        ioUtil = OeIoUtils()
        for ccId, ccD in list(indexD.items())[:numMols]:
            for buildType in descriptorTypeL:
                if buildType not in ccD:
                    continue
                queryMol = ioUtil.descriptorToMol(
                    ccD[buildType], buildType,
                    limitPerceptions=perceptionLimit,
                    messageTag=ccId + ":" + buildType)
                if not queryMol:
                    continue
                beginTime = time.time()
                status, hitL = searcher.searchSubStructure(queryMol, matchOpts="graph-strict")
                # Log the details of a self-match miss before the assertions fire.
                if not self.__resultContains(ccId, hitL):
                    logger.info(
                        "%s match length %d build type %s in (%.4f seconds)",
                        ccId, len(hitL), buildType, time.time() - beginTime)
                self.assertTrue(status)
                self.assertTrue(self.__resultContains(ccId, hitL))
        return True
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
    return False
def setUp(self):
    """Prepare paths, provider settings, providers and search utilities.

    The provider settings point at the "full" caches when
    OeSubStructSearchCompareTests.useFull is set; otherwise they point at
    the abbreviated test data files and caches.
    """
    outputDir = os.path.join(HERE, "test-output")
    self.__workPath = outputDir
    self.__dataPath = os.path.join(HERE, "test-data")
    self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
    self.__ccUrlTarget = os.path.join(self.__dataPath, "components-abbrev.cif")
    self.__birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif")
    self.__doDisplay = True
    self.__numProcPrep = 6
    self.__numProcSearch = 6
    self.__minCount = None
    self.__startTime = time.time()

    # Both configurations share every setting except the source targets
    # (abbreviated data only) and the cache file name prefixes.
    fullData = OeSubStructSearchCompareTests.useFull
    settingsD = {}
    if not fullData:
        settingsD["ccUrlTarget"] = self.__ccUrlTarget
        settingsD["birdUrlTarget"] = self.__birdUrlTarget
    settingsD["cachePath"] = self.__cachePath
    settingsD["useCache"] = True
    settingsD["ccFileNamePrefix"] = "cc-full" if fullData else "cc-abbrev"
    settingsD["oeFileNamePrefix"] = "oe-full" if fullData else "oe-abbrev"
    settingsD["molBuildType"] = "model-xyz"
    settingsD["limitPerceptions"] = False
    settingsD["screenTypeList"] = None
    settingsD["numProc"] = self.__numProcPrep
    settingsD["suppressHydrogens"] = True
    settingsD["matchOpts"] = "sub-struct-graph-relaxed"
    settingsD["fpTypeCuttoffD"] = {"TREE": 0.6, "MACCS": 0.9}
    settingsD["maxFpResults"] = 50
    self.__myKwargs = settingsD

    self.__oesmP = OeSearchMoleculeProvider(**self.__myKwargs)
    self.assertTrue(self.__oesmP.testCache())

    # The component molecule cache is exercised but its status is not asserted.
    self.__ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
    self.__ccmP.testCache()

    self.__ccsidxP = ChemCompSearchIndexProvider(**self.__myKwargs)
    self.assertTrue(self.__ccsidxP.testCache(minCount=self.__minCount))

    self.__oessU = OeSubStructSearchUtils(self.__oesmP)
    self.assertTrue(self.__oessU.testCache())

    fpTypeL = list(self.__myKwargs.get("fpTypeCuttoffD", {}).keys())
    self.__oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeL)
    self.assertTrue(self.__oesU.testCache())

    logger.debug("Running tests on version %s", __version__)
    logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
class OeSubStructSearchCompareTests(unittest.TestCase):
    """Compare substructure and match searches over the component caches.

    The query tests are skipped unless the class attribute ``useFull`` is
    true when the class body is evaluated.
    """

    useFull = False

    def setUp(self):
        """Prepare paths, provider settings, providers and search utilities."""
        self.__workPath = os.path.join(HERE, "test-output")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__ccUrlTarget = os.path.join(self.__dataPath, "components-abbrev.cif")
        self.__birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif")
        self.__doDisplay = True
        self.__numProcPrep = 6
        self.__numProcSearch = 6
        self.__minCount = None
        self.__startTime = time.time()
        #
        # The full and abbreviated configurations differ only in the source
        # targets (abbreviated data only) and the cache file name prefixes.
        useFull = OeSubStructSearchCompareTests.useFull
        myKwargs = {}
        if not useFull:
            myKwargs["ccUrlTarget"] = self.__ccUrlTarget
            myKwargs["birdUrlTarget"] = self.__birdUrlTarget
        myKwargs.update(
            {
                "cachePath": self.__cachePath,
                "useCache": True,
                "ccFileNamePrefix": "cc-full" if useFull else "cc-abbrev",
                "oeFileNamePrefix": "oe-full" if useFull else "oe-abbrev",
                "molBuildType": "model-xyz",
                "limitPerceptions": False,
                "screenTypeList": None,
                "numProc": self.__numProcPrep,
                "suppressHydrogens": True,
                "matchOpts": "sub-struct-graph-relaxed",
                "fpTypeCuttoffD": {"TREE": 0.6, "MACCS": 0.9},
                "maxFpResults": 50,
            }
        )
        self.__myKwargs = myKwargs
        #
        self.__oesmP = OeSearchMoleculeProvider(**self.__myKwargs)
        ok = self.__oesmP.testCache()
        self.assertTrue(ok)
        #
        self.__ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
        self.__ccmP.testCache()
        #
        self.__ccsidxP = ChemCompSearchIndexProvider(**self.__myKwargs)
        ok = self.__ccsidxP.testCache(minCount=self.__minCount)
        self.assertTrue(ok)
        self.__oessU = OeSubStructSearchUtils(self.__oesmP)
        ok = self.__oessU.testCache()
        self.assertTrue(ok)
        #
        fpTypeCuttoffD = self.__myKwargs.get("fpTypeCuttoffD", {})
        fpTypeList = list(fpTypeCuttoffD.keys())
        self.__oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
        ok = self.__oesU.testCache()
        self.assertTrue(ok)
        #
        logger.debug("Running tests on version %s", __version__)
        logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        """Log the completion time and elapsed time of the test."""
        endTime = time.time()
        logger.info(
            "Completed %s at %s (%.4f seconds)",
            self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            endTime - self.__startTime)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchDescriptor(self):
        """Strict substructure search for a SMILES descriptor query."""
        self.__runQueryCases("n1ccccc1", "query-smiles", "oe-iso-smiles", ["sub-struct-graph-strict"])

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchSelected(self):
        """Substructure searches for a selected chemical component."""
        matchOptsList = [
            "sub-struct-graph-relaxed",
            "sub-struct-graph-relaxed-stereo",
            "sub-struct-graph-strict",
        ]
        self.__runQueryCases("STI", "STI", "CC", matchOptsList)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchAll(self):
        """Substructure searches for every component in the molecule provider."""
        matchOptsList = [
            "sub-struct-graph-relaxed",
            "sub-struct-graph-relaxed-stereo",
            "sub-struct-graph-strict",
        ]
        ccD = self.__ccmP.getMolD()
        for ccId in ccD:
            if ccId in ["UNX", "UNL", "UNK", "DUM"]:
                continue
            # Queries with fewer than three atoms are not searched.
            self.__runQueryCases(ccId, ccId, "CC", matchOptsList, numProcDefault=5, minAtoms=3)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testMatchSearchSelected(self):
        """Fingerprint and graph match searches for a selected chemical component."""
        matchOptsList = [
            "fingerprint-similarity",
            "graph-relaxed",
            "graph-relaxed-stereo",
            "graph-strict",
        ]
        self.__runQueryCases("STI", "STI", "CC", matchOptsList)

    def __runQueryCases(self, query, queryId, queryType, matchOptsList, numProcDefault=4, minAtoms=None):
        """Search one query under each match option and check the outcome.

        Args:
            query (str): component identifier or descriptor string
            queryId (str): identifier used for titles, logging and self-match checks
            queryType (str): "CC" for a component identifier, else a descriptor type
            matchOptsList (list): match option strings to run in order
            numProcDefault (int, optional): process count used when the
                settings carry no "numProc" value. Defaults to 4.
            minAtoms (int, optional): when given, query molecules with fewer
                atoms are skipped. Defaults to None.
        """
        limitPerceptions = self.__myKwargs.get("limitPerceptions", False)
        suppressHydrogens = self.__myKwargs.get("suppressHydrogens", True)
        numProc = self.__myKwargs.get("numProc", numProcDefault)
        for matchOpts in matchOptsList:
            oeMol = self.__getMol(
                query, queryType, queryId,
                limitPerceptions=limitPerceptions, suppressHydrogens=suppressHydrogens)
            if minAtoms is not None and oeMol.NumAtoms() < minAtoms:
                continue
            #
            startTime = time.time()
            retStatus, mL = self.__search(oeMol, matchOpts, numProc)
            logger.info(
                "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                queryId, retStatus, matchOpts, len(mL), time.time() - startTime)
            self.assertTrue(retStatus)
            # Only a component query is expected to find itself in the results.
            if queryType == "CC":
                self.assertTrue(self.__resultContains(queryId, mL))
            #
            if self.__doDisplay:
                self.__display(mL, query, queryId, queryType, matchOpts)

    def __search(self, oeMol, matchOpts, numProc):
        """Dispatch to the substructure or match search by the match option prefix.

        Returns:
            (bool, list): search status and result list; for
                "fingerprint-similarity" the fingerprint result list is returned
        """
        if matchOpts.startswith("sub-struct-"):
            retStatus, mL = self.__subStructureSearch(oeMol, matchOpts=matchOpts, numProc=numProc)
            return retStatus, mL
        retStatus, mL, fpL = self.__matchSearch(oeMol, matchOpts=matchOpts)
        rL = fpL if matchOpts == "fingerprint-similarity" else mL
        return retStatus, rL

    def __subStructureSearch(self, oeMol, matchOpts, numProc):
        """Prefilter the search index, then substructure-search the candidates."""
        ccIdL = self.__oessU.prefilterIndex(oeMol, self.__ccsidxP, matchOpts=matchOpts, skipFeatures=False)
        retStatus, mL = self.__oessU.searchSubStructure(oeMol, ccIdList=ccIdL, matchOpts=matchOpts, numProc=numProc)
        return retStatus, mL

    def __matchSearch(self, oeMol, matchOpts="graph-relaxed"):
        """Run the combined substructure and fingerprint search.

        Returns:
            (bool, list, list): status, substructure results and fingerprint
                results; (False, [], []) when the search raises
        """
        # Defaults are bound before the try so a failing search yields a
        # well-defined failure result; the lists are distinct objects.
        retStatus = False
        ssL, fpL = [], []
        try:
            fpTypeCuttoffD = self.__myKwargs.get("fpTypeCuttoffD", {})
            maxFpResults = self.__myKwargs.get("maxFpResults", 50)
            retStatus, ssL, fpL = self.__oesU.searchSubStructureAndFingerPrint(
                oeMol, list(fpTypeCuttoffD.items())[:2], maxFpResults, matchOpts=matchOpts)
            # logger.info("fpL %r", fpL)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return retStatus, ssL, fpL

    def __getMol(self, query, queryType, queryId, limitPerceptions=False, suppressHydrogens=True):
        """Return the query molecule, titled with queryId.

        A "CC" query is fetched from the search molecule provider; any other
        query type is built from the descriptor string.
        """
        oeioU = OeIoUtils()
        if queryType == "CC":
            oeMol = self.__oesmP.getMol(query)
        else:
            oeMol = oeioU.descriptorToMol(query, queryType, limitPerceptions=limitPerceptions, messageTag=queryId)
        #
        if suppressHydrogens:
            oeMol = oeioU.suppressHydrogens(oeMol)
        oeMol.SetTitle(queryId)
        return oeMol

    def __resultContains(self, ccId, matchResultList):
        """Return True when any match result carries the identifier ccId."""
        return any(matchResult.ccId == ccId for matchResult in matchResultList)

    #
    # ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------
    def __display(self, mL, query, queryId, queryType, matchOpts):
        """Depict one alignment per matched component in a PDF in the work path.

        Results are grouped by the identifier part before "|". For each group
        the entry whose identifier equals the group key is depicted when
        present, otherwise the best-scoring entry of the group.
        """
        smL = sorted(mL, key=lambda kv: kv.fpScore, reverse=True)
        # ----
        tD = {}
        for sm in smL:
            tD.setdefault(sm.ccId.split("|")[0], []).append(sm)
        dL = [next((tt for tt in ttL if tt.ccId == ccId), ttL[0]) for ccId, ttL in tD.items()]
        # ----
        pdfImagePath = os.path.join(self.__workPath, queryId + "-" + matchOpts + ".pdf")
        self.__displayPaginatedAlignments(pdfImagePath, query, queryType, queryId, dL, matchOpts=matchOpts)

    def __displayPaginatedAlignments(self, pdfImagePath, query, queryType, queryId, matchResultList, matchOpts="relaxed-stereo", alignMode="SS"):
        """Pair the query with each match result and depict the paginated alignments."""
        refId = queryId
        oeMolRef = self.__getMol(query, queryType, queryId, limitPerceptions=False, suppressHydrogens=True)
        pairList = []
        for mr in sorted(matchResultList, key=lambda kv: kv.fpScore, reverse=True):
            fitId = mr.ccId.split("|")[0]
            # Identifiers longer than four characters are labelled as variants.
            if len(mr.ccId) > 4:
                fitId = fitId + " (tautomer/protomer)"
            oeMolFit = self.__oesmP.getMol(mr.ccId)
            pairList.append((refId, oeMolRef, fitId, oeMolFit))
        #
        self.__depictFitList(pdfImagePath, pairList, matchOpts=matchOpts, alignMode=alignMode)

    def __pairDepictPage(self, imagePath, refId, refTitle, refMol, fitId, fitTitle, fitMol, matchOpts="strict"):
        """Depict pairwise alignment of the input reference and fit molecules.

        Args:
            imagePath (str): path to image (format by path extension)
            refId (str): reference molecule identifier
            refTitle (str): reference molecule title
            refMol (obj): reference OE molecule object
            fitId (str): fit molecule identifier
            fitTitle (str): fit molecule title
            fitMol (obj): fit OE molecule object
            matchOpts (str, optional): alignment criteria (relaxed|relaxed-stereo|strict). Defaults to "strict".

        Returns:
            (list): atom mapping in all aligned figures
                    [(reference component Id, reference atom name, fit chemical component Id, fit atom name)]
        """
        aML = []
        try:
            oed = OeDepictMCSAlignPage()
            oed.setSearchType(sType=matchOpts)
            oed.setRefMol(refMol, refId, title=refTitle)
            oed.setFitMol(fitMol, fitId, title=fitTitle)
            oed.setDisplayOptions(
                imageSizeX=2000,
                imageSizeY=1000,
                labelAtomName=True,
                labelAtomCIPStereo=True,
                labelAtomIndex=False,
                labelBondIndex=False,
                highlightStyleFit="ballAndStickInverse",
                bondDisplayWidth=0.5,
                highLightMatchColorRef="green",
                highLightNotMatchColorRef="pink",
            )
            aML = oed.alignPair(imagePath=imagePath)
            if aML:
                for (rCC, rAt, tCC, tAt) in aML:
                    logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return aML

    def __depictFitList(self, pdfImagePath, pairList, matchOpts="exact", alignMode="SS"):
        """Depict pairwise alignments with multi-page layout in PDF format.

        Args:
            pdfImagePath (str): PDF image path
            pairList (list): [(refId, refOeMol, fitId, fitOeMol)]
            matchOpts (str, optional): alignment criteria passed to the depiction. Defaults to "exact".
            alignMode (str, optional): "MCSS" selects the MCS multi-page depiction;
                any other value selects the substructure depiction. Defaults to "SS".

        Returns:
            (list): atom mapping in all aligned figures
                    [(reference component Id, reference atom name, fit chemical component Id, fit atom name)]
        """
        aML = []
        try:
            if alignMode == "MCSS":
                oed = OeDepictMCSAlignMultiPage()
            else:
                oed = OeDepictSubStructureAlignMultiPage()
            oed.setSearchType(sType=matchOpts)
            oed.setPairMolList(pairList)
            oed.setDisplayOptions(
                labelAtomName=True,
                labelAtomCIPStereo=True,
                labelAtomIndex=False,
                labelBondIndex=False,
                highlightStyleFit="ballAndStickInverse",
                pageOrientation="portrait",
                gridRows=4,
                bondDisplayWidth=0.5,
                highLightMatchColorRef="green",
                highLightNotMatchColorRef="pink",
            )
            aML = oed.alignPairListMulti(imagePath=pdfImagePath)
            if aML:
                for (rCC, rAt, tCC, tAt) in aML:
                    logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return aML
def __sssWithFingerPrintFromDescriptor(self, numMols, **kwargs):
    """Substructure search with a fingerprint prefilter from descriptor queries.

    Each selected component is rebuilt from every available descriptor in
    the build type list and searched once per (fingerprint type, cutoff)
    pair with "graph-strict" matching. Descriptors that never return the
    query component are recorded, logged and optionally displayed.

    Args:
        numMols (int): upper bound on the number of index entries searched;
            a falsy value selects the whole index
        **kwargs: optional "maxResults", "limitPerceptions",
            "fpTypeCuttoffList", "buildTypeList" and "doDisplay" settings;
            all keywords are also passed on to the search data provider lookup

    Returns:
        bool: always True; an unsuccessful search fails through assertTrue
    """
    # NOTE(review): the result limit is read from "maxResults" here, while a
    # sibling search helper reads "maxFpResults" - confirm the intended key.
    maxFpResults = kwargs.get("maxResults", 50)
    limitPerceptions = kwargs.get("limitPerceptions", False)
    fpTypeCuttoffList = kwargs.get("fpTypeCuttoffList", [("TREE", 0.6)])
    buildTypeList = kwargs.get("buildTypeList", ["oe-iso-smiles"])
    doDisplay = kwargs.get("doDisplay", False)
    #
    oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
    oesU = OeSearchUtils(oesmP, fpTypeList=[tup[0] for tup in fpTypeCuttoffList])
    oeioU = OeIoUtils()
    # This will reload the oe binary cache.
    oeMol = oesmP.getMol("004")
    self.assertGreaterEqual(len(list(oeMol.GetAtoms())), 12)
    # matchOpts = "graph-relaxed"
    matchOpts = "graph-strict"
    missedD = {}
    missedFpD = {}
    numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
    logger.info("Begin substructure search w/ finger print filter on %d molecules", numMols)
    # ----
    startTime = time.time()
    for ii, ccId in enumerate(list(ccIdxD.keys())[:numMols]):
        ccD = ccIdxD[ccId]
        for buildType in buildTypeList:
            if buildType not in ccD:
                logger.debug("%s missing descriptor %r", ccId, buildType)
                continue
            startTime1 = time.time()
            oeMol = oeioU.descriptorToMol(
                ccD[buildType], buildType,
                limitPerceptions=limitPerceptions,
                messageTag=ccId + ":" + buildType)
            if not oeMol:
                logger.debug("%s build failed for %s - skipping", ccId, buildType)
                continue
            maxHits = 0
            minHits = maxFpResults
            selfHit = False
            for fpType, minFpScore in fpTypeCuttoffList:
                retStatus, mL = oesU.searchSubStructureWithFingerPrint(
                    oeMol, fpType, minFpScore, maxFpResults, matchOpts=matchOpts)
                self.assertTrue(retStatus)
                logger.debug("%s fpType %r hits %d", ccId, fpType, len(mL))
                maxHits = max(maxHits, len(mL))
                minHits = min(minHits, len(mL))
                matchedSelf = self.__resultContains(ccId, mL)
                selfHit = selfHit or matchedSelf
                if not matchedSelf:
                    missedFpD.setdefault(ccId, []).append((buildType, fpType, len(mL)))
            # A descriptor is missed only when no fingerprint type found the query.
            if not selfHit:
                missedD.setdefault(ccId, []).append(buildType)
            if maxHits < 1 or not selfHit:
                logger.info(
                    "%s (%r) buildType %r min hits %d max hits %d (%.4f seconds)",
                    ccId, selfHit, buildType, minHits, maxHits, time.time() - startTime1)
        if ii % 100 == 0:
            logger.info("Completed %d of %d missed count %d", ii, numMols, len(missedD))
    #
    for ccId, missL in missedD.items():
        logger.info("%s missed list %r", ccId, missL)
        if ccId in missedFpD:
            logger.info("%s unmatched for fpTypes %r", ccId, missedFpD[ccId])
    # ----
    if doDisplay:
        # Display straight from the recorded misses. The previous code
        # iterated a list that was never filled, so nothing was displayed.
        for ccId, buildTypeL in missedD.items():
            idxD = ccIdxD[ccId]
            if "oe-iso-smiles" in idxD:
                for buildType in buildTypeL:
                    self.__displayAlignedDescriptorPair(
                        ccId, idxD["oe-iso-smiles"], "oe-iso-smiles",
                        idxD[buildType], buildType,
                        title=None, limitPerceptions=True)
    logger.info(
        "%s fingerprints search on %d in (%.4f seconds)",
        len(fpTypeCuttoffList), numMols, time.time() - startTime)
    return True
def testSssWithFingerPrintFromDescriptor(self):
    """Substructure search with a fingerprint prefilter from descriptor queries.

    Each of the first 20 indexed components is rebuilt from every available
    SMILES or InChI descriptor and searched once per configured
    (fingerprint type, cutoff) pair with "graph-relaxed" matching. Every
    search must succeed; descriptors that never return the query component
    are recorded and logged.
    """
    oemp = OeMoleculeProvider(**self.__myKwargs)
    # The molecule cache is exercised here but its status is not asserted.
    oemp.testCache()
    ccmP = ChemCompIndexProvider(**self.__myKwargs)
    ccIdxD = ccmP.getIndex()
    ok = ccmP.testCache(minCount=self.__minCount)
    self.assertTrue(ok)
    limitPerceptions = False
    maxFpResults = 50
    matchOpts = "graph-relaxed"
    numMols = 20
    buildTypeList = [
        "oe-iso-smiles", "oe-smiles", "acdlabs-smiles",
        "cactvs-iso-smiles", "cactvs-smiles", "inchi",
    ]
    oeioU = OeIoUtils()
    oesU = OeSearchUtils(oemp, fpTypeList=self.__fpTypeList)
    missedD = {}
    missedFpD = {}
    # ----
    startTime = time.time()
    for ccId, ccD in list(ccIdxD.items())[:numMols]:
        for buildType in buildTypeList:
            if buildType not in ccD:
                logger.info("%s missing descriptor %r", ccId, buildType)
                continue
            logger.debug("Search %s %r", ccId, ccD[buildType])
            # Start from no molecule so a failed InChI build is skipped
            # rather than searched with the previous descriptor's molecule.
            oeMol = None
            if buildType == "inchi":
                oemf = OeMoleculeFactory()
                oemf.setDescriptor(ccD["inchi"], "inchi", ccId)
                built = oemf.build(molBuildType="inchi", limitPerceptions=limitPerceptions)
                if not built:
                    logger.info("%s build failed with InChI %r", ccId, ccD["inchi"])
                else:
                    oeMol = oemf.getMol()
                    if oemf.getInChI() != ccD["inchi"]:
                        logger.info(
                            "%s regenerated InChI differs\n%r\n%s",
                            ccId, ccD["inchi"], oemf.getInChI())
            else:
                oeMol = oeioU.smilesToMol(ccD[buildType], limitPerceptions=limitPerceptions)
            if not oeMol:
                continue
            maxHits = 0
            minHits = maxFpResults
            selfHit = False
            for fpType, minFpScore in self.__fpTypeCuttoffList:
                retStatus, mL = oesU.searchSubStructureWithFingerPrint(
                    oeMol, fpType, minFpScore, maxFpResults, matchOpts=matchOpts)
                self.assertTrue(retStatus)
                logger.debug("%s fpType %r hits %d", ccId, fpType, len(mL))
                maxHits = max(maxHits, len(mL))
                minHits = min(minHits, len(mL))
                matchedSelf = self.__resultContains(ccId, mL)
                selfHit = selfHit or matchedSelf
                if not matchedSelf:
                    missedFpD.setdefault(ccId, []).append((buildType, fpType, len(mL)))
            # A descriptor is missed only when no fingerprint type found the query.
            if not selfHit:
                missedD.setdefault(ccId, []).append(buildType)
                logger.info(
                    "%s (%r) buildType %r min hits %d max hits %d",
                    ccId, selfHit, buildType, minHits, maxHits)
    #
    for ccId, missL in missedD.items():
        logger.info("%s missed list %r", ccId, missL)
        if ccId in missedFpD:
            logger.info("%s unmatched for fpTypes %r", ccId, missedFpD[ccId])
    # ----
    # Depiction is switched off here; when enabled it works from the
    # recorded misses (component id -> missed build types).
    doDepict = False
    if doDepict:
        for ccId, buildTypeL in missedD.items():
            idxD = ccIdxD[ccId]
            if "oe-iso-smiles" in idxD:
                for buildType in buildTypeL:
                    self.__displayAlignedDescriptorPair(
                        ccId, idxD["oe-iso-smiles"], "oe-iso-smiles",
                        idxD[buildType], buildType,
                        title=None, limitPerceptions=True)
    logger.info(
        "%s fingerprints search on %d in (%.4f seconds)",
        len(self.__fpTypeList), numMols, time.time() - startTime)
def __fingerPrintScores(self, numMols, **kwargs):
    """Fetch fingerprint scores for a list of previously failed components.

    Each identifier from "failedIdList" (truncated to numMols) is rebuilt
    from every available descriptor in the build type list and scored once
    per (fingerprint type, cutoff) pair. Descriptors that never return the
    query component are recorded, logged and optionally displayed.

    Args:
        numMols (int): upper bound derived against the index size; a falsy
            value selects the index length
        **kwargs: optional "maxResults", "limitPerceptions",
            "fpTypeCuttoffList", "buildTypeList", "doDisplay" and
            "failedIdList" settings; all keywords are also passed on to the
            search data provider lookup
    """
    resultLimit = kwargs.get("maxResults", 50)
    perceptionLimit = kwargs.get("limitPerceptions", True)
    fpCutoffL = kwargs.get("fpTypeCuttoffList", [("TREE", 0.6)])
    descriptorTypeL = kwargs.get("buildTypeList", ["oe-iso-smiles"])
    showMisses = kwargs.get("doDisplay", False)
    failedIdList = kwargs.get("failedIdList", [])

    molProvider, ccIdxD = self.__getSearchDataProviders(**kwargs)
    searcher = OeSearchUtils(molProvider, fpTypeList=[pair[0] for pair in fpCutoffL])
    ioUtil = OeIoUtils()
    # This will reload the oe binary cache.
    probeMol = molProvider.getMol("004")
    self.assertGreaterEqual(len(list(probeMol.GetAtoms())), 12)

    fpMissD = {}
    buildMissD = {}
    numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
    logger.info("Begin finger print score search on %d molecules", numMols)
    beginTime = time.time()
    for idx, ccId in enumerate(failedIdList[:numMols]):
        ccD = ccIdxD[ccId]
        for buildType in descriptorTypeL:
            if buildType not in ccD:
                logger.debug("%s missing descriptor %r", ccId, buildType)
                continue
            queryMol = ioUtil.descriptorToMol(
                ccD[buildType], buildType,
                limitPerceptions=perceptionLimit,
                messageTag=ccId + ":" + buildType)
            if not queryMol:
                logger.debug("%s build failed for %s - skipping", ccId, buildType)
                continue
            # Seed the extremes so they are defined even with no fingerprint types.
            hitCountL = []
            foundSelf = False
            caseBegin = time.time()
            for fpType, scoreCutoff in fpCutoffL:
                status, hitL = searcher.getFingerPrintScores(queryMol, fpType, scoreCutoff, resultLimit)
                self.assertTrue(status)
                logger.debug("%s fpType %r hits %d", ccId, fpType, len(hitL))
                hitCountL.append(len(hitL))
                if self.__resultContains(ccId, hitL):
                    foundSelf = True
                else:
                    fpMissD.setdefault(ccId, []).append((buildType, fpType, len(hitL)))
            mostHits = max([0] + hitCountL)
            fewestHits = min([resultLimit] + hitCountL)
            if not foundSelf:
                buildMissD.setdefault(ccId, []).append(buildType)
            if mostHits < 1 or not foundSelf:
                logger.info(
                    "%s MISSED for buildType %r min hits %d max hits %d (%.4f seconds)",
                    ccId, buildType, fewestHits, mostHits, time.time() - caseBegin)
            else:
                logger.debug(
                    "%s MATCHED for buildType %r min hits %d max hits %d (%.4f seconds)",
                    ccId, buildType, fewestHits, mostHits, time.time() - caseBegin)
        # Progress report on every hundredth identifier.
        if idx % 100 == 0:
            logger.info(
                "Completed %d of %d missed count %d in (%.4f seconds)",
                idx, len(failedIdList), len(buildMissD), time.time() - beginTime)

    for ccId, missedTypeL in buildMissD.items():
        logger.info("%s missed all fptypes: buildtype list %r", ccId, missedTypeL)
        if ccId in fpMissD:
            logger.info("%s unmatched by fpTypes %r", ccId, fpMissD[ccId])

    if showMisses:
        for ccId, missedTypeL in buildMissD.items():
            idxD = ccIdxD[ccId]
            if "oe-iso-smiles" not in idxD:
                continue
            for missedType in missedTypeL:
                self.__displayAlignedDescriptorPair(
                    ccId, idxD["oe-iso-smiles"], "oe-iso-smiles",
                    idxD[missedType], missedType,
                    title=None, limitPerceptions=True)
    logger.info(
        "%s fingerprints search on %d in (%.4f seconds)",
        len(fpCutoffL), numMols, time.time() - beginTime)