Example #1
0
 def testSubStructureSearchScreened(self):
     """Run screened substructure searches over up to the first 20 index entries.

     The "oe-smiles" descriptor of each entry is converted to a query molecule
     and searched.  Entries that do not appear in their own result list are
     collected and logged; they are not asserted.
     """
     ioUtils = OeIoUtils()
     molProvider = OeMoleculeProvider(**self.__myKwargs)
     molProvider.testCache()
     indexProvider = ChemCompIndexProvider(**self.__myKwargs)
     indexD = indexProvider.getIndex()
     self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
     searchUtils = OeSearchUtils(molProvider,
                                 screenType=self.__screenType,
                                 numProc=self.__numProc)
     sampleSize = 20
     missedIds = []
     for ccId, ccD in list(indexD.items())[:sampleSize]:
         tStart = time.time()
         # Entries without an "oe-smiles" descriptor are skipped entirely.
         if "oe-smiles" in ccD:
             querySmiles = ccD["oe-smiles"]
             logger.info("Search %s %r", ccId, querySmiles)
             queryMol = ioUtils.smartsToQmol(querySmiles)
             status, hits = searchUtils.searchSubStructureScreened(
                 queryMol, maxMatches=100)
             if status:
                 logger.info(
                     "%s (status=%r) match length %d in (%.4f seconds)",
                     ccId, status, len(hits), time.time() - tStart)
             if not self.__resultContains(ccId, hits):
                 missedIds.append(ccId)
     logger.info("Missed searches (%d) %r", len(missedIds), missedIds)
    def reloadSearchDatabase(self):
        """Reload the in-memory search databases from the OE molecule provider.
           Resource requirements: ~90sec load time 0.35 GB memory

        The fingerprint/graph search utility and the substructure search
        utility are rebuilt on the refreshed molecule provider.  Each utility
        is stored on the instance only when its cache test succeeds,
        otherwise the corresponding attribute is set to None.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            if not self.updateSearchMoleculeProvider(useCache=True):
                return False
            oesmpKwargs = self.__configD["oesmpKwargs"]
            fpTypeCuttoffD = oesmpKwargs["fpTypeCuttoffD"] if "fpTypeCuttoffD" in oesmpKwargs else {}
            fpTypeList = list(fpTypeCuttoffD)
            oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
            ok1 = oesU.testCache()
            self.__oesU = oesU if ok1 else None
            #
            oesubsU = OeSubStructSearchUtils(self.__oesmP)
            ok2 = oesubsU.testCache()
            self.__oesubsU = oesubsU if ok2 else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # Return here: ok1/ok2 may be unassigned when the failure happened
            # early, and reading them below would raise UnboundLocalError.
            return False
        return ok1 and ok2
Example #3
0
    def testSubStructureSearchWithFingerPrint(self):
        """Check fingerprint-assisted substructure searches on up to 20 index entries.

        For every configured fingerprint type the molecule for each entry is
        fetched and searched; the returned status and the presence of the
        query identifier in the results are both asserted.
        """
        molProvider = OeMoleculeProvider(**self.__myKwargs)
        #
        molProvider.testCache()
        indexProvider = ChemCompIndexProvider(**self.__myKwargs)
        indexD = indexProvider.getIndex()
        self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
        scoreFloor, resultCap, sampleSize = 0.40, 50, 20
        searchUtils = OeSearchUtils(molProvider, fpTypeList=self.__fpTypeList)
        # ----
        tStart = time.time()
        for ccId in list(indexD)[:sampleSize]:
            for fpType in self.__fpTypeList:
                queryMol = molProvider.getMol(ccId)
                status, hits = searchUtils.searchSubStructureWithFingerPrint(
                    queryMol, fpType, scoreFloor, resultCap,
                    matchOpts="graph-relaxed")
                self.assertTrue(status)
                self.assertTrue(self.__resultContains(ccId, hits))

        elapsed = time.time() - tStart
        logger.info("%s fingerprints search on %d in (%.4f seconds)",
                    len(self.__fpTypeList), sampleSize, elapsed)
Example #4
0
 def testFingerPrintSearch(self):
     """Check fingerprint searches on up to the first 50 index entries.

     The molecule "004" is fetched first and must have at least 12 atoms.
     Then, for every configured fingerprint type, each entry's molecule is
     searched and must be found in its own result list.
     """
     molProvider = OeMoleculeProvider(**self.__myKwargs)
     # This will reload the oe binary cache.
     probeMol = molProvider.getMol("004")
     self.assertGreaterEqual(len(list(probeMol.GetAtoms())), 12)
     #
     molProvider.testCache()
     indexProvider = ChemCompIndexProvider(**self.__myKwargs)
     indexD = indexProvider.getIndex()
     self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
     scoreFloor, resultCap, sampleSize = 0.50, 50, 50
     searchUtils = OeSearchUtils(molProvider, fpTypeList=self.__fpTypeList)
     # ----
     tStart = time.time()
     for ccId in list(indexD)[:sampleSize]:
         for fpType in self.__fpTypeList:
             queryMol = molProvider.getMol(ccId)
             status, hits = searchUtils.searchFingerPrints(
                 queryMol,
                 fpType=fpType,
                 minFpScore=scoreFloor,
                 maxFpResults=resultCap)
             self.assertTrue(status)
             self.assertTrue(self.__resultContains(ccId, hits))
     elapsed = time.time() - tStart
     logger.info("%s fingerprints search on %d in (%.4f seconds)",
                 len(self.__fpTypeList), sampleSize, elapsed)
    def __fingerPrintSearch(self, numMols, **kwargs):
        """Fingerprint self-search over the component index with a miss report.

        For each selected index entry and each build type present in the
        entry, a molecule is built from the descriptor and searched with every
        (fingerprint type, minimum score) pair.  Searches that do not return
        the query identifier are recorded and logged; only the search status
        is asserted.

        Args:
            numMols (int): maximum number of index entries to process; a falsy
                value processes the whole index
            **kwargs: optional "maxFpResults" (default 50), "limitPerceptions"
                (default False), "fpTypeCuttoffList" (default [("TREE", 0.6)])
                and "buildTypeList" (default ["oe-iso-smiles"]); all keyword
                arguments are also passed to __getSearchDataProviders()

        Returns:
            bool: True when the method runs to completion
        """
        maxFpResults = kwargs.get("maxFpResults", 50)
        limitPerceptions = kwargs.get("limitPerceptions", False)
        fpTypeCuttoffList = kwargs.get("fpTypeCuttoffList", [("TREE", 0.6)])
        buildTypeList = kwargs.get("buildTypeList", ["oe-iso-smiles"])
        #
        oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
        oesU = OeSearchUtils(oesmP,
                             fpTypeList=[tup[0] for tup in fpTypeCuttoffList])
        oeioU = OeIoUtils()
        # This will reload the oe binary cache.
        oeMol = oesmP.getMol("004")
        self.assertGreaterEqual(len(list(oeMol.GetAtoms())), 12)
        missedFpD = {}
        missedBuildD = {}
        numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
        logger.info("Begin finger print search on %d molecules", numMols)
        # ----
        startTime = time.time()
        for ccId, ccD in list(ccIdxD.items())[:numMols]:
            for buildType in buildTypeList:
                if buildType not in ccD:
                    continue
                oeMol = oeioU.descriptorToMol(
                    ccD[buildType],
                    buildType,
                    limitPerceptions=limitPerceptions,
                    messageTag=ccId + ":" + buildType)
                if not oeMol:
                    continue
                selfHit = False
                for fpType, minFpScore in fpTypeCuttoffList:
                    retStatus, mL = oesU.searchFingerPrints(
                        oeMol,
                        fpType=fpType,
                        minFpScore=minFpScore,
                        maxFpResults=maxFpResults)
                    self.assertTrue(retStatus)
                    #
                    matchedSelf = self.__resultContains(ccId, mL)
                    selfHit = selfHit or matchedSelf
                    if not matchedSelf:
                        missedFpD.setdefault(ccId, []).append(
                            (buildType, fpType, len(mL)))
                #
                if not selfHit:
                    missedBuildD.setdefault(ccId, []).append(buildType)
        # ------
        # Report the per-fingerprint misses inside the loop so every missed
        # identifier is covered and no stale (or unbound) loop variable from
        # the search loop above is consulted.
        for missedId, bTL in missedBuildD.items():
            logger.info("%s missed all fptypes:  buildtype list %r", missedId, bTL)
            if missedId in missedFpD:
                logger.info("%s unmatched by fpTypes %r", missedId, missedFpD[missedId])

        # ----
        logger.info("%s fingerprints search on %d in (%.4f seconds)",
                    len(fpTypeCuttoffList), numMols,
                    time.time() - startTime)
        # ----
        return True
 def __subStructureSearchScreened(self, numMols, **kwargs):
     """Screened substructure self-search for each requested screen type.

     Query molecules are built from the descriptors named in "buildTypeList"
     (default ["oe-iso-smiles"]) for each screen type in "screenTypeList"
     (default ["SMARTS"]).  Queries that do not return their own identifier
     are collected and logged per screen type.

     Args:
         numMols (int): maximum number of index entries to process; a falsy
             value processes the whole index
         **kwargs: options described above; all keyword arguments are also
             passed to __getSearchDataProviders()

     Returns:
         bool: True when the method runs to completion
     """
     buildTypes = kwargs.get("buildTypeList", ["oe-iso-smiles"])
     screenTypes = kwargs.get("screenTypeList", ["SMARTS"])
     oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
     for screenType in screenTypes:
         searchUtils = OeSearchUtils(oesmP,
                                     screenType=screenType,
                                     numProc=self.__numProc)
         ioUtils = OeIoUtils()
         #
         missed = []
         numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
         for idx, ccId in enumerate(list(ccIdxD.keys())[:numMols]):
             ccD = ccIdxD[ccId]
             for buildType in buildTypes:
                 if buildType not in ccD:
                     continue
                 tag = ccId + ":" + buildType
                 descriptor = ccD[buildType]
                 if screenType == "SMARTS":
                     # Two-step build: descriptor -> SMILES -> "SMARTS" molecule.
                     smiles = ioUtils.descriptorToSmiles(descriptor,
                                                         buildType,
                                                         messageTag=tag)
                     queryMol = ioUtils.descriptorToMol(smiles,
                                                        "SMARTS",
                                                        messageTag=tag)
                 else:
                     queryMol = ioUtils.descriptorToQMol(descriptor,
                                                         "SMARTS",
                                                         messageTag=tag)
                 if not queryMol:
                     logger.debug("%s build failed for %s - skipping",
                                  ccId, buildType)
                     continue
                 # ----
                 tStart = time.time()
                 status, hits = searchUtils.searchSubStructureScreened(
                     queryMol, maxMatches=100)
                 if status:
                     logger.debug(
                         "%s - %s - %s (status=%r) match length %d in (%.4f seconds)",
                         ccId, buildType, screenType, status, len(hits),
                         time.time() - tStart)
                 if not self.__resultContains(ccId, hits):
                     missed.append((ccId, buildType, screenType))
             # Progress report on every 100th entry (including the first).
             if idx % 100 == 0:
                 logger.info("Completed %d of %d missed count %d", idx,
                             numMols, len(missed))
         logger.info("Screen %r missed searches (%d) %r", screenType,
                     len(missed), missed)
     return True
 def __exhaustiveSubStructureSearch(self, numMols, **kwargs):
     """Exhaustive substructure search with per-component match reporting.

     Each selected index entry is rebuilt from every available build type
     and searched with matchOpts="graph-strict".  The search status and the
     presence of the query identifier in the results are asserted; any
     exception (including assertion failures) is logged and reported via
     self.fail().

     Args:
         numMols (int): upper slice bound applied to the index entries
         **kwargs: optional "limitPerceptions" (default False) and
             "buildTypeList" (default ["oe-iso-smiles"]); all keyword
             arguments are also passed to __getSearchDataProviders()

     Returns:
         bool: True when every search completes and passes
     """
     try:
         limitPerceptions = kwargs.get("limitPerceptions", False)
         buildTypes = kwargs.get("buildTypeList", ["oe-iso-smiles"])
         oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
         searchUtils = OeSearchUtils(oesmP, fpTypeList=[])
         ioUtils = OeIoUtils()
         #
         for ccId, ccD in list(ccIdxD.items())[:numMols]:
             nMatched = 0
             matchedIds = set()
             for buildType in buildTypes:
                 if buildType not in ccD:
                     continue
                 queryMol = ioUtils.descriptorToMol(
                     ccD[buildType],
                     buildType,
                     limitPerceptions=limitPerceptions,
                     messageTag=ccId + ":" + buildType)
                 if not queryMol:
                     logger.error(
                         "%s %s build query molecule build fails (skipping)",
                         ccId, buildType)
                     continue
                 # ----
                 tStart = time.time()
                 status, hits = searchUtils.searchSubStructure(
                     queryMol, matchOpts="graph-strict")
                 if not status:
                     logger.info("%s match fails for build type %s",
                                 ccId, buildType)
                 elif self.__resultContains(ccId, hits):
                     matchedIds.update(m.ccId for m in hits)
                     nMatched += 1
                 else:
                     logger.info(
                         "%s failed match length %d build type %s in (%.4f seconds)",
                         ccId, len(hits), buildType,
                         time.time() - tStart)
                 self.assertTrue(status)
                 self.assertTrue(self.__resultContains(ccId, hits))
             if nMatched:
                 logger.info("%s MATCHES %d: %r", ccId, nMatched, matchedIds)
             else:
                 logger.info("%s NO MATCHES", ccId)
             # ----
         return True
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
     return False
Example #8
0
 def testSubStructureSearch(self):
     """Check relaxed substructure searches on up to the first 10 index entries.

     Each entry's molecule is searched with matchOpts="relaxed"; the timing
     is logged, then the status and the presence of the query identifier in
     the results are asserted.
     """
     molProvider = OeMoleculeProvider(**self.__myKwargs)
     molProvider.testCache()
     indexProvider = ChemCompIndexProvider(**self.__myKwargs)
     indexD = indexProvider.getIndex()
     self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
     searchUtils = OeSearchUtils(molProvider, fpTypeList=self.__fpTypeList)
     sampleSize = 10
     for ccId in list(indexD)[:sampleSize]:
         # ----
         tStart = time.time()
         queryMol = molProvider.getMol(ccId)
         status, hits = searchUtils.searchSubStructure(queryMol,
                                                       matchOpts="relaxed")
         logger.info("%s match length %d in (%.4f seconds)", ccId,
                     len(hits), time.time() - tStart)
         self.assertTrue(status)
         self.assertTrue(self.__resultContains(ccId, hits))
Example #9
0
    def testSubStructureSearchScreenedFiltered(self):
        """Run screened substructure searches against the "filtered" resource files.

        Up to 5000 index entries are processed.  Entries having an
        "oe-smiles" descriptor are searched and those missing from their own
        result list are collected and logged; they are not asserted.
        """
        filteredKwargs = {
            "cachePath": self.__cachePath,
            "useCache": True,
            "fpTypeList": self.__fpTypeList,
            "ccFileNamePrefix": "cc-filtered",
            "oeFileNamePrefix": "oe-filtered",
            "molBuildType": "oe-iso-smiles",
            "limitPerceptions": False,
        }
        ioUtils = OeIoUtils()
        molProvider = OeMoleculeProvider(**filteredKwargs)
        molProvider.testCache()
        indexProvider = ChemCompIndexProvider(**filteredKwargs)
        indexD = indexProvider.getIndex()
        self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
        searchUtils = OeSearchUtils(molProvider,
                                    screenType=self.__screenType,
                                    numProc=self.__numProc)
        sampleSize = 5000
        missedIds = []
        for ccId, ccD in list(indexD.items())[:sampleSize]:
            # ----
            tStart = time.time()
            # Entries without an "oe-smiles" descriptor are skipped entirely.
            if "oe-smiles" in ccD:
                querySmiles = ccD["oe-smiles"]
                logger.info("Search %s %r", ccId, querySmiles)
                queryMol = ioUtils.smartsToQmol(querySmiles)
                status, hits = searchUtils.searchSubStructureScreened(
                    queryMol, maxMatches=100)
                logger.info("%s (status=%r)match length %d in (%.4f seconds)",
                            ccId, status, len(hits), time.time() - tStart)
                if not self.__resultContains(ccId, hits):
                    missedIds.append(ccId)
        logger.info("Missed searches (%d) %r", len(missedIds), missedIds)
 def __exhaustiveSubStructureSearch(self, numMols, **kwargs):
     """Exhaustive substructure search.

     Each selected index entry is rebuilt from every available build type
     and searched with matchOpts="graph-strict".  Queries missing from their
     own results are logged before the status and self-match assertions run;
     any exception (including assertion failures) is logged and reported via
     self.fail().

     Args:
         numMols (int): upper slice bound applied to the index entries
         **kwargs: optional "limitPerceptions" (default False) and
             "buildTypeList" (default ["oe-iso-smiles"]); all keyword
             arguments are also passed to __getSearchDataProviders()

     Returns:
         bool: True when every search completes and passes
     """
     try:
         limitPerceptions = kwargs.get("limitPerceptions", False)
         buildTypes = kwargs.get("buildTypeList", ["oe-iso-smiles"])
         oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
         searchUtils = OeSearchUtils(oesmP, fpTypeList=[])
         ioUtils = OeIoUtils()
         #
         for ccId, ccD in list(ccIdxD.items())[:numMols]:
             for buildType in buildTypes:
                 if buildType not in ccD:
                     continue
                 queryMol = ioUtils.descriptorToMol(
                     ccD[buildType],
                     buildType,
                     limitPerceptions=limitPerceptions,
                     messageTag=ccId + ":" + buildType)
                 if not queryMol:
                     continue
                 # ----
                 tStart = time.time()
                 status, hits = searchUtils.searchSubStructure(
                     queryMol, matchOpts="graph-strict")
                 foundSelf = self.__resultContains(ccId, hits)
                 if not foundSelf:
                     logger.info(
                         "%s match length %d build type %s in (%.4f seconds)",
                         ccId, len(hits), buildType,
                         time.time() - tStart)
                 self.assertTrue(status)
                 self.assertTrue(foundSelf)
             # ----
         return True
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
     return False
    def setUp(self):
        """Prepare paths, provider options and search utilities for each test.

        The provider options use the "cc-full"/"oe-full" file prefixes when
        OeSubStructSearchCompareTests.useFull is set; otherwise the abbreviated
        prefixes are used together with the local test-data source files.
        The cache tests of the search molecule provider, search index provider
        and both search utilities are asserted.
        """
        self.__workPath = os.path.join(HERE, "test-output")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__ccUrlTarget = os.path.join(self.__dataPath,
                                          "components-abbrev.cif")
        self.__birdUrlTarget = os.path.join(self.__dataPath,
                                            "prdcc-abbrev.cif")
        self.__doDisplay = True
        self.__numProcPrep = 6
        self.__numProcSearch = 6
        self.__minCount = None
        self.__startTime = time.time()
        #
        optionD = {}
        if OeSubStructSearchCompareTests.useFull:
            ccPrefix, oePrefix = "cc-full", "oe-full"
        else:
            ccPrefix, oePrefix = "cc-abbrev", "oe-abbrev"
            # Source file targets are supplied only for the abbreviated data set.
            optionD["ccUrlTarget"] = self.__ccUrlTarget
            optionD["birdUrlTarget"] = self.__birdUrlTarget
        optionD.update({
            "cachePath": self.__cachePath,
            "useCache": True,
            "ccFileNamePrefix": ccPrefix,
            "oeFileNamePrefix": oePrefix,
            "molBuildType": "model-xyz",
            "limitPerceptions": False,
            "screenTypeList": None,
            "numProc": self.__numProcPrep,
            "suppressHydrogens": True,
            "matchOpts": "sub-struct-graph-relaxed",
            "fpTypeCuttoffD": {
                "TREE": 0.6,
                "MACCS": 0.9
            },
            "maxFpResults": 50,
        })
        self.__myKwargs = optionD
        #
        self.__oesmP = OeSearchMoleculeProvider(**self.__myKwargs)
        self.assertTrue(self.__oesmP.testCache())
        #
        self.__ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
        self.__ccmP.testCache()
        #
        self.__ccsidxP = ChemCompSearchIndexProvider(**self.__myKwargs)
        self.assertTrue(self.__ccsidxP.testCache(minCount=self.__minCount))
        self.__oessU = OeSubStructSearchUtils(self.__oesmP)
        self.assertTrue(self.__oessU.testCache())
        #
        fpTypeList = list(self.__myKwargs.get("fpTypeCuttoffD", {}))
        self.__oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
        self.assertTrue(self.__oesU.testCache())
        #
        logger.debug("Running tests on version %s", __version__)
        stamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Starting %s at %s", self.id(), stamp)
class OeSubStructSearchCompareTests(unittest.TestCase):
    # When False, the test methods decorated with skipIf(not useFull, ...) are
    # skipped and setUp() selects the abbreviated ("cc-abbrev"/"oe-abbrev")
    # resource files instead of the full ("cc-full"/"oe-full") ones.
    useFull = False

    def setUp(self):
        """Prepare paths, provider options and search utilities for each test.

        The provider options use the "cc-full"/"oe-full" file prefixes when
        OeSubStructSearchCompareTests.useFull is set; otherwise the abbreviated
        prefixes are used together with the local test-data source files.
        The cache tests of the search molecule provider, search index provider
        and both search utilities are asserted.
        """
        self.__workPath = os.path.join(HERE, "test-output")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__ccUrlTarget = os.path.join(self.__dataPath,
                                          "components-abbrev.cif")
        self.__birdUrlTarget = os.path.join(self.__dataPath,
                                            "prdcc-abbrev.cif")
        self.__doDisplay = True
        self.__numProcPrep = 6
        self.__numProcSearch = 6
        self.__minCount = None
        self.__startTime = time.time()
        #
        optionD = {}
        if OeSubStructSearchCompareTests.useFull:
            ccPrefix, oePrefix = "cc-full", "oe-full"
        else:
            ccPrefix, oePrefix = "cc-abbrev", "oe-abbrev"
            # Source file targets are supplied only for the abbreviated data set.
            optionD["ccUrlTarget"] = self.__ccUrlTarget
            optionD["birdUrlTarget"] = self.__birdUrlTarget
        optionD.update({
            "cachePath": self.__cachePath,
            "useCache": True,
            "ccFileNamePrefix": ccPrefix,
            "oeFileNamePrefix": oePrefix,
            "molBuildType": "model-xyz",
            "limitPerceptions": False,
            "screenTypeList": None,
            "numProc": self.__numProcPrep,
            "suppressHydrogens": True,
            "matchOpts": "sub-struct-graph-relaxed",
            "fpTypeCuttoffD": {
                "TREE": 0.6,
                "MACCS": 0.9
            },
            "maxFpResults": 50,
        })
        self.__myKwargs = optionD
        #
        self.__oesmP = OeSearchMoleculeProvider(**self.__myKwargs)
        self.assertTrue(self.__oesmP.testCache())
        #
        self.__ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
        self.__ccmP.testCache()
        #
        self.__ccsidxP = ChemCompSearchIndexProvider(**self.__myKwargs)
        self.assertTrue(self.__ccsidxP.testCache(minCount=self.__minCount))
        self.__oessU = OeSubStructSearchUtils(self.__oesmP)
        self.assertTrue(self.__oessU.testCache())
        #
        fpTypeList = list(self.__myKwargs.get("fpTypeCuttoffD", {}))
        self.__oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
        self.assertTrue(self.__oesU.testCache())
        #
        logger.debug("Running tests on version %s", __version__)
        stamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Starting %s at %s", self.id(), stamp)

    def tearDown(self):
        """Log the completion time of the test and the seconds elapsed since setUp()."""
        elapsed = time.time() - self.__startTime
        stamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), stamp,
                    elapsed)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchDescriptor(self):
        """Substructure search using a SMILES descriptor ("n1ccccc1") as the query.

        Only the "sub-struct-graph-strict" match option is exercised.  The
        search status is asserted and, when display is enabled, the results
        are passed to the display helper.
        """
        query, queryId, queryType = "n1ccccc1", "query-smiles", "oe-iso-smiles"
        #
        options = self.__myKwargs
        limitPerceptions = options.get("limitPerceptions", False)
        suppressHydrogens = options.get("suppressHydrogens", True)
        numProc = options.get("numProc", 4)
        for matchOpts in ("sub-struct-graph-strict",):
            #
            queryMol = self.__getMol(query,
                                     queryType,
                                     queryId,
                                     limitPerceptions=limitPerceptions,
                                     suppressHydrogens=suppressHydrogens)
            tStart = time.time()
            status, hits = self.__search(queryMol, matchOpts, numProc)
            logger.info(
                "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                queryId, status, matchOpts, len(hits),
                time.time() - tStart)
            self.assertTrue(status)
            # The self-match check applies only to component ("CC") queries.
            if queryType == "CC":
                self.assertTrue(self.__resultContains(queryId, hits))
            #
            if self.__doDisplay:
                self.__display(hits, query, queryId, queryType, matchOpts)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchSelected(self):
        """Substructure search for the component "STI" under three match options.

        For each of the relaxed, relaxed-stereo and strict substructure
        options the search status and the presence of the query identifier in
        the results are asserted; results are displayed when enabled.
        """
        query = queryId = "STI"
        queryType = "CC"
        #
        options = self.__myKwargs
        limitPerceptions = options.get("limitPerceptions", False)
        suppressHydrogens = options.get("suppressHydrogens", True)
        numProc = options.get("numProc", 4)
        matchOptsList = (
            "sub-struct-graph-relaxed",
            "sub-struct-graph-relaxed-stereo",
            "sub-struct-graph-strict",
        )
        for matchOpts in matchOptsList:
            #
            queryMol = self.__getMol(query,
                                     queryType,
                                     queryId,
                                     limitPerceptions=limitPerceptions,
                                     suppressHydrogens=suppressHydrogens)
            tStart = time.time()
            status, hits = self.__search(queryMol, matchOpts, numProc)
            logger.info(
                "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                queryId, status, matchOpts, len(hits),
                time.time() - tStart)
            self.assertTrue(status)
            if queryType == "CC":
                self.assertTrue(self.__resultContains(queryId, hits))
            #
            if self.__doDisplay:
                self.__display(hits, query, queryId, queryType, matchOpts)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchAll(self):
        """Substructure self-search for every component in the molecule provider.

        The identifiers UNX, UNL, UNK and DUM are skipped, as are query
        molecules with fewer than 3 atoms.  For each remaining component and
        each substructure match option the search status and the presence of
        the query identifier in the results are asserted.
        """
        options = self.__myKwargs
        limitPerceptions = options.get("limitPerceptions", False)
        suppressHydrogens = options.get("suppressHydrogens", True)
        numProc = options.get("numProc", 5)
        skippedIds = ("UNX", "UNL", "UNK", "DUM")
        matchOptsList = (
            "sub-struct-graph-relaxed",
            "sub-struct-graph-relaxed-stereo",
            "sub-struct-graph-strict",
        )
        queryType = "CC"
        #
        for ccId in self.__ccmP.getMolD():
            if ccId in skippedIds:
                continue
            query = queryId = ccId
            for matchOpts in matchOptsList:
                #
                queryMol = self.__getMol(query,
                                         queryType,
                                         queryId,
                                         limitPerceptions=limitPerceptions,
                                         suppressHydrogens=suppressHydrogens)
                if queryMol.NumAtoms() < 3:
                    continue
                #
                tStart = time.time()
                status, hits = self.__search(queryMol, matchOpts, numProc)
                logger.info(
                    "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                    queryId, status, matchOpts, len(hits),
                    time.time() - tStart)
                self.assertTrue(status)
                if queryType == "CC":
                    self.assertTrue(self.__resultContains(queryId, hits))
                #
                if self.__doDisplay:
                    self.__display(hits, query, queryId, queryType, matchOpts)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testMatchSearchSelected(self):
        """Match search for the component "STI" under four match options.

        The fingerprint-similarity, relaxed, relaxed-stereo and strict graph
        options are run in turn; the search status and the presence of the
        query identifier in the results are asserted, and results are
        displayed when enabled.
        """
        query = queryId = "STI"
        queryType = "CC"
        #
        options = self.__myKwargs
        limitPerceptions = options.get("limitPerceptions", False)
        suppressHydrogens = options.get("suppressHydrogens", True)
        numProc = options.get("numProc", 4)
        matchOptsList = (
            "fingerprint-similarity",
            "graph-relaxed",
            "graph-relaxed-stereo",
            "graph-strict",
        )
        for matchOpts in matchOptsList:
            #
            queryMol = self.__getMol(query,
                                     queryType,
                                     queryId,
                                     limitPerceptions=limitPerceptions,
                                     suppressHydrogens=suppressHydrogens)
            tStart = time.time()
            status, hits = self.__search(queryMol, matchOpts, numProc)
            logger.info(
                "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                queryId, status, matchOpts, len(hits),
                time.time() - tStart)
            self.assertTrue(status)
            if queryType == "CC":
                self.assertTrue(self.__resultContains(queryId, hits))
            #
            if self.__doDisplay:
                self.__display(hits, query, queryId, queryType, matchOpts)

    def __search(self, oeMol, matchOpts, numProc):
        if matchOpts.startswith("sub-struct-"):
            retStatus, mL = self.__subStructureSearch(oeMol,
                                                      matchOpts=matchOpts,
                                                      numProc=numProc)
        else:
            retStatus, mL, fpL = self.__matchSearch(oeMol, matchOpts=matchOpts)
        #
        rL = fpL if matchOpts in ["fingerprint-similarity"] else mL
        return retStatus, rL

    #
    def __subStructureSearch(self, oeMol, matchOpts, numProc):
        ##
        ccIdL = self.__oessU.prefilterIndex(oeMol,
                                            self.__ccsidxP,
                                            matchOpts=matchOpts,
                                            skipFeatures=False)
        retStatus, mL = self.__oessU.searchSubStructure(oeMol,
                                                        ccIdList=ccIdL,
                                                        matchOpts=matchOpts,
                                                        numProc=numProc)
        return retStatus, mL

    def __matchSearch(self, oeMol, matchOpts="graph-relaxed"):
        ssL = fpL = []
        try:
            fpTypeCuttoffD = self.__myKwargs.get("fpTypeCuttoffD", {})
            maxFpResults = self.__myKwargs.get("maxFpResults", 50)
            retStatus, ssL, fpL = self.__oesU.searchSubStructureAndFingerPrint(
                oeMol,
                list(fpTypeCuttoffD.items())[:2],
                maxFpResults,
                matchOpts=matchOpts)
            # logger.info("fpL %r", fpL)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            #
        return retStatus, ssL, fpL

    def __getMol(self,
                 query,
                 queryType,
                 queryId,
                 limitPerceptions=False,
                 suppressHydrogens=True):
        """Return the query molecule titled with queryId.

        Args:
            query (str): component identifier when queryType is "CC",
                otherwise a descriptor string
            queryType (str): "CC" fetches the molecule from the search
                molecule provider; any other value is used as the descriptor
                type for building the molecule
            queryId (str): title set on the molecule (and message tag for the build)
            limitPerceptions (bool): passed to the descriptor build
            suppressHydrogens (bool): when True the molecule is passed through
                the hydrogen suppression utility

        Returns:
            the prepared molecule object
        """
        ioUtils = OeIoUtils()
        if queryType != "CC":
            oeMol = ioUtils.descriptorToMol(query,
                                            queryType,
                                            limitPerceptions=limitPerceptions,
                                            messageTag=queryId)
        else:
            oeMol = self.__oesmP.getMol(query)
        #
        if suppressHydrogens:
            oeMol = ioUtils.suppressHydrogens(oeMol)
        oeMol.SetTitle(queryId)
        return oeMol

    def __resultContains(self, ccId, matchResultList):
        """Report whether any match result carries exactly the input identifier.

        Args:
            ccId (str): identifier to look for
            matchResultList (list): match result objects exposing a ccId attribute

        Returns:
            bool: True if some result has ccId equal to the input, False otherwise
        """
        return any(hit.ccId == ccId for hit in matchResultList)

    #
    #  ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------
    def __display(self, mL, query, queryId, queryType, matchOpts):
        """Depict the query aligned with one representative match per parent component.

        Matches are grouped by the part of their identifier before "|".  For each
        group the entry whose identifier equals the parent identifier is selected;
        when there is none, the highest scoring entry of the group is used.  The
        result is written to "<queryId>-<matchOpts>.pdf" in the work path.

        Args:
            mL (list): match result objects exposing ccId and fpScore attributes
            query (str): query passed on to the alignment display
            queryId (str): query identifier (also used in the output file name)
            queryType (str): query type passed on to the alignment display
            matchOpts (str): match criteria (also used in the output file name)
        """
        # Best fingerprint score first, so every group below is ordered best-first.
        rankedL = sorted(mL, key=lambda mr: mr.fpScore, reverse=True)
        groupD = {}
        for mr in rankedL:
            parentId = mr.ccId.split("|")[0]
            if parentId not in groupD:
                groupD[parentId] = []
            groupD[parentId].append(mr)
        # One representative per parent: the exact parent entry if present,
        # otherwise the top-ranked member of the group.
        dL = [
            next((mr for mr in grpL if mr.ccId == parentId), grpL[0])
            for parentId, grpL in groupD.items()
        ]
        fileName = queryId + "-" + matchOpts + ".pdf"
        pdfImagePath = os.path.join(self.__workPath, fileName)
        self.__displayPaginatedAlignments(pdfImagePath,
                                          query,
                                          queryType,
                                          queryId,
                                          dL,
                                          matchOpts=matchOpts)

    def __displayPaginatedAlignments(self,
                                     pdfImagePath,
                                     query,
                                     queryType,
                                     queryId,
                                     matchResultList,
                                     matchOpts="relaxed-stereo",
                                     alignMode="SS"):
        """Depict the query molecule paired with each match in a multi-page PDF.

        Args:
            pdfImagePath (str): output PDF path
            query (str): query used to build the reference molecule
            queryType (str): query type used to build the reference molecule
            queryId (str): reference identifier and title
            matchResultList (list): match result objects exposing ccId and fpScore attributes
            matchOpts (str, optional): alignment criteria. Defaults to "relaxed-stereo".
            alignMode (str, optional): alignment mode passed to the depiction. Defaults to "SS".
        """
        oeMolRef = self.__getMol(query,
                                 queryType,
                                 queryId,
                                 limitPerceptions=False,
                                 suppressHydrogens=True)
        # Pairs are emitted in order of decreasing fingerprint score.
        rankedL = sorted(matchResultList,
                         key=lambda mr: mr.fpScore,
                         reverse=True)
        pairList = []
        for mr in rankedL:
            fitLabel = mr.ccId.split("|")[0]
            # Identifiers longer than 4 characters get the variant label.
            # NOTE(review): this would also label a 5-character id that has
            # no "|" suffix - confirm whether a "|" test is intended.
            if len(mr.ccId) > 4:
                fitLabel += " (tautomer/protomer)"
            pairList.append(
                (queryId, oeMolRef, fitLabel, self.__oesmP.getMol(mr.ccId)))
        #
        self.__depictFitList(pdfImagePath,
                             pairList,
                             matchOpts=matchOpts,
                             alignMode=alignMode)

    def __pairDepictPage(self,
                         imagePath,
                         refId,
                         refTitle,
                         refMol,
                         fitId,
                         fitTitle,
                         fitMol,
                         matchOpts="strict"):
        """Depict the pairwise alignment of a reference and a fit molecule on a single page.

        Args:
            imagePath (str): path to image (format by path extension)
            refId (str): reference molecule identifier
            refTitle (str): reference molecule title
            refMol (obj): reference OE molecule object
            fitId (str): fit molecule identifier
            fitTitle (str): fit molecule title
            fitMol (obj): fit OE molecule object
            matchOpts (str, optional): alignment criteria (relaxed|relaxed-stereo|strict). Defaults to "strict".

        Returns:
            (list): atom mapping in the aligned figure
                    [(reference component Id, reference atom name, fit chemical component Id, fit atom name)]
                    An empty list is returned if the depiction raises (the failure is logged).
        """
        displayOptD = {
            "imageSizeX": 2000,
            "imageSizeY": 1000,
            "labelAtomName": True,
            "labelAtomCIPStereo": True,
            "labelAtomIndex": False,
            "labelBondIndex": False,
            "highlightStyleFit": "ballAndStickInverse",
            "bondDisplayWidth": 0.5,
            "highLightMatchColorRef": "green",
            "highLightNotMatchColorRef": "pink",
        }
        aML = []
        try:
            depictor = OeDepictMCSAlignPage()
            depictor.setSearchType(sType=matchOpts)
            depictor.setRefMol(refMol, refId, title=refTitle)
            depictor.setFitMol(fitMol, fitId, title=fitTitle)
            depictor.setDisplayOptions(**displayOptD)
            aML = depictor.alignPair(imagePath=imagePath)
            # Nothing is logged when no mapping comes back.
            for rCC, rAt, tCC, tAt in aML or []:
                logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return aML

    def __depictFitList(self,
                        pdfImagePath,
                        pairList,
                        matchOpts="exact",
                        alignMode="SS"):
        """Depict a list of pairwise alignments in a multi-page PDF layout.

        Args:
            pdfImagePath (str): PDF image path
            pairList (list): [(refId, refOeMol, fitId, fitOeMol)]
            matchOpts (str, optional): alignment criteria. Defaults to "exact".
            alignMode (str, optional): "MCSS" selects the MCS alignment depiction, any
                other value selects the substructure alignment depiction. Defaults to "SS".

        Returns:
            (list): atom mapping in all aligned figures
                    [(reference component Id, reference atom name, fit chemical component Id, fit atom name)]
                    An empty list is returned if the depiction raises (the failure is logged).
        """
        displayOptD = {
            "labelAtomName": True,
            "labelAtomCIPStereo": True,
            "labelAtomIndex": False,
            "labelBondIndex": False,
            "highlightStyleFit": "ballAndStickInverse",
            "pageOrientation": "portrait",
            "gridRows": 4,
            "bondDisplayWidth": 0.5,
            "highLightMatchColorRef": "green",
            "highLightNotMatchColorRef": "pink",
        }
        aML = []
        try:
            depictor = (OeDepictMCSAlignMultiPage() if alignMode == "MCSS"
                        else OeDepictSubStructureAlignMultiPage())
            depictor.setSearchType(sType=matchOpts)
            depictor.setPairMolList(pairList)
            depictor.setDisplayOptions(**displayOptD)
            aML = depictor.alignPairListMulti(imagePath=pdfImagePath)
            # Nothing is logged when no mapping comes back.
            for rCC, rAt, tCC, tAt in aML or []:
                logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return aML
    def __sssWithFingerPrintFromDescriptor(self, numMols, **kwargs):
        """Check that fingerprint-screened substructure searches recover each component from its own descriptors.

        For each of the first numMols index entries a molecule is built from every
        requested descriptor type and searched with every fingerprint type/cutoff
        pair using "graph-strict" matching.  A component that is absent from its
        own search results is recorded, logged and optionally depicted.

        Args:
            numMols (int): maximum number of index entries to search (a falsy value searches all)
            **kwargs: also forwarded unchanged to __getSearchDataProviders(); keys read here:
                maxResults (int): maximum fingerprint results per search. Defaults to 50.
                limitPerceptions (bool): passed to descriptorToMol(). Defaults to False.
                fpTypeCuttoffList (list): [(fpType, minFpScore), ...]. Defaults to [("TREE", 0.6)].
                buildTypeList (list): descriptor types to build from. Defaults to ["oe-iso-smiles"].
                doDisplay (bool): depict the missed searches. Defaults to False.

        Returns:
            bool: True on completion
        """
        maxFpResults = kwargs.get("maxResults", 50)
        limitPerceptions = kwargs.get("limitPerceptions", False)
        fpTypeCuttoffList = kwargs.get("fpTypeCuttoffList", [("TREE", 0.6)])
        buildTypeList = kwargs.get("buildTypeList", ["oe-iso-smiles"])
        doDisplay = kwargs.get("doDisplay", False)
        #
        oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
        oesU = OeSearchUtils(oesmP,
                             fpTypeList=[tup[0] for tup in fpTypeCuttoffList])
        oeioU = OeIoUtils()
        # This will reload the oe binary cache.
        oeMol = oesmP.getMol("004")
        self.assertGreaterEqual(len(list(oeMol.GetAtoms())), 12)

        # matchOpts = "graph-relaxed"
        matchOpts = "graph-strict"
        # ccId -> build types for which no fingerprint type found the component itself
        missedD = {}
        # ccId -> [(buildType, fpType, hit count)] for each individual search that missed
        missedFpD = {}
        numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
        logger.info(
            "Begin substructure search w/ finger print filter on %d molecules",
            numMols)
        # ----
        startTime = time.time()
        for ii, ccId in enumerate(list(ccIdxD)[:numMols]):
            ccD = ccIdxD[ccId]
            for buildType in buildTypeList:
                if buildType not in ccD:
                    logger.debug("%s missing descriptor %r", ccId, buildType)
                    continue
                startTime1 = time.time()
                oeMol = oeioU.descriptorToMol(
                    ccD[buildType],
                    buildType,
                    limitPerceptions=limitPerceptions,
                    messageTag=ccId + ":" + buildType)
                if not oeMol:
                    logger.debug("%s build failed for %s - skipping", ccId,
                                 buildType)
                    continue
                maxHits = 0
                minHits = maxFpResults
                selfHit = False
                for fpType, minFpScore in fpTypeCuttoffList:
                    retStatus, mL = oesU.searchSubStructureWithFingerPrint(
                        oeMol,
                        fpType,
                        minFpScore,
                        maxFpResults,
                        matchOpts=matchOpts)
                    self.assertTrue(retStatus)
                    logger.debug("%s fpType %r hits %d", ccId, fpType,
                                 len(mL))
                    maxHits = max(maxHits, len(mL))
                    minHits = min(minHits, len(mL))
                    matchedSelf = self.__resultContains(ccId, mL)
                    selfHit = selfHit or matchedSelf
                    if not matchedSelf:
                        missedFpD.setdefault(ccId, []).append(
                            (buildType, fpType, len(mL)))
                if not selfHit:
                    missedD.setdefault(ccId, []).append(buildType)

                if maxHits < 1 or not selfHit:
                    logger.info(
                        "%s (%r) buildType %r min hits %d max hits %d (%.4f seconds)",
                        ccId, selfHit, buildType, minHits, maxHits,
                        time.time() - startTime1)
            if ii % 100 == 0:
                logger.info("Completed %d of %d missed count %d", ii, numMols,
                            len(missedD))
        #
        for ccId, missL in missedD.items():
            logger.info("%s missed list %r", ccId, missL)
            if ccId in missedFpD:
                logger.info("%s unmatched for fpTypes %r", ccId,
                            missedFpD[ccId])
        # ----
        if doDisplay:
            # Depict straight from missedD: it is the only record of missed
            # (ccId, buildType) pairs, and every recorded build type is known
            # to be present in the index entry.
            for ccId, buildTypeL in missedD.items():
                idxD = ccIdxD[ccId]
                if "oe-iso-smiles" not in idxD:
                    continue
                for buildType in buildTypeL:
                    self.__displayAlignedDescriptorPair(
                        ccId,
                        idxD["oe-iso-smiles"],
                        "oe-iso-smiles",
                        idxD[buildType],
                        buildType,
                        title=None,
                        limitPerceptions=True)

        logger.info("%s fingerprints search on %d in (%.4f seconds)",
                    len(fpTypeCuttoffList), numMols,
                    time.time() - startTime)
        return True
Example #14
0
    def testSssWithFingerPrintFromDescriptor(self):
        """Test fingerprint-screened substructure searches using molecules built from index descriptors.

        For the first 20 index entries a molecule is built from each available
        SMILES/InChI descriptor and searched with every configured fingerprint
        type/cutoff pair using "graph-relaxed" matching.  Each search must return
        a true status; components absent from their own results are logged.
        """
        oemp = OeMoleculeProvider(**self.__myKwargs)
        # NOTE(review): this result is overwritten below without being asserted -
        # confirm whether the molecule provider cache check should also be asserted.
        ok = oemp.testCache()
        ccmP = ChemCompIndexProvider(**self.__myKwargs)
        ccIdxD = ccmP.getIndex()
        ok = ccmP.testCache(minCount=self.__minCount)
        self.assertTrue(ok)
        limitPerceptions = False
        # minFpScore = 0.5
        maxFpResults = 50
        matchOpts = "graph-relaxed"
        numMols = 20
        oeioU = OeIoUtils()
        oesU = OeSearchUtils(oemp, fpTypeList=self.__fpTypeList)
        # ccId -> build types for which no fingerprint type found the component itself
        missedD = {}
        # ccId -> [(buildType, fpType, hit count)] for each individual search that missed
        missedFpD = {}
        buildTypeList = [
            "oe-iso-smiles", "oe-smiles", "acdlabs-smiles",
            "cactvs-iso-smiles", "cactvs-smiles", "inchi"
        ]
        # ----
        startTime = time.time()
        for ccId, ccD in list(ccIdxD.items())[:numMols]:
            for buildType in buildTypeList:
                if buildType not in ccD:
                    logger.info("%s missing descriptor %r", ccId, buildType)
                    continue
                logger.debug("Search %s %r", ccId, ccD[buildType])
                # Reset for every build so a failed build can never fall through
                # to a search with the molecule left over from a previous pass.
                oeMol = None
                if buildType in ["inchi"]:
                    oemf = OeMoleculeFactory()
                    oemf.setDescriptor(ccD["inchi"], "inchi", ccId)
                    ok = oemf.build(molBuildType="inchi",
                                    limitPerceptions=limitPerceptions)
                    if not ok:
                        logger.info("%s build failed with InChI %r", ccId,
                                    ccD["inchi"])
                    else:
                        oeMol = oemf.getMol()
                        if oemf.getInChI() != ccD["inchi"]:
                            logger.info(
                                "%s regenerated InChI differs\n%r\n%s",
                                ccId, ccD["inchi"], oemf.getInChI())
                else:
                    oeMol = oeioU.smilesToMol(
                        ccD[buildType], limitPerceptions=limitPerceptions)
                if not oeMol:
                    continue
                maxHits = 0
                minHits = maxFpResults
                selfHit = False
                for fpType, minFpScore in self.__fpTypeCuttoffList:
                    retStatus, mL = oesU.searchSubStructureWithFingerPrint(
                        oeMol,
                        fpType,
                        minFpScore,
                        maxFpResults,
                        matchOpts=matchOpts)
                    self.assertTrue(retStatus)
                    logger.debug("%s fpType %r hits %d", ccId, fpType,
                                 len(mL))
                    maxHits = max(maxHits, len(mL))
                    minHits = min(minHits, len(mL))
                    matchedSelf = self.__resultContains(ccId, mL)
                    selfHit = selfHit or matchedSelf
                    if not matchedSelf:
                        missedFpD.setdefault(ccId, []).append(
                            (buildType, fpType, len(mL)))
                if not selfHit:
                    missedD.setdefault(ccId, []).append(buildType)

                logger.info("%s (%r) buildType %r min hits %d max hits %d",
                            ccId, selfHit, buildType, minHits, maxHits)
        #
        for ccId, missL in missedD.items():
            logger.info("%s missed list %r", ccId, missL)
            if ccId in missedFpD:
                logger.info("%s unmatched for fpTypes %r", ccId,
                            missedFpD[ccId])
        # ----
        # Depiction is switched off by default; set to True to depict the misses.
        doDepict = False
        if doDepict:
            # missedD is the only record of missed (ccId, buildType) pairs, and
            # every recorded build type is known to be present in the index entry.
            for ccId, buildTypeL in missedD.items():
                idxD = ccIdxD[ccId]
                if "oe-iso-smiles" not in idxD:
                    continue
                for buildType in buildTypeL:
                    self.__displayAlignedDescriptorPair(
                        ccId,
                        idxD["oe-iso-smiles"],
                        "oe-iso-smiles",
                        idxD[buildType],
                        buildType,
                        title=None,
                        limitPerceptions=True)

        logger.info("%s fingerprints search on %d in (%.4f seconds)",
                    len(self.__fpTypeList), numMols,
                    time.time() - startTime)
    def __fingerPrintScores(self, numMols, **kwargs):
        """Check that fingerprint score searches recover each listed component from its own descriptors.

        For each of the first numMols identifiers in failedIdList a molecule is
        built from every requested descriptor type and scored with every
        fingerprint type/cutoff pair.  A component that is absent from its own
        results is recorded, logged and optionally depicted.

        Args:
            numMols (int): maximum number of identifiers to process (a falsy value
                uses the size of the component index)
            **kwargs: also forwarded unchanged to __getSearchDataProviders(); keys read here:
                maxResults (int): maximum fingerprint results per search. Defaults to 50.
                limitPerceptions (bool): passed to descriptorToMol(). Defaults to True.
                fpTypeCuttoffList (list): [(fpType, minFpScore), ...]. Defaults to [("TREE", 0.6)].
                buildTypeList (list): descriptor types to build from. Defaults to ["oe-iso-smiles"].
                doDisplay (bool): depict the missed searches. Defaults to False.
                failedIdList (list): component identifiers to process. Defaults to [].
        """
        maxFpResults = kwargs.get("maxResults", 50)
        limitPerceptions = kwargs.get("limitPerceptions", True)
        fpTypeCuttoffList = kwargs.get("fpTypeCuttoffList", [("TREE", 0.6)])
        buildTypeList = kwargs.get("buildTypeList", ["oe-iso-smiles"])
        doDisplay = kwargs.get("doDisplay", False)
        failedIdList = kwargs.get("failedIdList", [])
        #
        oesmP, ccIdxD = self.__getSearchDataProviders(**kwargs)
        oesU = OeSearchUtils(oesmP,
                             fpTypeList=[tup[0] for tup in fpTypeCuttoffList])
        oeioU = OeIoUtils()
        # This will reload the oe binary cache.
        oeMol = oesmP.getMol("004")
        self.assertGreaterEqual(len(list(oeMol.GetAtoms())), 12)
        #
        # ccId -> [(buildType, fpType, hit count)] for each individual search that missed
        missedFpD = {}
        # ccId -> build types for which no fingerprint type found the component itself
        missedBuildD = {}
        numMols = min(len(ccIdxD), numMols) if numMols else len(ccIdxD)
        logger.info("Begin finger print score search on %d molecules", numMols)
        # ----
        startTime = time.time()
        # for ccId, ccD in list(ccIdxD.items())[:numMols]:
        for ii, ccId in enumerate(failedIdList[:numMols]):
            ccD = ccIdxD[ccId]
            for buildType in buildTypeList:
                if buildType not in ccD:
                    logger.debug("%s missing descriptor %r", ccId, buildType)
                    continue
                oeMol = oeioU.descriptorToMol(
                    ccD[buildType],
                    buildType,
                    limitPerceptions=limitPerceptions,
                    messageTag=ccId + ":" + buildType)
                if not oeMol:
                    logger.debug("%s build failed for %s - skipping", ccId,
                                 buildType)
                    continue
                maxHits = 0
                minHits = maxFpResults
                selfHit = False
                #
                startTime1 = time.time()
                for fpType, minFpScore in fpTypeCuttoffList:
                    retStatus, mL = oesU.getFingerPrintScores(
                        oeMol, fpType, minFpScore, maxFpResults)
                    self.assertTrue(retStatus)
                    logger.debug("%s fpType %r hits %d", ccId, fpType,
                                 len(mL))
                    maxHits = max(maxHits, len(mL))
                    minHits = min(minHits, len(mL))
                    matchedSelf = self.__resultContains(ccId, mL)
                    selfHit = selfHit or matchedSelf
                    if not matchedSelf:
                        missedFpD.setdefault(ccId, []).append(
                            (buildType, fpType, len(mL)))
                #
                if not selfHit:
                    missedBuildD.setdefault(ccId, []).append(buildType)
                #
                if maxHits < 1 or not selfHit:
                    logger.info(
                        "%s MISSED for buildType %r min hits %d max hits %d (%.4f seconds)",
                        ccId, buildType, minHits, maxHits,
                        time.time() - startTime1)
                else:
                    logger.debug(
                        "%s MATCHED for buildType %r min hits %d max hits %d (%.4f seconds)",
                        ccId, buildType, minHits, maxHits,
                        time.time() - startTime1)
            if ii % 100 == 0:
                logger.info(
                    "Completed %d of %d missed count %d in (%.4f seconds)", ii,
                    len(failedIdList), len(missedBuildD),
                    time.time() - startTime)

        # ------
        for ccId, bTL in missedBuildD.items():
            logger.info("%s missed all fptypes:  buildtype list %r", ccId, bTL)
            # Report the per-fingerprint misses for every missed component.
            # This check must stay inside the loop: outside it, ccId is only
            # the last value bound, or unbound when nothing was processed.
            if ccId in missedFpD:
                logger.info("%s unmatched by fpTypes %r", ccId,
                            missedFpD[ccId])

        #
        if doDisplay:
            for ccId, bTL in missedBuildD.items():
                idxD = ccIdxD[ccId]
                if "oe-iso-smiles" in idxD:
                    for bT in bTL:
                        self.__displayAlignedDescriptorPair(
                            ccId,
                            idxD["oe-iso-smiles"],
                            "oe-iso-smiles",
                            idxD[bT],
                            bT,
                            title=None,
                            limitPerceptions=True)

        logger.info("%s fingerprints search on %d in (%.4f seconds)",
                    len(fpTypeCuttoffList), numMols,
                    time.time() - startTime)