Ejemplo n.º 1
0
    def __init__(self, cachePath, prefix=None, **kwargs):
        """Set up the chemical component, search index and OE molecule providers.

        Args:
            cachePath (str): cache directory passed to each provider
            prefix (str, optional): when set, the cache file name prefixes become
                "cc-<prefix>" and "oe-<prefix>" instead of "cc" and "oe"
            timeOut (optional): stored on the instance (default: None)
            ccUrlTarget (str, optional): forwarded to the molecule providers (default: None)
            birdUrlTarget (str, optional): forwarded to the molecule providers (default: None)
            molLimit, quietFlag, fpTypeList, screenTypeList, limitPerceptions, numProc:
                options forwarded to OeSearchMoleculeProvider
        """
        self.__cachePath = cachePath
        self.__prefix = prefix
        startTime = time.time()
        self.__timeOut = kwargs.get("timeOut", None)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        if self.__prefix:
            ccFileNamePrefix = "cc-%s" % self.__prefix
            oeFileNamePrefix = "oe-%s" % self.__prefix
        else:
            ccFileNamePrefix = "cc"
            oeFileNamePrefix = "oe"
        # Keyword sets shared by more than one provider.
        cacheArgs = {
            "cachePath": cachePath,
            "useCache": True,
            "ccFileNamePrefix": ccFileNamePrefix,
        }
        urlArgs = {
            "ccUrlTarget": self.__ccUrlTarget,
            "birdUrlTarget": self.__birdUrlTarget,
        }
        #
        self.__startTime = time.time()
        self.__ccmP = ChemCompMoleculeProvider(**dict(cacheArgs, **urlArgs))
        molStatus = self.__ccmP.testCache()

        self.__ccSIdxP = ChemCompSearchIndexProvider(**cacheArgs)
        idxStatus = self.__ccSIdxP.testCache()

        oeArgs = dict(cacheArgs, **urlArgs)
        oeArgs.update(
            oeFileNamePrefix=oeFileNamePrefix,
            quietFlag=kwargs.get("quietFlag", True),
            fpTypeList=kwargs.get("fpTypeList", []),
            screenTypeList=kwargs.get("screenTypeList", []),
            numProc=kwargs.get("numProc", 4),
            molLimit=kwargs.get("molLimit", None),
            limitPerceptions=kwargs.get("limitPerceptions", False),
        )
        self.__oesmP = OeSearchMoleculeProvider(**oeArgs)
        oeStatus = self.__oesmP.testCache()
        self.__oesmP.getOeMolD()

        elapsed = time.time() - startTime
        logger.info(
            "Completed chemical component search index load %r (%.4f seconds)",
            molStatus & idxStatus & oeStatus,
            elapsed)
        logUsage("main", "Setup completed", self.__startTime)
        #
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Starting model build (%s) at %s", __version__, timeStamp)
    def __reload(self, **kwargs):
        """Load the chemical component index from its cache file or rebuild it.

        When useCache is set and the index file exists, the file is imported
        (JSON for a ".json" extension, pickle otherwise) and, for a non-zero
        molLimit, reduced to the first molLimit keys in sorted order.  Otherwise
        the index is rebuilt from a ChemCompMoleculeProvider, provided that
        provider's cache test succeeds.

        Args:
            useCache (bool, optional): reuse an existing index file. Defaults to True.
            molLimit (int, optional): maximum number of index entries (default: 0 no limit)
            molBuildType (str, optional): passed to the index builder (default: "model-xyz")
            **kwargs: all keywords other than cachePath, useCache and molLimit are
                also forwarded to ChemCompMoleculeProvider

        Returns:
            (dict): index dictionary in which every entry carries "atom-types" and
                "feature-types" sets (empty dictionary if nothing could be built)
        """
        logger.debug("kwargs %r", kwargs.items())
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)
        indexFilePath = self.getIndexFilePath()
        indexD = {}
        #
        if useCache and self.__mU.exists(indexFilePath):
            extension = os.path.splitext(indexFilePath)[1]
            importFormat = "json" if extension == ".json" else "pickle"
            storedD = self.__mU.doImport(indexFilePath, fmt=importFormat)
            if molLimit:
                keptKeys = sorted(storedD.keys())[:molLimit]
                indexD = {key: storedD[key] for key in keptKeys}
            else:
                indexD = storedD
        else:
            reservedKeys = ["cachePath", "useCache", "molLimit"]
            passThroughD = {}
            for key, value in kwargs.items():
                if key not in reservedKeys:
                    passThroughD[key] = value
            ccmP = ChemCompMoleculeProvider(cachePath=self.__cachePath,
                                            useCache=useCache,
                                            molLimit=molLimit,
                                            **passThroughD)
            if ccmP.testCache(minCount=molLimit, logSizes=True):
                indexD = self.__updateChemCompIndex(
                    ccmP.getMolD(),
                    indexFilePath,
                    molBuildType=passThroughD.get("molBuildType", "model-xyz"))
        # Derive the type sets from the stored count dictionaries.
        derivedKeys = (("atom-types", "type-counts"),
                       ("feature-types", "feature-counts"))
        for entryD in indexD.values():
            for setKey, countKey in derivedKeys:
                if countKey in entryD:
                    entryD[setKey] = set(entryD[countKey].keys())
                else:
                    entryD[setKey] = set()
        #
        return indexD
Ejemplo n.º 3
0
    def __reload(self, **kwargs):
        """Load the chemical component search index from its cache file or rebuild it.

        When useCache is set and the index file exists, the file is imported
        (JSON for a ".json" extension, pickle otherwise) and, for a non-zero
        molLimit, reduced to the first molLimit keys in sorted order.  Otherwise
        the index is rebuilt from the chemical component definitions and, when
        useChemAxon is set, the ChemAxon descriptor index.

        Args:
            useCache (bool, optional): reuse an existing index file. Defaults to True.
            useChemAxon (bool, optional): include ChemAxon descriptors when rebuilding. Defaults to True.
            molLimit (int, optional): maximum number of entries (default: 0 no limit)
            numProc (int, optional): passed to the index builder (default: 1)
            maxChunkSize (int, optional): passed to the index builder (default: 20)
            limitPerceptions (bool, optional): passed to the index builder (default: True)
            quietFlag (bool, optional): passed to the index builder (default: True)
            skipObsolete (bool, optional): passed to ChemCompMoleculeProvider (default: True)
            **kwargs: all keywords other than cachePath, useCache and molLimit are
                also forwarded to the providers

        Returns:
            (dict): search index dictionary in which every entry carries an "atom-types" set
                (empty dictionary if nothing could be built)
        """
        #
        searchIdxD = {}
        useChemAxon = kwargs.get("useChemAxon", True)
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)
        numProc = kwargs.get("numProc", 1)
        maxChunkSize = kwargs.get("maxChunkSize", 20)
        limitPerceptions = kwargs.get("limitPerceptions", True)
        quietFlag = kwargs.get("quietFlag", True)
        skipObsolete = kwargs.get("skipObsolete", True)
        searchIdxFilePath = self.getIndexFilePath()
        #
        if useCache and self.__mU.exists(searchIdxFilePath):
            _, fExt = os.path.splitext(searchIdxFilePath)
            searchIdxFormat = "json" if fExt == ".json" else "pickle"
            rdCcIdxD = self.__mU.doImport(searchIdxFilePath, fmt=searchIdxFormat)
            searchIdxD = {k: rdCcIdxD[k] for k in sorted(rdCcIdxD.keys())[:molLimit]} if molLimit else rdCcIdxD
        else:
            cmpKwargs = {k: v for k, v in kwargs.items() if k not in ["cachePath", "useCache", "molLimit"]}
            # skipObsolete is always handed to the molecule provider.  Merging it into the
            # keyword dictionary (instead of passing it next to **cmpKwargs) avoids a
            # "multiple values for keyword argument" TypeError when the caller supplied it.
            ccKwargs = dict(cmpKwargs, skipObsolete=skipObsolete)
            ccmP = ChemCompMoleculeProvider(cachePath=self.__cachePath, useCache=True, molLimit=molLimit, **ccKwargs)
            ok1 = ccmP.testCache(minCount=molLimit, logSizes=True)
            #
            descrD = {}
            ok2 = True
            if useChemAxon:
                caxP = ChemAxonDescriptorProvider(cachePath=self.__cachePath, useCache=True, **cmpKwargs)
                ok2 = caxP.testCache(minCount=molLimit)
                descrD = caxP.getDescriptorIndex()
            #
            ok = ok1 and ok2
            if ok:
                searchIdxD = self.__updateChemCompSearchIndex(ccmP.getMolD(), descrD, searchIdxFilePath, molLimit, limitPerceptions, numProc, maxChunkSize, quietFlag)
                logger.info("Storing %s with data for %d search candidates (status=%r) ", searchIdxFilePath, len(searchIdxD), ok)
        #
        for idxD in searchIdxD.values():
            idxD["atom-types"] = set(idxD["type-counts"].keys()) if "type-counts" in idxD else set()

        return searchIdxD
Ejemplo n.º 4
0
 def __getChemCompDefs(self, molLimit=None):
     """Return the chemical component definitions from the "cc-abbrev" cache.

     Args:
         molLimit (int, optional): passed to ChemCompMoleculeProvider and used as
             the minimum count for its cache test. Defaults to None.

     Returns:
         (dict): result of ChemCompMoleculeProvider.getMolD(), or an empty
             dictionary when a non-assertion error was logged

     Raises:
         AssertionError (self.failureException): when the provider cache test fails
     """
     ccMolD = {}
     try:
         ccmP = ChemCompMoleculeProvider(
             ccUrlTarget=self.__ccUrlTarget,
             birdUrlTarget=self.__birdUrlTarget,
             cachePath=self.__cachePath,
             useCache=True,
             ccFileNamePrefix="cc-abbrev",
             molLimit=molLimit,
         )
         ok = ccmP.testCache(minCount=molLimit)
         self.assertTrue(ok)
         ccMolD = ccmP.getMolD()
     except self.failureException:
         # The failed assertion must reach the test runner; the generic handler
         # below would otherwise log it and let the test pass on empty data.
         raise
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return ccMolD
Ejemplo n.º 5
0
 def __getChemCompDefs(self):
     """Return the chemical component definitions and the component index.

     Both providers are constructed from the keyword options held in
     self.__myKwargs.

     Returns:
         (dict, dict): ChemCompMoleculeProvider.getMolD() and
             ChemCompIndexProvider.getIndex(); a value stays an empty dictionary
             when a non-assertion error was logged before it was read

     Raises:
         AssertionError (self.failureException): when the index provider cache
             test (minCount=10) fails
     """
     ccMolD = {}
     ccIdxD = {}
     try:
         ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
         # The status was never checked here; the call is kept as in the original flow.
         ccmP.testCache()
         ccMolD = ccmP.getMolD()
         ccIdxP = ChemCompIndexProvider(**self.__myKwargs)
         ccIdxD = ccIdxP.getIndex()
         ok = ccIdxP.testCache(minCount=10)
         self.assertTrue(ok)
     except self.failureException:
         # The failed assertion must reach the test runner; the generic handler
         # below would otherwise log it and let the test pass.
         raise
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return ccMolD, ccIdxD
 def __testBuildMoleculeCacheFiles(self, **kwargs):
     """Build the chemical component cache files and assert the cache test passes.

     When both ccUrlTarget and birdUrlTarget are given they are passed to the
     provider together with skipObsolete, and molLimit is the minimum count
     for the cache test; otherwise minCount is used.  Any exception (including
     a failed assertion) is logged and reported through self.fail().
     """
     try:
         ccUrlTarget = kwargs.get("ccUrlTarget", None)
         birdUrlTarget = kwargs.get("birdUrlTarget", None)
         molLimit = kwargs.get("molLimit", None)
         minCount = kwargs.get("minCount", None)
         logSizes = kwargs.get("logSizes", False)
         skipObsolete = kwargs.get("skipObsolete", True)
         # Options common to both ways of constructing the provider.
         providerArgs = {
             "cachePath": self.__cachePath,
             "useCache": kwargs.get("useCache", False),
             "ccFileNamePrefix": kwargs.get("ccFileNamePrefix", "cc"),
             "molLimit": molLimit,
             "filterIdD": kwargs.get("filterIdD", None),
         }
         #
         if ccUrlTarget and birdUrlTarget:
             providerArgs["ccUrlTarget"] = ccUrlTarget
             providerArgs["birdUrlTarget"] = birdUrlTarget
             providerArgs["skipObsolete"] = skipObsolete
             requiredCount = molLimit
         else:
             requiredCount = minCount
         ccmP = ChemCompMoleculeProvider(**providerArgs)
         self.assertTrue(
             ccmP.testCache(minCount=requiredCount, logSizes=logSizes))
     except Exception as e:
         logger.info("Failing with %s", str(e))
         self.fail()
Ejemplo n.º 7
0
    def __reload(self, **kwargs):
        """Create, as needed, the OE molecule data files for chemical component definitions.

        Each file is (re)built when useCache is False or when the file does not exist.

        Args:
            molBuildType (str):  coordinates to use in building OE molecules from CIF components (default: "model-xyz")
            limitPerceptions(bool): process input descriptors in essentially verbatim mode (default: False)
            fpTypeList (list): fingerprint type (default: TREE,PATH,MACCS,CIRCULAR,LINGO)
            screenTypeList (list): fast sub search screen type (MOLECULE, SMARTS, MDL, ... ); None is treated as an empty list
            useCache (bool, optional): flag to use cached files. Defaults to True.
            cachePath (str): path to the top cache directory. Defaults to '.'.
            numProc (int): number processors to engage in screen substructure search database generation (default: 2)
            molLimit (int, optional): limiting number of molecules in data store (default: 0 no limit)
            suppressHydrogens (bool, optional): flag to suppress explicit hydrogens in the OE data store (default: False)
            quietFlag (bool, optional): passed to OeIoUtils and to the molecule cache builder (default: True)
            logSizes (bool, optional): passed to ChemCompMoleculeProvider.testCache() (default: False)

        Returns:
            (int): number of OE molecules stored by this call (0 when the binary molecule file was not rebuilt)

        """
        useCache = kwargs.get("useCache", True)
        cachePath = kwargs.get("cachePath", ".")
        numProc = kwargs.get("numProc", 2)
        molLimit = kwargs.get("molLimit", 0)
        fpTypeList = kwargs.get("fpTypeList",
                                ["TREE", "PATH", "MACCS", "CIRCULAR", "LINGO"])
        screenTypeList = kwargs.get("screenTypeList", [])
        molBuildType = kwargs.get("molBuildType", "model-xyz")
        limitPerceptions = kwargs.get("limitPerceptions", False)
        quietFlag = kwargs.get("quietFlag", True)
        suppressHydrogens = kwargs.get("suppressHydrogens", False)
        logSizes = kwargs.get("logSizes", False)
        # Only the "FAST" setting enables the fingerprint database step below.
        fpDbType = "STANDARD"
        #
        oeCount = 0
        oeIo = OeIoUtils(quietFlag=quietFlag)
        # -------- binary OE molecule file
        oeMolFilePath = os.path.join(self.__dirPath, self.__getOeMolFileName())
        if not (useCache and self.__mU.exists(oeMolFilePath)):
            cmpKwargs = {
                k: v
                for k, v in kwargs.items()
                if k not in ["cachePath", "useCache", "molLimit"]
            }
            ccmP = ChemCompMoleculeProvider(cachePath=cachePath,
                                            useCache=True,
                                            molLimit=molLimit,
                                            **cmpKwargs)
            ok = ccmP.testCache(minCount=molLimit, logSizes=logSizes)
            ccObjD = ccmP.getMolD() if ok else {}
            ccCount = len(ccObjD)
            # -------
            startTime = time.time()
            oeCount, errCount, failIdList = oeIo.buildOeBinaryMolCache(
                oeMolFilePath,
                ccObjD,
                molBuildType=molBuildType,
                quietFlag=quietFlag,
                fpTypeList=fpTypeList,
                limitPerceptions=limitPerceptions,
                suppressHydrogens=suppressHydrogens)
            logger.info(
                "Stored %d/%d OeMols (suppressH = %r) created with molBuildType %r (unconverted %d)",
                oeCount, ccCount, suppressHydrogens, molBuildType, errCount)
            if failIdList:
                logger.info("%r failures %r", molBuildType, failIdList)
            endTime = time.time()
            logger.info("Constructed %d/%d cached oeMols (%.4f seconds)",
                        oeCount, ccCount, endTime - startTime)
        # -------- indexed OE molecule database
        oeMolDbFilePath = os.path.join(self.__dirPath,
                                       self.__getOeMolDbFileName())
        if not (useCache and self.__mU.exists(oeMolDbFilePath)):
            startTime = time.time()
            molCount = oeIo.createOeBinaryDatabaseAndIndex(
                oeMolFilePath, oeMolDbFilePath)
            endTime = time.time()
            logger.info(
                "Created and stored %d indexed OeMols in OE database format (%.4f seconds)",
                molCount, endTime - startTime)

        # -------- fast fingerprint databases
        if fpDbType == "FAST":
            # "or []" tolerates an explicit fpTypeList=None from the caller
            for fpType in fpTypeList or []:
                startTime = time.time()
                #  Fast FP search database file names
                fpPath = os.path.join(self.__dirPath,
                                      self.__getFastFpDbFileName(fpType))
                if not (useCache and self.__mU.exists(fpPath)):
                    oeIo.createOeFingerPrintDatabase(oeMolDbFilePath,
                                                     fpPath,
                                                     fpType=fpType)
                    endTime = time.time()
                    logger.info(
                        "Created and stored %s fingerprint database (%.4f seconds)",
                        fpType, endTime - startTime)
        # -------- screened substructure search databases
        if molBuildType in ["oe-iso-smiles"]:
            # "or []" tolerates an explicit screenTypeList=None from the caller
            for screenType in screenTypeList or []:
                startTime = time.time()
                fp = os.path.join(self.__dirPath,
                                  self.__getSubSearchFileName(screenType))
                if not (useCache and self.__mU.exists(fp)):
                    ok = oeIo.createOeSubSearchDatabase(oeMolFilePath,
                                                        fp,
                                                        screenType=screenType,
                                                        numProc=numProc)
                    endTime = time.time()
                    logger.info(
                        "Constructed screened substructure database (status %r) with screenType %s (%.4f seconds)",
                        ok, screenType, endTime - startTime)
                    # ---------
                    ssDb = oeIo.loadOeSubSearchDatabase(fp,
                                                        screenType=screenType,
                                                        numProc=numProc)
                    # oeCount is 0 when the molecule file above was reused, so this
                    # comparison is reported for diagnosis only.
                    ok = ssDb.NumMolecules() == oeCount
                    logger.debug(
                        "Substructure database count equals stored molecule count %r",
                        ok)
                    # ----------
        return oeCount
    def setUp(self):
        """Prepare paths, provider options and the search utilities used by the tests.

        The provider options differ between the full and the abbreviated data set
        only in the cache file name prefixes and in whether the abbreviated
        dictionary URLs are supplied.
        """
        self.__workPath = os.path.join(HERE, "test-output")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__ccUrlTarget = os.path.join(self.__dataPath, "components-abbrev.cif")
        self.__birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif")
        self.__doDisplay = True
        self.__numProcPrep = 6
        self.__numProcSearch = 6
        self.__minCount = None
        self.__startTime = time.time()
        #
        if OeSubStructSearchCompareTests.useFull:
            optionD = {}
            ccPrefix = "cc-full"
            oePrefix = "oe-full"
        else:
            optionD = {
                "ccUrlTarget": self.__ccUrlTarget,
                "birdUrlTarget": self.__birdUrlTarget,
            }
            ccPrefix = "cc-abbrev"
            oePrefix = "oe-abbrev"
        optionD.update({
            "cachePath": self.__cachePath,
            "useCache": True,
            "ccFileNamePrefix": ccPrefix,
            "oeFileNamePrefix": oePrefix,
            "molBuildType": "model-xyz",
            "limitPerceptions": False,
            "screenTypeList": None,
            "numProc": self.__numProcPrep,
            "suppressHydrogens": True,
            "matchOpts": "sub-struct-graph-relaxed",
            "fpTypeCuttoffD": {
                "TREE": 0.6,
                "MACCS": 0.9
            },
            "maxFpResults": 50,
        })
        self.__myKwargs = optionD
        #
        self.__oesmP = OeSearchMoleculeProvider(**self.__myKwargs)
        self.assertTrue(self.__oesmP.testCache())
        #
        self.__ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
        self.__ccmP.testCache()
        #
        self.__ccsidxP = ChemCompSearchIndexProvider(**self.__myKwargs)
        self.assertTrue(self.__ccsidxP.testCache(minCount=self.__minCount))
        self.__oessU = OeSubStructSearchUtils(self.__oesmP)
        self.assertTrue(self.__oessU.testCache())
        #
        fingerprintTypes = list(self.__myKwargs.get("fpTypeCuttoffD", {}).keys())
        self.__oesU = OeSearchUtils(self.__oesmP, fpTypeList=fingerprintTypes)
        self.assertTrue(self.__oesU.testCache())
        #
        logger.debug("Running tests on version %s", __version__)
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Starting %s at %s", self.id(), timeStamp)
class OeSubStructSearchCompareTests(unittest.TestCase):
    useFull = False

    def setUp(self):
        """Prepare paths, provider options and the search utilities used by the tests.

        The provider options differ between the full and the abbreviated data set
        only in the cache file name prefixes and in whether the abbreviated
        dictionary URLs are supplied.
        """
        self.__workPath = os.path.join(HERE, "test-output")
        self.__dataPath = os.path.join(HERE, "test-data")
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__ccUrlTarget = os.path.join(self.__dataPath, "components-abbrev.cif")
        self.__birdUrlTarget = os.path.join(self.__dataPath, "prdcc-abbrev.cif")
        self.__doDisplay = True
        self.__numProcPrep = 6
        self.__numProcSearch = 6
        self.__minCount = None
        self.__startTime = time.time()
        #
        if OeSubStructSearchCompareTests.useFull:
            optionD = {}
            ccPrefix = "cc-full"
            oePrefix = "oe-full"
        else:
            optionD = {
                "ccUrlTarget": self.__ccUrlTarget,
                "birdUrlTarget": self.__birdUrlTarget,
            }
            ccPrefix = "cc-abbrev"
            oePrefix = "oe-abbrev"
        optionD.update({
            "cachePath": self.__cachePath,
            "useCache": True,
            "ccFileNamePrefix": ccPrefix,
            "oeFileNamePrefix": oePrefix,
            "molBuildType": "model-xyz",
            "limitPerceptions": False,
            "screenTypeList": None,
            "numProc": self.__numProcPrep,
            "suppressHydrogens": True,
            "matchOpts": "sub-struct-graph-relaxed",
            "fpTypeCuttoffD": {
                "TREE": 0.6,
                "MACCS": 0.9
            },
            "maxFpResults": 50,
        })
        self.__myKwargs = optionD
        #
        self.__oesmP = OeSearchMoleculeProvider(**self.__myKwargs)
        self.assertTrue(self.__oesmP.testCache())
        #
        self.__ccmP = ChemCompMoleculeProvider(**self.__myKwargs)
        self.__ccmP.testCache()
        #
        self.__ccsidxP = ChemCompSearchIndexProvider(**self.__myKwargs)
        self.assertTrue(self.__ccsidxP.testCache(minCount=self.__minCount))
        self.__oessU = OeSubStructSearchUtils(self.__oesmP)
        self.assertTrue(self.__oessU.testCache())
        #
        fingerprintTypes = list(self.__myKwargs.get("fpTypeCuttoffD", {}).keys())
        self.__oesU = OeSearchUtils(self.__oesmP, fpTypeList=fingerprintTypes)
        self.assertTrue(self.__oesU.testCache())
        #
        logger.debug("Running tests on version %s", __version__)
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info("Starting %s at %s", self.id(), timeStamp)

    def tearDown(self):
        """Log the test id, a completion time stamp and the time elapsed since setUp."""
        finished = time.time()
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        elapsed = finished - self.__startTime
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), timeStamp, elapsed)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchDescriptor(self):
        # Strict substructure search for a query given as a SMILES descriptor;
        # the search must report success.
        query, queryId, queryType = "n1ccccc1", "query-smiles", "oe-iso-smiles"
        #
        options = self.__myKwargs
        limitPerceptions = options.get("limitPerceptions", False)
        suppressHydrogens = options.get("suppressHydrogens", True)
        numProc = options.get("numProc", 4)
        # other modes: "sub-struct-graph-relaxed", "sub-struct-graph-relaxed-stereo"
        searchModes = ("sub-struct-graph-strict",)
        for matchOpts in searchModes:
            oeMol = self.__getMol(query,
                                  queryType,
                                  queryId,
                                  limitPerceptions=limitPerceptions,
                                  suppressHydrogens=suppressHydrogens)
            began = time.time()
            status, hitList = self.__search(oeMol, matchOpts, numProc)
            logger.info(
                "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                queryId, status, matchOpts, len(hitList),
                time.time() - began)
            self.assertTrue(status)
            if queryType == "CC":
                found = self.__resultContains(queryId, hitList)
                self.assertTrue(found)
            #
            if self.__doDisplay:
                self.__display(hitList, query, queryId, queryType, matchOpts)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchSelected(self):
        # Substructure search with the component "STI" as the query in each match
        # mode; the query component itself must be among the results.
        queryId = "STI"
        query = queryId
        queryType = "CC"
        #
        options = self.__myKwargs
        limitPerceptions = options.get("limitPerceptions", False)
        suppressHydrogens = options.get("suppressHydrogens", True)
        numProc = options.get("numProc", 4)
        searchModes = (
            "sub-struct-graph-relaxed",
            "sub-struct-graph-relaxed-stereo",
            "sub-struct-graph-strict",
        )
        for matchOpts in searchModes:
            oeMol = self.__getMol(query,
                                  queryType,
                                  queryId,
                                  limitPerceptions=limitPerceptions,
                                  suppressHydrogens=suppressHydrogens)
            began = time.time()
            status, hitList = self.__search(oeMol, matchOpts, numProc)
            logger.info(
                "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                queryId, status, matchOpts, len(hitList),
                time.time() - began)
            self.assertTrue(status)
            if queryType == "CC":
                found = self.__resultContains(queryId, hitList)
                self.assertTrue(found)
            #
            if self.__doDisplay:
                self.__display(hitList, query, queryId, queryType, matchOpts)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testSubStructSearchAll(self):
        # Substructure search with every component of the molecule provider as
        # the query (a few placeholder ids and molecules with fewer than three
        # atoms are skipped); each query must be among its own results.
        excludedIds = ["UNX", "UNL", "UNK", "DUM"]
        searchModes = (
            "sub-struct-graph-relaxed",
            "sub-struct-graph-relaxed-stereo",
            "sub-struct-graph-strict",
        )
        options = self.__myKwargs
        for ccId in self.__ccmP.getMolD():
            if ccId in excludedIds:
                continue
            query = queryId = ccId
            queryType = "CC"
            #
            limitPerceptions = options.get("limitPerceptions", False)
            suppressHydrogens = options.get("suppressHydrogens", True)
            numProc = options.get("numProc", 5)
            for matchOpts in searchModes:
                oeMol = self.__getMol(query,
                                      queryType,
                                      queryId,
                                      limitPerceptions=limitPerceptions,
                                      suppressHydrogens=suppressHydrogens)
                if oeMol.NumAtoms() < 3:
                    continue
                #
                began = time.time()
                status, hitList = self.__search(oeMol, matchOpts, numProc)
                logger.info(
                    "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                    queryId, status, matchOpts, len(hitList),
                    time.time() - began)
                self.assertTrue(status)
                if queryType == "CC":
                    found = self.__resultContains(queryId, hitList)
                    self.assertTrue(found)
                #
                if self.__doDisplay:
                    self.__display(hitList, query, queryId, queryType, matchOpts)

    @unittest.skipIf(not useFull, "Requires full data set")
    def testMatchSearchSelected(self):
        # Fingerprint and graph match searches with the component "STI" as the
        # query; the query component itself must be among the results.
        queryId = "STI"
        query = queryId
        queryType = "CC"
        #
        options = self.__myKwargs
        limitPerceptions = options.get("limitPerceptions", False)
        suppressHydrogens = options.get("suppressHydrogens", True)
        numProc = options.get("numProc", 4)
        searchModes = (
            "fingerprint-similarity",
            "graph-relaxed",
            "graph-relaxed-stereo",
            "graph-strict",
        )
        for matchOpts in searchModes:
            oeMol = self.__getMol(query,
                                  queryType,
                                  queryId,
                                  limitPerceptions=limitPerceptions,
                                  suppressHydrogens=suppressHydrogens)
            began = time.time()
            status, hitList = self.__search(oeMol, matchOpts, numProc)
            logger.info(
                "%s status (%r) matchOpts %s result %d in (%.4f seconds)",
                queryId, status, matchOpts, len(hitList),
                time.time() - began)
            self.assertTrue(status)
            if queryType == "CC":
                found = self.__resultContains(queryId, hitList)
                self.assertTrue(found)
            #
            if self.__doDisplay:
                self.__display(hitList, query, queryId, queryType, matchOpts)

    def __search(self, oeMol, matchOpts, numProc):
        """Dispatch a query to the substructure or the match search.

        Match options starting with "sub-struct-" use the substructure search;
        anything else uses the match search, whose fingerprint result list is
        returned for "fingerprint-similarity" and whose substructure result
        list is returned otherwise.

        Returns:
            (status, list): search status and the selected result list
        """
        if not matchOpts.startswith("sub-struct-"):
            status, graphHits, fpHits = self.__matchSearch(oeMol, matchOpts=matchOpts)
            if matchOpts in ["fingerprint-similarity"]:
                return status, fpHits
            return status, graphHits
        #
        status, graphHits = self.__subStructureSearch(oeMol,
                                                      matchOpts=matchOpts,
                                                      numProc=numProc)
        return status, graphHits

    #
    def __subStructureSearch(self, oeMol, matchOpts, numProc):
        """Prefilter the search index for the query, then search the remaining candidates.

        Returns:
            (status, list): values returned by searchSubStructure()
        """
        searcher = self.__oessU
        candidateIds = searcher.prefilterIndex(oeMol,
                                               self.__ccsidxP,
                                               matchOpts=matchOpts,
                                               skipFeatures=False)
        status, hitList = searcher.searchSubStructure(oeMol,
                                                      ccIdList=candidateIds,
                                                      matchOpts=matchOpts,
                                                      numProc=numProc)
        return status, hitList

    def __matchSearch(self, oeMol, matchOpts="graph-relaxed"):
        """Run the combined substructure and fingerprint search for a query molecule.

        At most the first two entries of the "fpTypeCuttoffD" option are passed
        as fingerprint type/cutoff pairs, together with "maxFpResults" (default 50).

        Args:
            oeMol: query molecule passed through to the search utility
            matchOpts (str, optional): match criteria. Defaults to "graph-relaxed".

        Returns:
            (status, list, list): search status, substructure results and
                fingerprint results; (False, [], []) when the search raised
        """
        # Defaults returned when the search fails.  retStatus must be bound before
        # the try block, otherwise the return below raises UnboundLocalError after
        # the exception has been logged.  The lists are separate objects.
        retStatus = False
        ssL = []
        fpL = []
        try:
            fpTypeCuttoffD = self.__myKwargs.get("fpTypeCuttoffD", {})
            maxFpResults = self.__myKwargs.get("maxFpResults", 50)
            retStatus, ssL, fpL = self.__oesU.searchSubStructureAndFingerPrint(
                oeMol,
                list(fpTypeCuttoffD.items())[:2],
                maxFpResults,
                matchOpts=matchOpts)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retStatus, ssL, fpL

    def __getMol(self, query, queryType, queryId, limitPerceptions=False, suppressHydrogens=True):
        """Return the query molecule titled with queryId.

        A "CC" query is fetched from the OE search molecule provider by its
        identifier; any other query type is converted from the descriptor string.
        Explicit hydrogens are suppressed on request.
        """
        oeioU = OeIoUtils()
        if queryType != "CC":
            oeMol = oeioU.descriptorToMol(query,
                                          queryType,
                                          limitPerceptions=limitPerceptions,
                                          messageTag=queryId)
        else:
            oeMol = self.__oesmP.getMol(query)
        #
        if suppressHydrogens:
            oeMol = oeioU.suppressHydrogens(oeMol)
        oeMol.SetTitle(queryId)
        return oeMol

    def __resultContains(self, ccId, matchResultList):
        """Return True when any match result carries exactly the given ccId."""
        return any(result.ccId == ccId for result in matchResultList)

    #
    #  ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------ ------
    def __display(self, mL, query, queryId, queryType, matchOpts):
        """Depict one match per base component id in a PDF named after the query.

        Results are ordered by descending fpScore and grouped by the part of the
        ccId before the first "|".  For each group the result whose ccId equals
        that base id is chosen, otherwise the highest scoring member.  The output
        file is "<queryId>-<matchOpts>.pdf" in the work path.
        """
        ranked = sorted(mL, key=lambda hit: hit.fpScore, reverse=True)
        # ---- group by base component id, keeping the score order
        groupD = {}
        for hit in ranked:
            baseId = hit.ccId.split("|")[0]
            if baseId not in groupD:
                groupD[baseId] = []
            groupD[baseId].append(hit)
        # ---- one representative per group
        chosen = [
            next((hit for hit in hits if hit.ccId == baseId), hits[0])
            for baseId, hits in groupD.items()
        ]
        # ----
        fileName = queryId + "-" + matchOpts + ".pdf"
        pdfImagePath = os.path.join(self.__workPath, fileName)
        self.__displayPaginatedAlignments(pdfImagePath,
                                          query,
                                          queryType,
                                          queryId,
                                          chosen,
                                          matchOpts=matchOpts)

    def __displayPaginatedAlignments(self,
                                     pdfImagePath,
                                     query,
                                     queryType,
                                     queryId,
                                     matchResultList,
                                     matchOpts="relaxed-stereo",
                                     alignMode="SS"):
        """Pair the query molecule with every match and depict the pairs in one PDF.

        Args:
            pdfImagePath (str): output PDF path
            query (str): query used to build the reference molecule
            queryType (str): query type used to build the reference molecule
            queryId (str): reference identifier and reference molecule title
            matchResultList (list): match results exposing ccId and fpScore attributes
            matchOpts (str, optional): match criteria forwarded to the depiction. Defaults to "relaxed-stereo".
            alignMode (str, optional): alignment mode forwarded to the depiction. Defaults to "SS".
        """
        oeMolRef = self.__getMol(query,
                                 queryType,
                                 queryId,
                                 limitPerceptions=False,
                                 suppressHydrogens=True)

        def _fitLabel(searchId):
            # Label is the text before the first "|"; identifiers longer than
            # four characters are flagged as an alternate form.
            label = searchId.split("|")[0]
            return label + " (tautomer/protomer)" if len(searchId) > 4 else label

        rankedL = sorted(matchResultList, key=lambda kv: kv.fpScore, reverse=True)
        pairList = [(queryId, oeMolRef, _fitLabel(mr.ccId), self.__oesmP.getMol(mr.ccId))
                    for mr in rankedL]
        #
        self.__depictFitList(pdfImagePath,
                             pairList,
                             matchOpts=matchOpts,
                             alignMode=alignMode)

    def __pairDepictPage(self,
                         imagePath,
                         refId,
                         refTitle,
                         refMol,
                         fitId,
                         fitTitle,
                         fitMol,
                         matchOpts="strict"):
        """Render one page showing the reference molecule aligned with the fit molecule.

        The depiction is produced by OeDepictMCSAlignPage; any exception raised while
        configuring or running it is logged and not propagated.

        Args:
            imagePath (str): path to image (format by path extension)
            refId (str): reference molecule identifier
            refTitle (str): reference molecule title
            refMol (obj): reference OE molecule object
            fitId (str): fit molecule identifier
            fitTitle (str): fit molecule title
            fitMol (obj): fit OE molecule object
            matchOpts (str, optional): alignment criteria (relaxed|relaxed-stereo|strict). Defaults to "strict".

        Returns:
            (list): atom mapping returned by alignPair()
                    [(reference component Id, reference atom name, fit chemical component Id, fit atom name), ...]
                    or an empty list when an exception occurs before the alignment returns
        """
        displayOptionsD = {
            "imageSizeX": 2000,
            "imageSizeY": 1000,
            "labelAtomName": True,
            "labelAtomCIPStereo": True,
            "labelAtomIndex": False,
            "labelBondIndex": False,
            "highlightStyleFit": "ballAndStickInverse",
            "bondDisplayWidth": 0.5,
            "highLightMatchColorRef": "green",
            "highLightNotMatchColorRef": "pink",
        }
        atomMapL = []
        try:
            depictor = OeDepictMCSAlignPage()
            depictor.setSearchType(sType=matchOpts)

            depictor.setRefMol(refMol, refId, title=refTitle)
            depictor.setFitMol(fitMol, fitId, title=fitTitle)
            depictor.setDisplayOptions(**displayOptionsD)
            atomMapL = depictor.alignPair(imagePath=imagePath)
            # Trace the mapping only when the alignment produced one
            for (refCcId, refAtName, fitCcId, fitAtName) in atomMapL or []:
                logger.debug("%5s %-5s %5s %-5s", refCcId, refAtName, fitCcId, fitAtName)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return atomMapL

    def __depictFitList(self,
                        pdfImagePath,
                        pairList,
                        matchOpts="exact",
                        alignMode="SS"):
        """Depict pairwise alignments with multi-page layout in PDF format.

        Any exception raised while configuring or running the depiction is logged
        and not propagated.

        Args:
            pdfImagePath (str): PDF image path
            pairList (list): [(refId, refOeMol, fitId, fitOeMol)]
            matchOpts (str, optional): search type handed to the depiction object. Defaults to "exact".
            alignMode (str, optional): "MCSS" selects OeDepictMCSAlignMultiPage, any other value
                                       selects OeDepictSubStructureAlignMultiPage. Defaults to "SS".

        Returns:
            (list): atom mapping returned by alignPairListMulti()
                    [(reference component Id, reference atom name, fit chemical component Id, fit atom name), ...]
                    or an empty list when an exception occurs before the alignment returns
        """
        displayOptionsD = {
            "labelAtomName": True,
            "labelAtomCIPStereo": True,
            "labelAtomIndex": False,
            "labelBondIndex": False,
            "highlightStyleFit": "ballAndStickInverse",
            "pageOrientation": "portrait",
            "gridRows": 4,
            "bondDisplayWidth": 0.5,
            "highLightMatchColorRef": "green",
            "highLightNotMatchColorRef": "pink",
        }
        atomMapL = []
        try:
            depictor = (OeDepictMCSAlignMultiPage()
                        if alignMode == "MCSS" else OeDepictSubStructureAlignMultiPage())
            depictor.setSearchType(sType=matchOpts)
            depictor.setPairMolList(pairList)

            depictor.setDisplayOptions(**displayOptionsD)
            atomMapL = depictor.alignPairListMulti(imagePath=pdfImagePath)
            # Trace the mapping only when the alignment produced one
            for (refCcId, refAtName, fitCcId, fitAtName) in atomMapL or []:
                logger.debug("%5s %-5s %5s %-5s", refCcId, refAtName, fitCcId, fitAtName)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return atomMapL
Ejemplo n.º 10
0
class CODModelBuild(object):
    """Model build step: runs the COD model build workers over the search result
    index and maintains the combined model index file under the cache directory.
    """

    def __init__(self, cachePath, prefix=None, **kwargs):
        """Load the chemical component, search index and OE molecule providers.

        Args:
            cachePath (str): top-level cache directory
            prefix (str, optional): qualifier used in cache file and directory names. Defaults to None.
            **kwargs: optional settings read here -
                timeOut (default None, handed to the build worker),
                ccUrlTarget, birdUrlTarget (default None),
                molLimit (default None), quietFlag (default True),
                fpTypeList, screenTypeList (default []),
                limitPerceptions (default False), numProc (default 4)
        """
        self.__cachePath = cachePath

        self.__prefix = prefix
        startTime = time.time()
        self.__startTime = startTime
        useCache = True
        self.__timeOut = kwargs.get("timeOut", None)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc"
        oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe"
        #
        self.__ccmP = ChemCompMoleculeProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=cachePath,
            useCache=useCache,
            ccFileNamePrefix=ccFileNamePrefix)
        ok1 = self.__ccmP.testCache()

        self.__ccSIdxP = ChemCompSearchIndexProvider(
            cachePath=cachePath,
            useCache=useCache,
            ccFileNamePrefix=ccFileNamePrefix)
        ok2 = self.__ccSIdxP.testCache()

        molLimit = kwargs.get("molLimit", None)
        quietFlag = kwargs.get("quietFlag", True)
        fpTypeList = kwargs.get("fpTypeList", [])
        screenTypeList = kwargs.get("screenTypeList", [])
        limitPerceptions = kwargs.get("limitPerceptions", False)
        numProc = kwargs.get("numProc", 4)
        self.__oesmP = OeSearchMoleculeProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            ccFileNamePrefix=ccFileNamePrefix,
            oeFileNamePrefix=oeFileNamePrefix,
            useCache=useCache,
            quietFlag=quietFlag,
            fpTypeList=fpTypeList,
            screenTypeList=screenTypeList,
            numProc=numProc,
            molLimit=molLimit,
            limitPerceptions=limitPerceptions,
        )
        ok3 = self.__oesmP.testCache()
        self.__oesmP.getOeMolD()

        # all() rather than bitwise "&": a non-boolean status (e.g. None) must not
        # raise TypeError in the constructor just to format a log message.
        logger.info(
            "Completed chemical component search index load %r (%.4f seconds)",
            all((ok1, ok2, ok3)),
            time.time() - startTime)
        logUsage("main", "Setup completed", self.__startTime)
        #
        logger.info("Starting model build (%s) at %s", __version__,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def getModelDirFilePath(self):
        """Return the model file directory path under the cache directory.

        Returns:
            (str): <cachePath>/cod-<prefix>-model-files, or <cachePath>/cod-model-files without a prefix
        """
        dN = "cod-%s-model-files" % self.__prefix if self.__prefix else "cod-model-files"
        return os.path.join(self.__cachePath, dN)

    def getModelImageDirFilePath(self):
        """Return the model image directory path under the cache directory.

        Returns:
            (str): <cachePath>/cod-<prefix>-model-image, or <cachePath>/cod-model-images without a prefix
        """
        # NOTE(review): the prefixed name ends in "image" while the default ends in
        # "images"; left unchanged because existing caches may depend on these names.
        dN = "cod-%s-model-image" % self.__prefix if self.__prefix else "cod-model-images"
        return os.path.join(self.__cachePath, dN)

    def __getModelIndexPath(self):
        """Return the path of the combined model index file in the model directory."""
        return os.path.join(self.getModelDirFilePath(), "cod-model-index.json")

    def fetchModelIndex(self):
        """Load the combined model index.

        Returns:
            (dict): content of cod-model-index.json as returned by MarshalUtil.doImport();
                    an empty dictionary if an exception is raised while loading
        """
        mD = {}
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            fp = self.__getModelIndexPath()
            mD = mU.doImport(fp, fmt="json")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mD

    def storeModelIndex(self, mD):
        """Write the combined model index as JSON.

        Args:
            mD (dict): model index to store

        Returns:
            (bool): status returned by MarshalUtil.doExport(), False if an exception is raised
        """
        ok = False
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            fp = self.__getModelIndexPath()
            ok = mU.doExport(fp, mD, fmt="json", indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def build(self,
              alignType="relaxed-stereo",
              numProc=4,
              chunkSize=10,
              verbose=False,
              doFigures=True):
        """Run the model build step in the chemical component model workflow.

        Parents that already have a model-index.json larger than 10 bytes are not
        rebuilt.  After the workers finish, every parent's model-index.json that
        exists is merged into one index, which is stored and returned.

        Args:
          alignType (str):  "relaxed"|"strict"|"relaxed-stereo".  Default: relaxed-stereo
          numProc (int, optional): number of processes to invoke. Defaults to 4.
          chunkSize (int, optional): work chunksize. Defaults to 10.
          verbose (bool, optional): verbose logging.  Defaults to False.
          doFigures (bool, optional): passed to the build workers as the "doFigures" option. Defaults to True.

        Returns:
            (dict): {parentId: [model record, ...]} sorted by parentId.  Records are read from the
                    per-parent index files; they are expected to carry "targetId", "modelId",
                    "modelPath", "matchId", "parentId" and "rFactor" (only "parentId" is read here).
                    Whatever was collected before an exception is returned if the step fails.

        """
        retD = {}
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            ccms = CODModelSearch(self.__cachePath, prefix=self.__prefix)
            modelDirPath = self.getModelDirFilePath()
            imageDirPath = self.getModelImageDirFilePath()
            #
            resultIndexD = ccms.getResultIndex()
            # Make parent index --- (key is the identifier text before the first "|")
            idxIdD = {}
            for idxId, iDL in resultIndexD.items():
                pId = idxId.split("|")[0]
                idxIdD.setdefault(pId, []).extend(iDL)
            #
            idxIdL = list(idxIdD.keys())
            midxIdL = []
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                # Skip parents that already have a non-trivial index; missing or
                # (near-)empty index files are queued for a build.
                if mU.exists(fp) and os.stat(fp).st_size > 10:
                    continue
                midxIdL.append(pId)
            #
            logger.info(
                "Starting COD model build using (%d) from a total of results length (%d)",
                len(midxIdL), len(idxIdD))
            #
            cmbw = CODModelBuildWorker(self.__cachePath,
                                       verbose=verbose,
                                       timeOut=self.__timeOut)
            mpu = MultiProcUtil(verbose=True)
            mpu.setWorkingDir(modelDirPath)
            mpu.setOptions(
                optionsD={
                    "modelDirPath": modelDirPath,
                    "imageDirPath": imageDirPath,
                    "alignType": alignType,
                    "ccSIdxP": self.__ccSIdxP,
                    "idxIdD": idxIdD,
                    "oesmP": self.__oesmP,
                    "ccmP": self.__ccmP,
                    "doFigures": doFigures,
                })
            #
            mpu.set(workerObj=cmbw, workerMethod="build")
            ok, failList, resultList, _ = mpu.runMulti(dataList=midxIdL,
                                                       numProc=numProc,
                                                       numResults=1,
                                                       chunkSize=chunkSize)
            # Guard the index so an empty result container cannot abort the
            # aggregation of indices that already exist on disk.
            successList = resultList[0] if resultList else []
            logger.info(
                "Run ended with status %r success count %d failures %r", ok,
                len(successList), len(failList))
            #
            if successList:
                logger.info("Completed build with %d models ",
                            len(successList))
            else:
                logger.info("No models built")
            #
            # Build full index -
            #
            logger.info("Building full model index")
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                if mU.exists(fp):
                    # "or []": an index that loads as a falsy value is skipped
                    # instead of aborting the whole aggregation.
                    for modelD in mU.doImport(fp, fmt="json") or []:
                        retD.setdefault(modelD["parentId"], []).append(modelD)
            #
            retD = dict(sorted(retD.items()))
            logger.info("Storing models for %d parent components", len(retD))
            # Surface a failed write; previously the status was discarded.
            if not self.storeModelIndex(retD):
                logger.error("Failed to store the full model index in %s",
                             self.__getModelIndexPath())
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retD
Ejemplo n.º 11
0
    def buildSearchFiles(self, **kwargs):
        """Build cif, sdf (optional), and mol2 (optional) files for components in the chemical component search index.

        Entries rejected by the __checkCif()/__checkOeMol() filters are skipped (per the
        original notes these exclude ions or other extraneous molecules lacking bonds).
        Files are written under <searchDir>/<first character of ccId>/<ccId>/.

        Args:
            ccUrlTarget (str): locator for source chemical component dictionary (default: None)
            birdUrlTarget (str): locator for source BIRD dictionary (default: None)
            limitPerceptions (bool): restrict automatic perceptions in OE molecular build operations (default: False)
            numProc (int): number of processors (default: 2)
            minCount (int): minimum count passed to the component provider cache test (default: 0)
            useCache (bool): use existing resource file where possible (default: True)
            useSdf (bool): write an sdf file for each search index entry (default: True)
            useMol2 (bool): write a mol2 file for each search index entry (default: False)
            molLimit (int):  limit the number of ingested chemical components (default: None)
            quietFlag (bool): suppress output in OE library operations (default: True)
            fpTypeList (list): fingerprint types passed to the OE molecule provider (default: [])
            screenTypeList (list): screen types passed to the OE molecule provider (default: [])

        Returns:
            (int): number of search index entries that passed the molecule checks
                   (counted whether or not a file had to be written for them)
        """
        cachePath = self.__cachePath
        ccUrlTarget = kwargs.get("ccUrlTarget", None)
        birdUrlTarget = kwargs.get("birdUrlTarget", None)
        molLimit = kwargs.get("molLimit", None)
        quietFlag = kwargs.get("quietFlag", True)
        fpTypeList = kwargs.get("fpTypeList", [])
        screenTypeList = kwargs.get("screenTypeList", [])
        ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
        oeFileNamePrefix = "oe-%s" % self.__prefix if self.__prefix else "oe-cc-full"
        numProc = kwargs.get("numProc", 2)
        minCount = kwargs.get("minCount", 0)
        useCache = kwargs.get("useCache", True)
        useSdf = kwargs.get("useSdf", True)
        useMol2 = kwargs.get("useMol2", False)
        limitPerceptions = kwargs.get("limitPerceptions", False)
        logSizes = False
        #
        startTime = time.time()
        ccmP = ChemCompMoleculeProvider(cachePath=cachePath,
                                        useCache=useCache,
                                        ccFileNamePrefix=ccFileNamePrefix,
                                        ccUrlTarget=ccUrlTarget,
                                        birdUrlTarget=birdUrlTarget,
                                        molLimit=molLimit)
        ok = ccmP.testCache(minCount=minCount, logSizes=logSizes)
        logger.info(
            "Completed chemical component provider load %r (%.4f seconds)", ok,
            time.time() - startTime)
        #
        startTime = time.time()
        oesmp = OeSearchMoleculeProvider(
            ccUrlTarget=ccUrlTarget,
            birdUrlTarget=birdUrlTarget,
            cachePath=cachePath,
            ccFileNamePrefix=ccFileNamePrefix,
            oeFileNamePrefix=oeFileNamePrefix,
            useCache=useCache,
            quietFlag=quietFlag,
            fpTypeList=fpTypeList,
            screenTypeList=screenTypeList,
            numProc=numProc,
            molLimit=molLimit,
            limitPerceptions=limitPerceptions,
        )
        ok = oesmp.testCache()
        logger.info("Completed OE molecule provider load %r (%.4f seconds)",
                    ok,
                    time.time() - startTime)
        #
        startTime = time.time()
        ccSIdxP = ChemCompSearchIndexProvider(
            cachePath=cachePath,
            useCache=useCache,
            ccFileNamePrefix=ccFileNamePrefix,
            limitPerceptions=limitPerceptions,
            numProc=numProc)
        idxOk = ccSIdxP.testCache()
        logger.info(
            "Completed chemical component search index load %r (%.4f seconds)",
            idxOk,
            time.time() - startTime)
        #
        ccSIdx = ccSIdxP.getIndex() if ccSIdxP and idxOk else {}
        logger.info("Search index status %r index length %d", idxOk, len(ccSIdx))
        #
        # Parents whose standard CIF is known to be on disk, so the export is
        # attempted once per parent rather than once per search index entry.
        parentDoneS = set()
        mU = MarshalUtil()
        oeU = OeIoUtils(dirPath=cachePath)
        numMols = 0
        searchFileDirPath = self.getSearchDirFilePath()
        pathTupList = []
        for sId in ccSIdx:
            ccId = sId.split("|")[0]
            ccDirPath = os.path.join(searchFileDirPath, ccId[0], ccId)
            # standard CIF definition
            if ccId not in parentDoneS:
                parentCifPath = os.path.join(ccDirPath, ccId + ".cif")
                if useCache and mU.exists(parentCifPath):
                    parentDoneS.add(ccId)
                else:
                    ccMol = ccmP.getMol(ccId)
                    # A rejected parent is not recorded, so each of its search
                    # entries is re-checked and skipped as before.
                    if not self.__checkCif(ccMol):
                        continue
                    if mU.doExport(parentCifPath, [ccMol], fmt="mmcif"):
                        parentDoneS.add(ccId)
            #
            # Sanity checks on the generated OE molecule
            #
            oeMol = oesmp.getMol(sId)
            if not self.__checkOeMol(oeMol):
                continue
            #
            # Separate CIF for search entries that are not the parent itself
            if sId != ccId:
                cifPath = os.path.join(ccDirPath, sId + ".cif")
                if not (useCache and mU.exists(cifPath)):
                    oeccU = OeChemCompUtils()
                    cifOk = oeccU.addOeMol(sId,
                                           oeMol,
                                           missingModelXyz=True,
                                           writeIdealXyz=False)
                    if cifOk:
                        oeccU.write(cifPath)

            if useSdf:
                molFilePath = os.path.join(ccDirPath, sId + ".sdf")
                if not (useCache and mU.exists(molFilePath)):
                    sdfOk = oeU.write(molFilePath,
                                      oeMol,
                                      constantMol=False,
                                      addSdTags=True)
                    if sdfOk:
                        pathTupList.append((sId, molFilePath, "sdf"))
            #
            if useMol2:
                mol2FilePath = os.path.join(ccDirPath, sId + ".mol2")
                if not (useCache and mU.exists(mol2FilePath)):
                    # Record the path only when THIS write succeeded; the status
                    # was previously left over from an earlier step.
                    mol2Ok = oeU.write(mol2FilePath,
                                       oeMol,
                                       constantMol=False,
                                       addSdTags=True)
                    if mol2Ok:
                        pathTupList.append((sId, mol2FilePath, "mol2"))
            numMols += 1
        #
        self.__storePathList(pathTupList)
        return numMols