Esempio n. 1
0
 def testSubStructureSearchScreened(self):
     """Run screened substructure searches using each component's own SMILES as the query.

     For up to ``numMols`` leading entries of the chemical component index, the
     "oe-smiles" descriptor (when present) is converted with ``smartsToQmol()``
     and passed to ``searchSubStructureScreened()``.  Components that are not
     found among their own results are collected and logged; misses do not
     fail the test.
     """
     oeioU = OeIoUtils()
     oemp = OeMoleculeProvider(**self.__myKwargs)
     # Verify the molecule cache as well as the index cache before searching.
     ok = oemp.testCache()
     self.assertTrue(ok)
     ccmP = ChemCompIndexProvider(**self.__myKwargs)
     ccIdxD = ccmP.getIndex()
     ok = ccmP.testCache(minCount=self.__minCount)
     self.assertTrue(ok)
     oesU = OeSearchUtils(oemp,
                          screenType=self.__screenType,
                          numProc=self.__numProc)
     numMols = 20
     missL = []
     for ccId, ccD in list(ccIdxD.items())[:numMols]:
         # ----
         startTime = time.time()
         if "oe-smiles" not in ccD:
             # No query descriptor for this component; nothing to search with.
             continue
         logger.info("Search %s %r", ccId, ccD["oe-smiles"])
         oeQMol = oeioU.smartsToQmol(ccD["oe-smiles"])
         retStatus, mL = oesU.searchSubStructureScreened(oeQMol,
                                                         maxMatches=100)
         if retStatus:
             logger.info("%s (status=%r) match length %d in (%.4f seconds)",
                         ccId, retStatus, len(mL),
                         time.time() - startTime)
         # Record (but do not fail on) queries that do not recover themselves.
         if not self.__resultContains(ccId, mL):
             missL.append(ccId)
         # ----
     logger.info("Missed searches (%d) %r", len(missL), missL)
Esempio n. 2
0
    def testSubStructureSearchBase(self):
        """Check that prefiltered substructure searches recover the query component.

        For up to ``numMols`` leading entries of the chemical component index,
        the component's molecule is fetched from the molecule provider, a
        candidate identifier list is produced by ``prefilterIndex()``, and the
        substructure search is restricted to that list.  Each search must
        succeed and its results must contain the query component.
        """
        opts = self.__myKwargs.get("matchOpts", "sub-struct-graph-relaxed")
        searchProcs = self.__numProcSearch
        molProvider = OeMoleculeProvider(**self.__myKwargs)
        self.assertTrue(molProvider.testCache())
        searchUtil = OeSubStructSearchUtils(molProvider)
        #
        indexProvider = ChemCompIndexProvider(**self.__myKwargs)
        self.assertTrue(indexProvider.testCache(minCount=self.__minCount))
        indexD = indexProvider.getIndex()
        #
        # Fetch the first indexed molecule once before the timed loop.
        # NOTE(review): presumably this warms the molecule provider - confirm.
        molProvider.getMol(next(iter(indexD)))
        #
        maxQueries = 10
        for queryId in list(indexD)[:maxQueries]:
            # ----
            t0 = time.time()
            queryMol = molProvider.getMol(queryId)
            candidateIdL = searchUtil.prefilterIndex(queryMol, indexProvider, matchOpts=opts)
            logger.info("%s search length %d in (%.4f seconds)", queryId, len(candidateIdL), time.time() - t0)
            #
            # Elapsed time below is cumulative (prefilter + search) for this query.
            status, matchL = searchUtil.searchSubStructure(
                queryMol, ccIdList=candidateIdL, matchOpts=opts, numProc=searchProcs
            )
            logger.info("%s result length %d in (%.4f seconds)", queryId, len(matchL), time.time() - t0)
            self.assertTrue(status)
            self.assertTrue(self.__resultContains(queryId, matchL))
    def updateChemCompIndex(self, useCache=False):
        """Rebuild or reload the chemical component/BIRD index and keep it on this instance.

        The provider is configured from a deep copy of the "ccsiKwargs" entry of
        the instance configuration, with "useCache" overridden by the argument.
        When that entry is absent or empty nothing is built and False is returned.
        On a successful cache test the provider is stored on the instance;
        otherwise the stored provider is reset to None.

            Resource requirements: 94 sec 1 proc 7GB memory macbook pro

        Args:
            useCache (bool): False to rebuild search index and True to reload

        Returns:
            bool: True for success or false otherwise
        """
        status = False
        try:
            if "ccsiKwargs" in self.__configD:
                # Copy so the stored configuration is not mutated by the override below.
                providerKwargs = copy.deepcopy(self.__configD["ccsiKwargs"])
            else:
                providerKwargs = None
            if providerKwargs:
                providerKwargs["useCache"] = useCache
                provider = ChemCompIndexProvider(**providerKwargs)
                status = provider.testCache()
                if status:
                    self.__ccIdxP = provider
                else:
                    self.__ccIdxP = None
                logger.info("Chemical component index status %r", status)
        except Exception as e:
            # Any failure is logged and reported as the current status value.
            logger.exception("Failing with %s", str(e))
        return status
Esempio n. 4
0
    def testSubStructureSearchWithFingerPrint(self):
        """Check fingerprint-prefiltered substructure searches for each configured fingerprint type.

        For up to ``numMols`` leading entries of the chemical component index and
        every fingerprint type in the configured list, the component's molecule
        is searched with ``searchSubStructureWithFingerPrint()``.  Each search
        must succeed and its results must contain the query component.  The
        total elapsed time is logged at the end.
        """
        oemp = OeMoleculeProvider(**self.__myKwargs)
        # Verify the molecule cache as well as the index cache before searching.
        ok = oemp.testCache()
        self.assertTrue(ok)
        ccmP = ChemCompIndexProvider(**self.__myKwargs)
        ccIdxD = ccmP.getIndex()
        ok = ccmP.testCache(minCount=self.__minCount)
        self.assertTrue(ok)
        minFpScore = 0.40
        maxFpResults = 50
        numMols = 20
        matchOpts = "graph-relaxed"
        oesU = OeSearchUtils(oemp, fpTypeList=self.__fpTypeList)
        # ----
        startTime = time.time()
        for ccId, _ in list(ccIdxD.items())[:numMols]:
            for fpType in self.__fpTypeList:
                oeMol = oemp.getMol(ccId)
                retStatus, mL = oesU.searchSubStructureWithFingerPrint(
                    oeMol,
                    fpType,
                    minFpScore,
                    maxFpResults,
                    matchOpts=matchOpts)
                self.assertTrue(retStatus)
                self.assertTrue(self.__resultContains(ccId, mL))

        logger.info("%s fingerprints search on %d in (%.4f seconds)",
                    len(self.__fpTypeList), numMols,
                    time.time() - startTime)
Esempio n. 5
0
 def testFingerPrintSearch(self):
     """Check fingerprint similarity searches for each configured fingerprint type.

     After a sanity check on the atom count of component "004", up to
     ``numMols`` leading entries of the chemical component index are searched
     with ``searchFingerPrints()`` for every configured fingerprint type.
     Each search must succeed and its results must contain the query
     component.  The total elapsed time is logged at the end.
     """
     oemp = OeMoleculeProvider(**self.__myKwargs)
     # This will reload the oe binary cache.
     oeMol = oemp.getMol("004")
     self.assertGreaterEqual(len(list(oeMol.GetAtoms())), 12)
     #
     # Verify the molecule cache as well as the index cache before searching.
     ok = oemp.testCache()
     self.assertTrue(ok)
     ccmP = ChemCompIndexProvider(**self.__myKwargs)
     ccIdxD = ccmP.getIndex()
     ok = ccmP.testCache(minCount=self.__minCount)
     self.assertTrue(ok)
     minScore = 0.50
     maxResults = 50
     numMols = 50
     oesU = OeSearchUtils(oemp, fpTypeList=self.__fpTypeList)
     # ----
     startTime = time.time()
     for ccId, _ in list(ccIdxD.items())[:numMols]:
         for fpType in self.__fpTypeList:
             oeMol = oemp.getMol(ccId)
             retStatus, mL = oesU.searchFingerPrints(
                 oeMol,
                 fpType=fpType,
                 minFpScore=minScore,
                 maxFpResults=maxResults)
             self.assertTrue(retStatus)
             self.assertTrue(self.__resultContains(ccId, mL))
     logger.info("%s fingerprints search on %d in (%.4f seconds)",
                 len(self.__fpTypeList), numMols,
                 time.time() - startTime)
    def updateDescriptors(self, useCache=True):
        """Fetch descriptors for components not yet in the descriptor store and export the store.

        The identifier list of the chemical component index is compared with the
        keys of the current descriptor dictionary.  Descriptors are fetched only
        for the missing identifiers, merged into the dictionary, and the whole
        dictionary is exported as JSON to the path from ``getIndexFilePath()``.
        The instance version is set to the export date.

        Args:
            useCache (bool): passed through to ChemCompIndexProvider

        Returns:
            bool: the index cache test status, replaced by the export status when
                  new descriptors were fetched and written
        """
        ccidxP = ChemCompIndexProvider(
            ccUrlTarget=self.__ccUrlTarget,
            birdUrlTarget=self.__birdUrlTarget,
            cachePath=self.__cachePath,
            useCache=useCache,
            molLimit=None,
            ccFileNamePrefix=self.__ccFileNamePrefix,
        )
        ok = ccidxP.testCache()
        if ok:
            ccIdList = ccidxP.getIdList()
            # Only identifiers without a stored descriptor need to be fetched.
            updIdList = list(set(ccIdList) - set(self.__descrD.keys()))
            if updIdList:
                logger.info(
                    "Updating Chemaxon descriptors for (%d) components",
                    len(updIdList))
                uD = self.__fetchDescriptors(updIdList,
                                             ccidxP,
                                             chunkSize=self.__chunkSize)
                self.__descrD.update(uD)
                descrFilePath = self.getIndexFilePath()
                # Read the clock once so "created" and "version" always agree,
                # even when the export happens around midnight.
                now = datetime.datetime.now()
                tS = now.isoformat()
                vS = now.strftime("%Y-%m-%d")
                self.__version = vS
                dD = {"created": tS, "version": vS, "smiles": self.__descrD}
                ok = self.__mU.doExport(descrFilePath,
                                        dD,
                                        fmt="json",
                                        indent=3)
        #
        return ok
 def __getSearchDataProviders(self, **kwargs):
     """Create the search molecule provider and the chemical component index.

     Both providers are constructed from the same keyword arguments and both
     cache tests must pass.

     Args:
         **kwargs: passed unchanged to OeSearchMoleculeProvider and
                   ChemCompIndexProvider

     Returns:
         tuple: (OeSearchMoleculeProvider instance, index returned by getIndex())
     """
     oesmP = OeSearchMoleculeProvider(**kwargs)
     # Verify the search molecule cache too; callers receive this provider.
     ok = oesmP.testCache()
     self.assertTrue(ok)
     ccIdxP = ChemCompIndexProvider(**kwargs)
     ok = ccIdxP.testCache()
     self.assertTrue(ok)
     ccIdxD = ccIdxP.getIndex()
     return oesmP, ccIdxD
Esempio n. 8
0
 def testSubStructureSearch(self):
     """Check that substructure searches recover the query component.

     For up to ``numMols`` leading entries of the chemical component index,
     the component's molecule is searched with ``searchSubStructure()`` using
     the "relaxed" match option.  Each search must succeed and its results
     must contain the query component.
     """
     oemp = OeMoleculeProvider(**self.__myKwargs)
     # Verify the molecule cache as well as the index cache before searching.
     ok = oemp.testCache()
     self.assertTrue(ok)
     ccmP = ChemCompIndexProvider(**self.__myKwargs)
     ccIdxD = ccmP.getIndex()
     ok = ccmP.testCache(minCount=self.__minCount)
     self.assertTrue(ok)
     oesU = OeSearchUtils(oemp, fpTypeList=self.__fpTypeList)
     numMols = 10
     for ccId, _ in list(ccIdxD.items())[:numMols]:
         # ----
         startTime = time.time()
         oeMol = oemp.getMol(ccId)
         retStatus, mL = oesU.searchSubStructure(oeMol, matchOpts="relaxed")
         logger.info("%s match length %d in (%.4f seconds)", ccId, len(mL),
                     time.time() - startTime)
         self.assertTrue(retStatus)
         self.assertTrue(self.__resultContains(ccId, mL))
 def __testBuildMoleculeCacheFiles(self, **kwargs):
     """Construct a ChemCompIndexProvider from keyword options and assert its cache test passes.

     Recognized keys (with defaults): molLimit (None), useCache (True),
     logSizes (False), ccFileNamePrefix ("cc"), ccUrlTarget (None) and
     birdUrlTarget (None).  The cache path is taken from the instance.

     Returns:
         ChemCompIndexProvider: the provider that passed the cache test
     """
     limit = kwargs.get("molLimit", None)
     reuseCache = kwargs.get("useCache", True)
     reportSizes = kwargs.get("logSizes", False)
     filePrefix = kwargs.get("ccFileNamePrefix", "cc")
     provider = ChemCompIndexProvider(
         ccUrlTarget=kwargs.get("ccUrlTarget", None),
         birdUrlTarget=kwargs.get("birdUrlTarget", None),
         cachePath=self.__cachePath,
         useCache=reuseCache,
         molLimit=limit,
         ccFileNamePrefix=filePrefix,
     )
     # The molecule limit doubles as the minimum count for the cache test.
     self.assertTrue(provider.testCache(minCount=limit, logSizes=reportSizes))
     logger.info(" ******* Completed operation ******** ")
     return provider
Esempio n. 10
0
    def testSubStructureSearchScreenedFiltered(self):
        """Run screened substructure searches against the "filtered" cache files.

        Providers are built with the "cc-filtered"/"oe-filtered" file name
        prefixes.  For up to ``numMols`` leading entries of the chemical
        component index, the "oe-smiles" descriptor (when present) is converted
        with ``smartsToQmol()`` and passed to ``searchSubStructureScreened()``.
        Components not found among their own results are collected and logged;
        misses do not fail the test.
        """
        myKwargs = {
            "cachePath": self.__cachePath,
            "useCache": True,
            "fpTypeList": self.__fpTypeList,
            "ccFileNamePrefix": "cc-filtered",
            "oeFileNamePrefix": "oe-filtered",
            "molBuildType": "oe-iso-smiles",
            "limitPerceptions": False,
        }
        oeioU = OeIoUtils()
        oemp = OeMoleculeProvider(**myKwargs)
        # Verify the molecule cache as well as the index cache before searching.
        ok = oemp.testCache()
        self.assertTrue(ok)
        ccmP = ChemCompIndexProvider(**myKwargs)
        ccIdxD = ccmP.getIndex()
        ok = ccmP.testCache(minCount=self.__minCount)
        self.assertTrue(ok)
        oesU = OeSearchUtils(oemp,
                             screenType=self.__screenType,
                             numProc=self.__numProc)
        numMols = 5000
        missL = []
        for ccId, ccD in list(ccIdxD.items())[:numMols]:
            # ----
            startTime = time.time()
            if "oe-smiles" not in ccD:
                # No query descriptor for this component; nothing to search with.
                continue
            logger.info("Search %s %r", ccId, ccD["oe-smiles"])
            oeQMol = oeioU.smartsToQmol(ccD["oe-smiles"])
            retStatus, mL = oesU.searchSubStructureScreened(oeQMol,
                                                            maxMatches=100)
            logger.info("%s (status=%r)match length %d in (%.4f seconds)",
                        ccId, retStatus, len(mL),
                        time.time() - startTime)
            # Record (but do not fail on) queries that do not recover themselves.
            if not self.__resultContains(ccId, mL):
                missL.append(ccId)
            # ----
        logger.info("Missed searches (%d) %r", len(missL), missL)
 def __buildChemCompIndex(self, **kwargs):
     """Create a ChemCompIndexProvider from keyword options and test its cache.

     Recognized keys (with defaults): molLimit (None), rebuildChemIndices
     (False; the cache is reused unless this is set), logSizes (False),
     ccFileNamePrefix ("cc-full"), ccUrlTarget (None), birdUrlTarget (None)
     and cachePath (the instance cache path).

     Returns:
         tuple: (cache test status, provider) when the cache test passes,
                (cache test status, None) when it does not, and
                (False, None) if an exception was raised and logged
     """
     try:
         limit = kwargs.get("molLimit", None)
         provider = ChemCompIndexProvider(
             ccUrlTarget=kwargs.get("ccUrlTarget", None),
             birdUrlTarget=kwargs.get("birdUrlTarget", None),
             cachePath=kwargs.get("cachePath", self.__cachePath),
             # A rebuild request is expressed as "do not use the cache".
             useCache=not kwargs.get("rebuildChemIndices", False),
             molLimit=limit,
             ccFileNamePrefix=kwargs.get("ccFileNamePrefix", "cc-full"),
         )
         status = provider.testCache(minCount=limit,
                                     logSizes=kwargs.get("logSizes", False))
         if status:
             return status, provider
         return status, None
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     #
     return False, None
 def buildDescriptors(self):
     """Fetch descriptors for every indexed component and export them as JSON.

     When the chemical component index passes its cache test, descriptors are
     fetched for the full identifier list, stored on the instance (replacing
     any previous descriptor dictionary), and exported to the path from
     ``getIndexFilePath()``.  The instance version is set to the export date
     and the export status is logged.  Nothing is done when the cache test
     fails.

     Returns:
         None: the export status is logged rather than returned
     """
     descrFilePath = self.getIndexFilePath()
     ccidxP = ChemCompIndexProvider(
         ccUrlTarget=self.__ccUrlTarget,
         birdUrlTarget=self.__birdUrlTarget,
         cachePath=self.__cachePath,
         useCache=True,
         molLimit=self.__molLimit,
         ccFileNamePrefix=self.__ccFileNamePrefix,
     )
     ok = ccidxP.testCache()
     if ok:
         ccIdList = ccidxP.getIdList()
         self.__descrD = self.__fetchDescriptors(ccIdList,
                                                 ccidxP,
                                                 chunkSize=self.__chunkSize)
         # Read the clock once so "created" and "version" always agree,
         # even when the export happens around midnight.
         now = datetime.datetime.now()
         tS = now.isoformat()
         vS = now.strftime("%Y-%m-%d")
         self.__version = vS
         dD = {"created": tS, "version": vS, "smiles": self.__descrD}
         ok = self.__mU.doExport(descrFilePath, dD, fmt="json", indent=3)
         logger.info("Stored %s descriptors for %d components (status=%r) ",
                     descrFilePath, len(self.__descrD), ok)
Esempio n. 13
0
    def testSssWithFingerPrintFromDescriptor(self):
        """Search with molecules rebuilt from each stored descriptor and report self-match misses.

        For up to ``numMols`` leading entries of the chemical component index and
        each descriptor type in a fixed list, a molecule is built from the stored
        descriptor (via OeMoleculeFactory for InChI, ``smilesToMol()`` otherwise)
        and searched with ``searchSubStructureWithFingerPrint()`` for every
        (fingerprint type, minimum score) pair configured on the test.  Every
        search must succeed.  Failures to recover the query component are
        collected per component and logged; they do not fail the test.
        """
        oemp = OeMoleculeProvider(**self.__myKwargs)
        # Verify the molecule cache as well as the index cache before searching.
        ok = oemp.testCache()
        self.assertTrue(ok)
        ccmP = ChemCompIndexProvider(**self.__myKwargs)
        ccIdxD = ccmP.getIndex()
        ok = ccmP.testCache(minCount=self.__minCount)
        self.assertTrue(ok)
        limitPerceptions = False
        # Minimum fingerprint scores are taken per type from self.__fpTypeCuttoffList.
        maxFpResults = 50
        matchOpts = "graph-relaxed"
        numMols = 20
        buildTypeList = [
            "oe-iso-smiles", "oe-smiles", "acdlabs-smiles",
            "cactvs-iso-smiles", "cactvs-smiles", "inchi"
        ]
        oeioU = OeIoUtils()
        oesU = OeSearchUtils(oemp, fpTypeList=self.__fpTypeList)
        # ccId -> build types for which no fingerprint type recovered the component
        missedD = {}
        # ccId -> (buildType, fpType, hit count) for each individual miss
        missedFpD = {}
        # ----
        startTime = time.time()
        for ccId, ccD in list(ccIdxD.items())[:numMols]:
            for buildType in buildTypeList:
                if buildType not in ccD:
                    logger.info("%s missing descriptor %r", ccId, buildType)
                    continue
                logger.debug("Search %s %r", ccId, ccD[buildType])
                # Reset for every descriptor so that a failed build can never
                # fall through to a molecule left over from a previous iteration.
                oeMol = None
                if buildType == "inchi":
                    oemf = OeMoleculeFactory()
                    oemf.setDescriptor(ccD["inchi"], "inchi", ccId)
                    buildOk = oemf.build(molBuildType="inchi",
                                         limitPerceptions=limitPerceptions)
                    if not buildOk:
                        logger.info("%s build failed with InChI %r", ccId,
                                    ccD["inchi"])
                    else:
                        oeMol = oemf.getMol()
                        if oemf.getInChI() != ccD["inchi"]:
                            logger.info(
                                "%s regenerated InChI differs\n%r\n%s",
                                ccId, ccD["inchi"], oemf.getInChI())
                else:
                    oeMol = oeioU.smilesToMol(
                        ccD[buildType], limitPerceptions=limitPerceptions)
                if not oeMol:
                    continue
                maxHits = 0
                minHits = maxFpResults
                selfHit = False
                for fpType, minFpScore in self.__fpTypeCuttoffList:
                    retStatus, mL = oesU.searchSubStructureWithFingerPrint(
                        oeMol,
                        fpType,
                        minFpScore,
                        maxFpResults,
                        matchOpts=matchOpts)
                    self.assertTrue(retStatus)
                    logger.debug("%s fpType %r hits %d", ccId, fpType,
                                 len(mL))
                    maxHits = max(maxHits, len(mL))
                    minHits = min(minHits, len(mL))
                    matchedSelf = self.__resultContains(ccId, mL)
                    selfHit = selfHit or matchedSelf
                    if not matchedSelf:
                        missedFpD.setdefault(ccId, []).append(
                            (buildType, fpType, len(mL)))
                if not selfHit:
                    missedD.setdefault(ccId, []).append(buildType)

                logger.info("%s (%r) buildType %r min hits %d max hits %d",
                            ccId, selfHit, buildType, minHits, maxHits)
        #
        for ccId, missL in missedD.items():
            logger.info("%s missed list %r", ccId, missL)
            if ccId in missedFpD:
                logger.info("%s unmatched for fpTypes %r", ccId,
                            missedFpD[ccId])
        # ----
        # Optional depiction of missed descriptors against the reference
        # "oe-iso-smiles" descriptor; disabled by default.
        doDepict = False
        if doDepict:
            # missedD already maps each component to its missed build types.
            for ccId, buildTypeL in missedD.items():
                idxD = ccIdxD[ccId]
                if "oe-iso-smiles" in idxD:
                    for buildType in buildTypeL:
                        self.__displayAlignedDescriptorPair(
                            ccId,
                            idxD["oe-iso-smiles"],
                            "oe-iso-smiles",
                            idxD[buildType],
                            buildType,
                            title=None,
                            limitPerceptions=True)

        logger.info("%s fingerprints search on %d in (%.4f seconds)",
                    len(self.__fpTypeList), numMols,
                    time.time() - startTime)