def __rebuildCache(self, **kwargs):
     mU = MarshalUtil()
     # source directory path
     srcDirPath = kwargs.get("srcDirPath", None)
     # cache details
     cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
     useCache = kwargs.get("useCache", True)
     entrySaveLimit = kwargs.get("entrySaveLimit", None)
     abbreviated = str(kwargs.get("abbreviated", "TEST")).upper()
     #
     # cacheDirPath = kwargs.get("cacheDirPath", None)
     cacheDirPath = self.__cacheDirPath
     pyVersion = sys.version_info[0]
     ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
     saveFilePath = os.path.join(cacheDirPath, "sifts-summary-py%s.%s" % (str(pyVersion), ext))
     #
     ssD = {}
     try:
         if useCache and os.access(saveFilePath, os.R_OK):
             ssD = mU.doImport(saveFilePath, **cacheKwargs)
         else:
             if not srcDirPath:
                 logger.error("Missing SIFTS source path details")
                 return ssD
             ssD = self.__getSummaryMapping(srcDirPath, abbreviated=abbreviated)
             if entrySaveLimit:
                 ssD = {k: ssD[k] for k in list(ssD.keys())[:entrySaveLimit]}
             mU.mkdir(cacheDirPath)
             ok = mU.doExport(saveFilePath, ssD, **cacheKwargs)
             logger.debug("Saving SIFTS summary serialized data file %s (%d) status %r", saveFilePath, len(ssD), ok)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return ssD
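A minimal, self-contained sketch of the cache round trip used above, assuming MarshalUtil can be imported from rcsb.utils.io.MarshalUtil and supports the mkdir()/doExport()/doImport() calls shown in these snippets; the directory path and example data are illustrative only.

import os

from rcsb.utils.io.MarshalUtil import MarshalUtil  # assumed import path


def roundTripCache(cacheDirPath, ssD, fmt="pickle"):
    # Serialize a summary dictionary to the cache directory and read it back.
    mU = MarshalUtil()
    ext = "pic" if fmt == "pickle" else "json"
    savePath = os.path.join(cacheDirPath, "sifts-summary.%s" % ext)
    mU.mkdir(cacheDirPath)
    ok = mU.doExport(savePath, ssD, fmt=fmt)
    return mU.doImport(savePath, fmt=fmt) if ok else {}


# e.g. roundTripCache("./CACHE/sifts", {"101M": {"UNP": ["P02185"]}}, fmt="json")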
Example #2
    def jsonSchemaCompare(self,
                          databaseName,
                          collectionName,
                          encodingType,
                          level,
                          extraOpts=None):
        """Compare computed JSON schema defintion with current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName,
                                                    collectionName,
                                                    encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName,
                             collectionName,
                             encodingType=encodingType,
                             level=level,
                             saveSchema=False,
                             extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator,
                         databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName,
                         collectionName, encodingType, level,
                         pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath,
                                    bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)

        return diffPath
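schemaCompare() itself is not included in this excerpt; the following is a rough, purely illustrative sketch (all names hypothetical) of the kind of nested dictionary comparison that could produce the (numDiff, difD) pair consumed above.

def dictCompare(d1, d2, path=""):
    # Recursively compare two nested dictionaries, keying differences by dotted path.
    difD = {}
    for key in sorted(set(d1) | set(d2), key=str):
        kp = "%s.%s" % (path, key) if path else str(key)
        if key not in d1:
            difD[kp] = {"added": d2[key]}
        elif key not in d2:
            difD[kp] = {"removed": d1[key]}
        elif isinstance(d1[key], dict) and isinstance(d2[key], dict):
            difD.update(dictCompare(d1[key], d2[key], kp)[1])
        elif d1[key] != d2[key]:
            difD[kp] = {"from": d1[key], "to": d2[key]}
    return len(difD), difD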
    def testRoundTripOps(self):
        """Test IO operation on generated related molecules"""
        try:
            oeIoU = OeIoUtils()
            mU = MarshalUtil()
            mU.mkdir(self.__molfileDirPath)
            ccMolD = self.__getChemCompDefs()
            oemf = OeMoleculeFactory()
            for ccId, ccObj in list(ccMolD.items())[:10]:
                # ----
                tId = oemf.setChemCompDef(ccObj)
                self.assertEqual(tId, ccId)
                relatedIdxD = oemf.buildRelated(limitPerceptions=False)
                logger.info("%s generated %d molecular forms", ccId,
                            len(relatedIdxD))
                for sId, idxD in relatedIdxD.items():
                    logger.info("sId %r smiles %r", sId, idxD["smiles"])
                    mol2Path = os.path.join(self.__molfileDirPath,
                                            sId + ".mol2")
                    oeMol = oeIoU.descriptorToMol(idxD["smiles"],
                                                  "oe-iso-smiles",
                                                  limitPerceptions=False,
                                                  messageTag=None)
                    oeIoU.write(mol2Path,
                                oeMol,
                                constantMol=True,
                                addSdTags=True)
                    tMolL = oeIoU.fileToMols(mol2Path)
                    #
                    nextMol2Path = os.path.join(self.__molfileDirPath,
                                                sId + "-next.mol2")
                    oeIoU.write(nextMol2Path,
                                tMolL[0],
                                constantMol=True,
                                addSdTags=True)

                    sdfPath = os.path.join(self.__molfileDirPath, sId + ".mol")
                    oeMol = oeIoU.descriptorToMol(idxD["smiles"],
                                                  "oe-iso-smiles",
                                                  limitPerceptions=False,
                                                  messageTag=None)
                    oeIoU.write(sdfPath,
                                oeMol,
                                constantMol=True,
                                addSdTags=True)
                    #
                    tMolL = oeIoU.fileToMols(sdfPath)
                    nextSdfPath = os.path.join(self.__molfileDirPath,
                                               sId + "-next.sdf")
                    oeIoU.write(nextSdfPath,
                                tMolL[0],
                                constantMol=True,
                                addSdTags=True)
                # ----
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
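A compact sketch of the descriptor-to-file round trip exercised by the test above, assuming OeIoUtils can be imported from rcsb.utils.chem.OeIoUtils (which in turn requires the OpenEye toolkits); the SMILES string and paths are arbitrary examples.

import os

from rcsb.utils.chem.OeIoUtils import OeIoUtils  # assumed import path


def roundTripSmiles(smiles, dirPath, name="example"):
    # Build an OE molecule from SMILES, write it as mol2, and read it back.
    oeIoU = OeIoUtils()
    oeMol = oeIoU.descriptorToMol(smiles, "oe-iso-smiles", limitPerceptions=False, messageTag=None)
    molPath = os.path.join(dirPath, name + ".mol2")
    oeIoU.write(molPath, oeMol, constantMol=True, addSdTags=True)
    molL = oeIoU.fileToMols(molPath)
    return molL[0] if molL else None


# e.g. roundTripSmiles("c1ccccc1O", "./CACHE/molfiles")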
Example #4
    def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
        """Rebuild the cache of ISO abbreviation term data

        Args:
            urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
            dirPath (str):  cache path
            useCache (bool):  flag to use cached files

        Returns:
            tuple: (dict) title word abbreviations
                   (dict) language conflict dictionary
                   (list) multi-word abbreviation targets

        Notes:
            ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
            https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
        """
        aD = {}
        mU = MarshalUtil(workPath=dirPath)
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
        logger.debug("Using cache data path %s", dirPath)
        mU.mkdir(dirPath)
        if not useCache:
            for fp in [isoLtwaNamePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and mU.exists(isoLtwaNamePath):
            aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
            logger.debug("Abbreviation name length %d", len(aD["abbrev"]))
     else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
            ok = fU.get(urlTargetIsoLtwa, fp)
            aD = self.__getLtwaTerms(dirPath, fp)
            ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
            logger.debug("abbrevD keys %r", list(aD.keys()))
            logger.debug("Caching %d ISO LTWA in %s status %r",
                         len(aD["abbrev"]), isoLtwaNamePath, ok)
        #
        abbrevD = aD["abbrev"] if "abbrev" in aD else {}
        conflictD = aD["conflicts"] if "conflicts" in aD else {}
        multiWordTermL = aD[
            "multi_word_abbrev"] if "multi_word_abbrev" in aD else []
        #
        return abbrevD, conflictD, multiWordTermL
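__getLtwaTerms() is not part of this excerpt; below is a rough sketch of how the tab-delimited UTF-16LE LTWA export mentioned in the docstring might be parsed (the column layout and helper name are assumptions).

import csv


def readLtwaFile(filePath):
    # Parse the tab-delimited UTF-16LE LTWA export into the cache dictionary layout used above.
    abbrevD = {}
    with open(filePath, encoding="utf-16-le", newline="") as ifh:
        for row in csv.reader(ifh, delimiter="\t"):
            if len(row) >= 2 and row[0] and row[1]:
                abbrevD[row[0].strip().lower()] = row[1].strip()
    return {"abbrev": abbrevD, "conflicts": {}, "multi_word_abbrev": []}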
Example #5
    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare computed schema defintion with current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName,
                                                dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath,
                             self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath,
                                    bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath
Example #6
    def __reload(self, urlTarget, dirPath, useCache=True):
        """Reload local cache of mapping resources to support validation report reader and translator.

        Args:
            urlTarget (list, str): URL for schema mapping file
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (dict): schema mapping dictionary
        """
        mapD = {}
        #
        mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        mappingFilePath = os.path.join(dirPath, fn)
        mU.mkdir(dirPath)
        #
        # if not useCache:
        #     for fp in [mappingFilePath]:
        #         try:
        #             os.remove(fp)
        #         except Exception:
        #             pass
        # #
        logger.debug("Loading validation mapping data in %s (useCache %r)", fn,
                     useCache)
        if useCache and fU.exists(mappingFilePath):
            mapD = mU.doImport(mappingFilePath, fmt="json")
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        mappingFilePath)
            tS = uuid.uuid4().hex
            tP = os.path.join(dirPath, "._" + tS)
            ok = fU.get(urlTarget, tP)
            if ok:
                mapD = mU.doImport(tP, fmt="json")
                os.replace(tP, mappingFilePath)
        return mapD
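A small, self-contained sketch of the fetch-to-temporary-file-then-replace pattern used above, assuming FileUtil can be imported from rcsb.utils.io.FileUtil; the helper name is hypothetical.

import os
import uuid

from rcsb.utils.io.FileUtil import FileUtil  # assumed import path


def atomicFetch(urlTarget, destFilePath):
    # Download to a hidden temporary name, then atomically install the result.
    fU = FileUtil()
    dirPath = os.path.dirname(destFilePath) or "."
    tmpPath = os.path.join(dirPath, "._" + uuid.uuid4().hex)
    ok = fU.get(urlTarget, tmpPath)
    if ok:
        os.replace(tmpPath, destFilePath)  # atomic on the same filesystem
    return ok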
class RemovedHoldingsProvider(object):
    """Provide an inventory of removed repository content."""

    def __init__(self, **kwargs):
        self.__dirPath = kwargs.get("holdingsDirPath", ".")
        useCache = kwargs.get("useCache", True)
        baseUrl = kwargs.get("baseUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/")
        urlTarget = kwargs.get("removedTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        urlFallbackTarget = kwargs.get("removedFallbackTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__invD = self.__reload(urlTarget, urlFallbackTarget, self.__dirPath, useCache=useCache)

    def testCache(self, minCount=1000):
        logger.info("Inventory length cD (%d)", len(self.__invD))
        return len(self.__invD) > minCount

    def getStatusCode(self, entryId):
        """Return the status code for the removed entry"""
        try:
            return self.__invD[entryId.upper()]["status_code"]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return None

    def getRemovedInfo(self, entryId):
        """Return the dictionary describing the details for this removed entry"""
        try:
            return self.__invD[entryId.upper()]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return {}

    def getContentTypes(self, entryId):
        """Return the removed content types for the input entry identifier"""
        try:
            return sorted(self.__invD[entryId.upper()]["content_type"].keys())
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return []

    def getContentTypePathList(self, entryId, contentType):
        """Return the removed content types for the input entry identifier"""
        try:
            return (
                self.__invD[entryId.upper()]["content_type"][contentType]
                if isinstance(self.__invD[entryId.upper()]["content_type"][contentType], list)
                else [self.__invD[entryId.upper()]["content_type"][contentType]]
            )
        except Exception as e:
            logger.debug("Failing for %r %r with %s", entryId, contentType, str(e))
        return []

    def getInventory(self):
        """Return the removed inventory dictionary"""
        try:
            return self.__invD
        except Exception as e:
            logger.debug("Failing with %s", str(e))
        return {}

    def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
        invD = {}
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            invD = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached inventory (%d)", len(invD))
        else:
            logger.info("Fetch inventory from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                ok = fU.get(urlFallbackTarget, fp)
            #
            if ok:
                invD = self.__mU.doImport(fp, fmt="json")
        #
        return invD
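A brief usage sketch for the provider class above; the cache directory is arbitrary and the entry identifier is a hypothetical placeholder, since the removed inventory contents vary.

rhP = RemovedHoldingsProvider(holdingsDirPath="./CACHE/holdings", useCache=True)
if rhP.testCache():
    print(rhP.getStatusCode("1ABC"))     # entry identifier is illustrative only
    print(rhP.getContentTypes("1ABC"))
    print("Inventory size", len(rhP.getInventory()))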
Example #8
    def search(self,
               queryTargetId,
               queryTargetPath,
               resultPath,
               normalizeFlag=True,
               maxHits=50,
               searchType="similarity",
               suppressMetals=False):
        """Search the CCDC database for similar or substructure matches for the input query molecule.

        Args:
            queryTargetId (str): query identifier
            queryTargetPath (str): path to the query molfile (mol, sdf, mol2)
            resultPath (str): output path to match results
            normalizeFlag (bool, optional): do standard perceptions on matching molecules. Defaults to True.
            maxHits (int, optional): maximum number of matches to return. Defaults to 50.
            searchType (str, optional): search mode (substructure, similarity). Defaults to "similarity".
            suppressMetals (bool, optional): filter structures containing metals. Defaults to False.

        Returns:
            (int): number of matches
        """

        mU = MarshalUtil()
        logger.info("Start search for target %s path %s result path %s",
                    queryTargetId, queryTargetPath, resultPath)
        #
        summaryList = []
        #
        targetDirPath = os.path.dirname(queryTargetPath)
        cifTargetPath = os.path.join(targetDirPath, queryTargetId + ".cif")

        #
        targetStructures = EntryReader(queryTargetPath)
        dirPath = os.path.join(resultPath, queryTargetId)
        numHits = 0
        for ii, e in enumerate(targetStructures, 1):
            numHits = 0
            startTime = time.time()
            targetMol = e.molecule
            if normalizeFlag:
                targetMol.assign_bond_types(which="unknown")
                targetMol.standardise_aromatic_bonds()
                targetMol.standardise_delocalised_bonds()
            #
            logger.info("(%d) begin %s search - query id %s", ii, searchType,
                        queryTargetId)
            if searchType == "similarity":
                hits = self.__similaritySearch(targetMol,
                                               suppressMetals=suppressMetals)
            elif searchType == "substructure":
                hits = self.__moleculeSubstructureSearch(
                    targetMol, suppressMetals=suppressMetals)
            else:
                hits = []
            logger.info("(%d) completed search query id %s in %.3f seconds",
                        ii, queryTargetId,
                        time.time() - startTime)

            if hits:
                numHits += len(hits)
                logger.info("(%d) search for %s matched %d: %r", ii,
                            queryTargetId, numHits,
                            [targetHit.identifier for targetHit in hits])

                #
                for targetHit in hits[:maxHits]:
                    #
                    hI = CcdcMatchIndexInst()
                    hI.setCsdVersion(csd_version())
                    hI.setCsdDirectory(csd_directory())
                    hI.setTargetId(queryTargetId)
                    hI.setTargetPath(queryTargetPath)
                    if mU.exists(cifTargetPath):
                        hI.setTargetCcPath(cifTargetPath)
                    hI.setIdentifier(targetHit.identifier)
                    hI.setMatchType(searchType)
                    try:
                        hI.setRFactor(targetHit.entry.r_factor)
                        hI.setChemicalName(targetHit.entry.chemical_name)
                        hI.setTemperature(targetHit.entry.temperature)
                        hI.setRadiationSource(targetHit.entry.radiation_source)
                        hI.setHasDisorder("N")
                        cit = targetHit.entry.publication
                        if cit.doi is not None:
                            hI.setCitationDOI(cit.doi)
                        if searchType == "similarity":
                            hI.setSimilarityScore(targetHit.similarity)
                        elif searchType == "substructure":
                            hI.setMatchedAtomLength(
                                len(targetHit.match_atoms()))
                    except Exception as e:
                        logger.exception("Failing with %s", str(e))
                        #
                    #
                    mU.mkdir(dirPath)
                    mol2L = []
                    if searchType == "substructure":
                        for jj, mc in enumerate(targetHit.match_components(),
                                                1):
                            fp = os.path.join(
                                dirPath, queryTargetId + "_" +
                                targetHit.identifier + "_%03d" % jj + ".mol2")
                            mol2L.append(fp)
                            with MoleculeWriter(fp) as ofh:
                                ofh.write(mc)
                            # Replace the title line
                            with open(fp) as fin:
                                lines = fin.readlines()
                            lines[1] = lines[1].replace(
                                "00", targetHit.identifier)
                            #
                            with open(fp, "w") as fout:
                                fout.write("".join(lines))
                            #
                            fp = os.path.join(
                                dirPath, queryTargetId + "_" +
                                targetHit.identifier + "_%03d" % jj + ".sdf")
                            with MoleculeWriter(fp) as ofh:
                                ofh.write(mc)

                            # Replace the title line
                            with open(fp) as fin:
                                lines = fin.readlines()
                            lines[0] = lines[0].replace(
                                "00", targetHit.identifier)
                            #
                            with open(fp, "w") as fout:
                                fout.write("".join(lines))
                        #
                        #  Check for multiple generated result files -
                        #
                        for jj, fp in enumerate(mol2L, 1):
                            logger.debug("(%d) adding component fp %s", jj, fp)
                            hI.setMatchNumber(jj)
                            hI.setMol2Path(fp)
                            tt = fp[:-4] + "sdf"
                            hI.setMolPath(tt)
                            summaryList.append(copy.deepcopy(hI.get()))
                            #
                    else:
                        hI.setMatchNumber(1)
                        summaryList.append(copy.deepcopy(hI.get()))
            else:
                logger.info("(%d) search for %s returns no matches", ii,
                            targetMol.identifier)
                hits = None
        #
        if numHits > 0:
            mU.mkdir(dirPath)
            fp = os.path.join(dirPath, queryTargetId + "-index.json")
            cmI = CcdcMatchIndex(indexFilePath=fp, verbose=self.__verbose)
            cmI.load(summaryList)
            cmI.writeIndex()

        return numHits
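The title-line rewrite above is performed twice (once for the mol2 output and once for the sdf output); a small helper capturing that pattern might look like the following (purely illustrative, not part of the original method).

def replaceTitleLine(filePath, identifier, lineIndex):
    # Rewrite one header line of a small text file, substituting the CSD identifier for the "00" placeholder.
    with open(filePath) as fin:
        lines = fin.readlines()
    lines[lineIndex] = lines[lineIndex].replace("00", identifier)
    with open(filePath, "w") as fout:
        fout.write("".join(lines))


# e.g. replaceTitleLine(mol2FilePath, targetHit.identifier, 1) for the mol2 output,
#      replaceTitleLine(sdfFilePath, targetHit.identifier, 0) for the sdf output.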
class EntityPolymerExtractor(object):
    """Utilities to extract polymer related data from entry and entity collections."""

    def __init__(self, cfgOb, **kwargs):
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__mU = MarshalUtil()
        self.__entryD, self.__authAsymIdIndex = self.__rebuildCache(**kwargs)
        #

    def __rebuildCache(self, **kwargs):
        useCache = kwargs.get("useCache", True)
        dirPath = kwargs.get("exdbDirPath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "entity-polymer-extracted-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        #
        cD = {"entryD": {}, "authIdxD": {}}
        try:
            self.__mU.mkdir(dirPath)
            if not useCache:
                for fp in [cacheFilePath]:
                    try:
                        os.remove(fp)
                    except Exception:
                        pass

            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                entryD = self.__selectEntries(**kwargs)
                entryD = self.__selectPolymerEntities(entryD, **kwargs)
                authIdxD = self.__buildIndices(entryD)
                cD["entryD"] = entryD
                cD["authIdxD"] = authIdxD
                if cacheFilePath:
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved entity-polymer extracted results (%d) status %r in %s", len(entryD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD["entryD"], cD["authIdxD"]

    def __buildIndices(self, entryD):
        indD = {}
        for entryId, eD in entryD.items():
            entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
            for entityId, pD in entityD.items():
                for authAsymId in pD["auth_asym_ids"]:
                    # avoid tuples for json serialization
                    # indD[(entryId, authAsymId)] = entityId
                    indD[entryId + "_" + authAsymId] = entityId
        return indD

    def getEntryCount(self):
        return len(self.__entryD)

    def getRefSeqAccessions(self, dbName):
        acL = []
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    for dD in pD["struct_ref"]:
                        if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
                            acL.append(dD["pdbx_db_accession"])
            return list(set(acL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return acL

    def countRefSeqAccessions(self, dbName):
        cD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    iCount = 0
                    for dD in pD["struct_ref"]:
                        if "pdbx_db_accession" in dD and dD["db_name"] == dbName:
                            iCount += 1
                    cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return cD

    def countRefSeqAccessionDbType(self):
        cD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    for dD in pD["struct_ref"]:
                        if "pdbx_db_accession" in dD and "db_name" in dD:
                            cD[dD["db_name"]] = cD[dD["db_name"]] + 1 if dD["db_name"] in cD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return cD

    def countRefSeqAccessionAny(self):
        cD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    iCount = len(pD["struct_ref"])
                    # if iCount == 0:
                    #    logger.info("entryId %r " % (entryId, entityId))
                    cD[iCount] = cD[iCount] + 1 if iCount in cD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return cD
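    # The per-count tallies above can also be expressed with collections.Counter; the
    # following is a hedged, illustrative alternative (method name hypothetical, not
    # part of the original class) with behavior equivalent to countRefSeqAccessionAny().
    def countRefSeqAccessionAnyCounter(self):
        from collections import Counter

        counts = Counter()
        for eD in self.__entryD.values():
            for pD in eD.get("selected_polymer_entities", {}).values():
                counts[len(pD.get("struct_ref", []))] += 1
        return dict(counts)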

    def getUniqueTaxons(self):
        #
        tD = {}
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    # logger.info("Entity dictionary %r", pD.keys())
                    if "rcsb_entity_source_organism" in pD:
                        for dd in pD["rcsb_entity_source_organism"]:
                            if "ncbi_taxonomy_id" in dd:
                                tD[dd["ncbi_taxonomy_id"]] = tD[dd["ncbi_taxonomy_id"]] + 1 if dd["ncbi_taxonomy_id"] in tD else 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        logger.info("Taxon coverage %d", len(tD))
        return tD

    def getOrigTaxons(self):
        #
        tD = {}
        try:
            for entryId, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for entityId, pD in entityD.items():
                    # logger.info("Entity dictionary %r", pD.keys())
                    if "original_taxonomy_ids" in pD:
                        for tV in pD["original_taxonomy_ids"]:
                            tD.setdefault(entryId, []).append((entityId, tV))
                if entryId not in tD:
                    logger.debug("No taxonomy for %s", entryId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        logger.info("Taxon coverage %d", len(tD))
        return tD

    def countRefSeqAccessionByTaxon(self, dbNameList=None):
        #
        tD = {}
        iCount = 0
        #
        try:
            for _, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for _, pD in entityD.items():
                    # logger.info("Entity dictionary %r", pD.keys())
                    if "rcsb_entity_source_organism" in pD:
                        for dd in pD["rcsb_entity_source_organism"]:
                            if "ncbi_taxonomy_id" in dd:
                                tId = dd["ncbi_taxonomy_id"]
                                for dD in pD["struct_ref"]:
                                    if "pdbx_db_accession" in dD and "db_name" in dD:
                                        if dD["db_name"] in dbNameList:
                                            tD.setdefault(tId, []).append(dD["pdbx_db_accession"])
                                        iCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        logger.info("Total observed accessions %d", iCount)
        return tD

    def checkRefSeqAlignRange(self, dbName):
        ok = True
        try:
            eCount = 0
            aCount = 0
            tCount = 0
            for entryId, eD in self.__entryD.items():
                entityD = eD["selected_polymer_entities"] if "selected_polymer_entities" in eD else {}
                for entityId, pD in entityD.items():
                    for dD in pD["struct_ref"]:
                        if "db_name" in dD and dD["db_name"] == dbName:
                            if "pdbx_db_accession" in dD and "alignD" in dD and "pdbx_seq_one_letter_code" in dD and "pdbx_align_begin" in dD:
                                seqLen = len(dD["pdbx_seq_one_letter_code"])
                                dbBegin = 100000000
                                dbEnd = -1
                                refSeqDbBegin = dD["pdbx_align_begin"]
                                for authAsymId, alDL in dD["alignD"].items():
                                    tCount += 1
                                    difL = []
                                    for alD in alDL:
                                        tBeg = alD["db_align_beg"]
                                        tEnd = alD["db_align_end"]
                                        tDif = tEnd - tBeg + 1
                                        difL.append(tDif)
                                        dbBegin = min(tBeg, dbBegin)
                                        dbEnd = max(tEnd, dbEnd)

                                        # range calculation may be off by one -
                                        # if seqLen < dbEnd - dbBegin + 1:
                                        if seqLen < dbEnd - dbBegin and not refSeqDbBegin == dbBegin:
                                            fDif = sum(difL)
                                            logger.debug(
                                                "Bad alignment for %r %r %r %r (%d) seqLen %r (%d) dbBegin %r dbEnd %r difL %r tDif %r",
                                                entryId,
                                                entityId,
                                                authAsymId,
                                                alD["pdbx_strand_id"],
                                                len(alDL),
                                                seqLen,
                                                dbEnd - dbBegin + 1,
                                                dbBegin,
                                                dbEnd,
                                                difL,
                                                fDif,
                                            )
                                            aCount += 1

                            else:
                                eCount += 1
            logger.info("Incomplete %s struct_ref record count %d", dbName, eCount)
            logger.info("Inconsistent %s db reference alignments %d/%d", dbName, aCount, tCount)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False

        return ok

    def getEntityRefSeqAccessions(self, dbName, entryId, entityId):
        acL = []
        try:
            dL = self.__entryD[entryId]["selected_polymer_entities"][entityId]["struct_ref"]
            acL = list(set([d["pdbx_db_accession"] for d in dL if d["db_name"] == dbName]))
        except Exception as e:
            logger.exception("Failing with %s %r %r %s", dbName, entryId, entityId, str(e))
        return acL

    def __selectEntries(self, **kwargs):
        """Return a dictionary of PDB entries satifying the input conditions (e.g. method, resolution limit)"""

        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("entrySelectionQuery", {})
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["rcsb_entry_container_identifiers"]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for dD in dL:
                        #
                        if (
                            ("rcsb_entry_container_identifiers" in dD)
                            and ("entry_id" in dD["rcsb_entry_container_identifiers"])
                            and ("polymer_entity_ids" in dD["rcsb_entry_container_identifiers"])
                            and dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]
                        ):
                            entryD[dD["rcsb_entry_container_identifiers"]["entry_id"]] = {"polymer_entity_ids": dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]}

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
        #

    def __selectPolymerEntities(self, entryD, **kwargs):
        """Skeleton entity selector recovering essential biological sequence mapping features
        for macromolecules (default type = protein).

         "1CP9": {
             "polymer_entity_ids": [
                "1",
                "2"
             ],
             "selected_polymer_entities": {
                "1": {
                   "rcsb_multiple_source_flag": "N",
                   "asym_ids": [
                      "A"
                   ],
                   "auth_asym_ids": [
                      "A"
                   ],
                   "entity_id": "1",
                   "type": "polypeptide(L)",
                   "rcsb_entity_polymer_type": "Protein",
                   "rcsb_entity_source_organism": [
                      {
                         "ncbi_taxonomy_id": 587,
                         "beg_seq_num": 1,
                         "end_seq_num": 205,
                         "ncbi_scientific_name": "Providencia rettgeri"
                      }
                   ],
                   "struct_ref": [
                      {
                         "id": "1",
                         "db_name": "UNP",
                         "pdbx_db_accession": "Q7WZI9",
                         "entity_id": "1",
                         "pdbx_seq_one_letter_code": "QSTQIKIERDNYGVPHIYANDTYSLFYGYGYA...",
                         "alignD": {
                            "A": [
                               {
                                  "align_id": "1",
                                  "ref_id": "1",
                                  "pdbx_PDB_id_code": "1CP9",
                                  "pdbx_strand_id": "A",
                                  "seq_align_beg": 1,
                                  "seq_align_end": 205,
                                  "pdbx_db_accession": "Q7WZI9",
                                  "db_align_beg": 24,
                                  "db_align_end": 228,
                                  "pdbx_auth_seq_align_beg": "1",
                                  "pdbx_auth_seq_align_end": "205",
                                  "rcsb_entity_id": "1"
                               }
                            ]
                         }
                      }
                   ]
                },
            "2": {
                   "rcsb_multiple_source_flag": "N",
                   "asym_ids": [
                      "B"
                   ],
                   "auth_asym_ids": [
                      "B"
                   ],
                   "entity_id": "2",
                   "type": "polypeptide(L)",
                   "rcsb_entity_polymer_type": "Protein",
                   "rcsb_entity_source_organism": [
                      {
                         "ncbi_taxonomy_id": 587,
                         "beg_seq_num": 1,
                         "end_seq_num": 553,
                         "ncbi_scientific_name": "Providencia rettgeri"
                      }
                   ],
                   "struct_ref": [
                      {
                         "id": "2",
                         "db_name": "UNP",
                         "pdbx_db_accession": "Q7WZI9",
                         "entity_id": "2",
                         "pdbx_seq_one_letter_code": "SNVWLVGKTKASGAKAILLNGPQFGWFNPAYTYGIGLHG",
                         "alignD": {
                            "B": [
                               {
                                  "align_id": "2",
                                  "ref_id": "2",
                                  "pdbx_PDB_id_code": "1CP9",
                                  "pdbx_strand_id": "B",
                                  "seq_align_beg": 1,
                                  "seq_align_end": 553,
                                  "pdbx_db_accession": "Q7WZI9",
                                  "db_align_beg": 285,
                                  "db_align_end": 837,
                                  "pdbx_auth_seq_align_beg": "1",
                                  "pdbx_auth_seq_align_end": "553",
                                  "rcsb_entity_id": "2"
                               }
                            ]
                         }
                      }
                   ]
                }
             }
           },

        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")

        entryLimit = kwargs.get("entryLimit", None)
        selectionQueryD = kwargs.get("entitySelectionQuery", {"entity_poly.rcsb_entity_polymer_type": "Protein"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity.rcsb_multiple_source_flag",
                        "entity_poly.type",
                        "entity_poly.rcsb_entity_polymer_type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "rcsb_entity_source_organism.beg_seq_num",
                        "rcsb_entity_source_organism.end_seq_num",
                        "struct_ref.id",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_align_begin",
                        "struct_ref_seq",
                        #
                        "entity_src_nat.pdbx_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_host_org_ncbi_taxonomy_id",
                        "pdbx_entity_src_syn.ncbi_taxonomy_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        #
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                        qD.update(selectionQueryD)
                        #
                        dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                        eD = {}
                        for ii, dD in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dD)
                            if "entity" in dD:
                                rD["rcsb_multiple_source_flag"] = dD["entity"]["rcsb_multiple_source_flag"] if "rcsb_multiple_source_flag" in dD["entity"] else "N"
                            #
                            if "rcsb_polymer_entity_container_identifiers" in dD:
                                rD["asym_ids"] = dD["rcsb_entity_container_identifiers"]["asym_ids"] if "asym_ids" in dD["rcsb_entity_container_identifiers"] else []
                                rD["auth_asym_ids"] = dD["rcsb_entity_container_identifiers"]["auth_asym_ids"] if "auth_asym_ids" in dD["rcsb_entity_container_identifiers"] else []
                                rD["entity_id"] = dD["rcsb_entity_container_identifiers"]["entity_id"]
                            #
                            if "entity_poly" in dD:
                                rD["type"] = dD["entity_poly"]["type"] if "type" in dD["entity_poly"] else None
                                rD["rcsb_entity_polymer_type"] = dD["entity_poly"]["rcsb_entity_polymer_type"] if "rcsb_entity_polymer_type" in dD["entity_poly"] else None
                                rD["entity_polymer_length"] = len(dD["entity_poly"]["pdbx_seq_one_letter_code_can"]) if "pdbx_seq_one_letter_code_can" in dD["entity_poly"] else 0
                            #
                            tL = []
                            if "rcsb_entity_source_organism" in dD:
                                for tD in dD["rcsb_entity_source_organism"]:
                                    tL.append(tD)
                            rD["rcsb_entity_source_organism"] = copy.copy(tL)
                            #
                            qDL = []
                            if "struct_ref" in dD:
                                for tD in dD["struct_ref"]:
                                    if "db_name" in tD:
                                        tD["db_name"] = str(tD["db_name"]).upper().strip()
                                        tD["db_name"] = "UNP" if tD["db_name"] in ["TREMBL"] else tD["db_name"]
                                    qDL.append(tD)
                                if "struct_ref_seq" in dD:
                                    for qD in qDL:
                                        refId = qD["id"]
                                        alignL = []
                                        for tD in dD["struct_ref_seq"]:
                                            if refId == tD["ref_id"]:
                                                alignL.append(tD)
                                        # qD['align_list'] = copy.copy(aL)
                                        for align in alignL:
                                            authAsymId = align["pdbx_strand_id"]
                                            qD.setdefault("alignD", {}).setdefault(authAsymId, []).append(align)

                            rD["struct_ref"] = qDL
                            #
                            taxIdL = []
                            if "entity_src_nat" in dD:
                                for tD in dD["entity_src_nat"]:
                                    if "pdbx_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_ncbi_taxonomy_id"])
                            if "entity_src_gen" in dD:
                                for tD in dD["entity_src_gen"]:
                                    if "pdbx_gene_src_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_gene_src_ncbi_taxonomy_id"])
                                    if "pdbx_host_org_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_host_org_ncbi_taxonomy_id"])
                            if "pdbx_entity_src_syn" in dD:
                                for tD in dD["pdbx_entity_src_syn"]:
                                    if "ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["ncbi_taxonomy_id"])
                            qL = []
                            for taxId in taxIdL:
                                ttL = [int(t.strip()) for t in taxId.split(",") if t.strip().isdigit()]
                                qL.extend(ttL)
                            logger.debug("TaxId list %r", qL)
                            rD["original_taxonomy_ids"] = copy.copy(list(set(qL)))
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 1000 == 0:
                            logger.info("Completed fetch %d/%d entries", iCount, len(entryD))
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return entryD
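A hedged usage sketch for the extractor class above; cfgOb is assumed to be a configuration object exposing the MONGO_DB resource, the helper name is hypothetical, and the keyword names follow those read by __rebuildCache() and __selectPolymerEntities().

def summarizeExtraction(cfgOb, cachePath="./CACHE/exdb"):
    # Build (or reload) the extraction cache and log a few summary counts.
    epe = EntityPolymerExtractor(cfgOb, exdbDirPath=cachePath, useCache=True, cacheKwargs={"fmt": "json", "indent": 3})
    logger.info("Entry count %d", epe.getEntryCount())
    logger.info("Unique UNP accessions %d", len(epe.getRefSeqAccessions("UNP")))
    logger.info("Taxon coverage %d", len(epe.getUniqueTaxons()))
    return epe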
class ReferenceSequenceAssignmentUpdater(object):
    """Selected utilities to update reference sequence assignments information
    in the core_entity collection.

    """
    def __init__(self,
                 cfgOb,
                 databaseName="pdbx_core",
                 collectionName="pdbx_core_polymer_entity",
                 polymerType="Protein",
                 referenceDatabaseName="UniProt",
                 provSource="PDB",
                 **kwargs):
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__databaseName = databaseName
        self.__collectionName = collectionName
        self.__statusList = []
        #
        self.__ssP = self.__fetchSiftsSummaryProvider(
            self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__assignRefD, self.__refD, self.__matchD = self.__reload(
            databaseName, collectionName, polymerType, referenceDatabaseName,
            provSource, **kwargs)

    def __reload(self, databaseName, collectionName, polymerType,
                 referenceDatabaseName, provSource, **kwargs):
        assignRefD = self.__getPolymerReferenceSequenceAssignments(
            databaseName, collectionName, polymerType, **kwargs)
        # get refIdD = {refId: [entity_id, ....], }
        refIdD, _ = self.__getUniqueAssignments(
            assignRefD,
            referenceDatabaseName=referenceDatabaseName,
            provSource=provSource)
        #
        refD, matchD = self.__rebuildReferenceCache(referenceDatabaseName,
                                                    list(refIdD.keys()),
                                                    **kwargs)
        return assignRefD, refD, matchD

    def doUpdate(self, updateId, updateLimit=None):
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        numUpd = 0
        updateDL = self.__buildUpdate(self.__assignRefD)
        if updateDL:
            if updateLimit:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL[:updateLimit],
                                         self.__databaseName,
                                         self.__collectionName)
            else:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL,
                                         self.__databaseName,
                                         self.__collectionName)
        self.__updateStatus(updateId, self.__databaseName,
                            self.__collectionName, True, statusStartTimestamp)
        return len(updateDL), numUpd

    def __doUpdate(self, cfgOb, updateDL, databaseName, collectionName):
        obUpd = ObjectUpdater(cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.info("Update count is %d", numUpd)

        return numUpd

    def __getPolymerReferenceSequenceAssignments(self, databaseName,
                                                 collectionName, polymerType,
                                                 **kwargs):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
         (dict): {"1abc_1": "rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
                            "rcsb_polymer_entity_align": [],
                            "rcsb_entity_source_organism"" {"ncbi_taxonomy_id": []}
        """
        cachePath = kwargs.get("cachePath", ".")
        exDbDir = "exdb"
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        cacheFilePath = os.path.join(cachePath, exDbDir,
                                     "entity-poly-ref-seq-assign-cache.json")
        #
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=cacheFilePath,
                useCache=useCache,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=cacheKwargs,
                objectLimit=fetchLimit,
                selectionQuery={
                    "entity_poly.rcsb_entity_polymer_type": polymerType
                },
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_entity_container_identifiers.auth_asym_ids",
                    "rcsb_polymer_entity_align",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Entity count is %d", eCount)
            objD = obEx.getObjects()
            logger.info(
                "Reading polymer entity entity count %d ref accession length %d ",
                eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName,
                             collectionName, str(e))
        return objD

    def __getUniqueAssignments(self,
                               objD,
                               referenceDatabaseName="UniProt",
                               provSource="PDB"):
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_entity_container_identifiers"]
                                        ["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD[
                            "provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(
                                eD["rcsb_entity_source_organism"][ii]
                                ["ncbi_taxonomy_id"])
                        except Exception:
                            logger.warning("Failing taxonomy lookup for %s %r",
                                           entityKey, tD["database_accession"])

                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s",
                             entityKey, str(e))
        #
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.info(
                    "Multitple taxIds assigned to reference sequence id %s: %r",
                    refId, taxIdL)

        logger.info("Unique %s accession assignments by %s %d (missing %d) ",
                    referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD

    def __reMapAccessions(self,
                          rsiDL,
                          referenceDatabaseName="UniProt",
                          provSourceL=None,
                          excludeReferenceDatabases=None):
        """Internal method to re-map accessions for the input databae and assignment source

        Args:
            rsiDL (list): list of reference sequence identifier dictionaries
            referenceDatabaseName (str, optional): reference resource database name. Defaults to 'UniProt'.
            provSourceL (list, optional): assignment provenance sources. Defaults to ['PDB'].

        Returns:
            bool, bool, list: flags for complete mapping and full matching, and the remapped (and unmapped) accessions in the input object list
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else [
            "PDB"
        ]
        provSourceL = provSourceL if provSourceL else []
        retDL = []
        for rsiD in rsiDL:
            if rsiD["database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if rsiD["database_name"] == referenceDatabaseName and rsiD[
                    "provenance_source"] in provSourceL:
                try:
                    if len(self.__matchD[rsiD["database_accession"]]
                           ["matchedIds"]) == 1:
                        rsiD["database_accession"] = self.__matchD[
                            rsiD["database_accession"]]["matchedIds"][0]
                        matched += 1
                    else:
                        logger.info(
                            "Skipping mapping to multiple superseding accessions %s",
                            rsiD["database_accession"])
                    #
                except Exception:
                    unMapped += 1
            retDL.append(rsiD)
        if matched == len(retDL):
            isMatched = True
        return not unMapped, isMatched, retDL
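
    # A minimal sketch of the assumed self.__matchD layout that drives the remapping
    # above (shape inferred from the "matchedIds" usage in this module; values are
    # illustrative only):
    #
    #   self.__matchD = {
    #       "Q00001": {"matched": "secondary", "matchedIds": ["P12345"]},            # remapped 1:1
    #       "Q00002": {"matched": "secondary", "matchedIds": ["P11111", "P22222"]},  # skipped (ambiguous)
    #   }
    #
    # Only accessions resolving to exactly one superseding id are rewritten in place.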

    def __reMapAlignments(self,
                          alignDL,
                          referenceDatabaseName="UniProt",
                          provSourceL=None,
                          excludeReferenceDatabases=None):
        """Internal method to re-map alignments for the input databae and assignment source

        Args:
            alignDL (list): list of aligned regions
            databaseName (str, optional): resource database name. Defaults to 'UniProt'.
            provSourceL (list, optional): assignment provenance. Defaults to 'PDB'.

        Returns:
            bool, list: flag for mapping success, and remapped (and unmapped) accessions in the input align list
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else [
            "PDB"
        ]
        retDL = []
        provSourceL = provSourceL if provSourceL else []
        for alignD in alignDL:
            if alignD["reference_database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if alignD[
                    "reference_database_name"] == referenceDatabaseName and alignD[
                        "provenance_code"] in provSourceL:
                try:
                    if len(self.__matchD[
                            alignD["reference_database_accession"]]
                           ["matchedIds"]) == 1:
                        alignD["reference_database_accession"] = self.__matchD[
                            alignD["reference_database_accession"]][
                                "matchedIds"][0]
                        matched += 1
                    else:
                        logger.info(
                            "Skipping alignment mapping to multiple superseding accessions %s",
                            alignD["reference_database_accession"])
                except Exception:
                    unMapped += 1
            retDL.append(alignD)
        if matched == len(retDL):
            isMatched = True
        #
        return not unMapped, isMatched, retDL

    def __getSiftsAccessions(self, entityKey, authAsymIdL):
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), _ in saoLD.items():
            retL.append({
                "database_name": "UniProt",
                "database_accession": dbAccession,
                "provenance_source": "SIFTS"
            })
        return retL

    def __getSiftsAlignments(self, entityKey, authAsymIdL):
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), saoL in saoLD.items():
            dD = {
                "reference_database_name": "UniProt",
                "reference_database_accession": dbAccession,
                "provenance_code": "SIFTS",
                "aligned_regions": []
            }
            for sao in saoL:
                dD["aligned_regions"].append({
                    "ref_beg_seq_id":
                    sao.getDbSeqIdBeg(),
                    "entity_beg_seq_id":
                    sao.getEntitySeqIdBeg(),
                    "length":
                    sao.getEntityAlignLength()
                })
            retL.append(dD)
        return retL
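
    # For reference, each alignment dictionary assembled above has this shape
    # (values are illustrative only):
    #
    #   {
    #       "reference_database_name": "UniProt",
    #       "reference_database_accession": "P69905",
    #       "provenance_code": "SIFTS",
    #       "aligned_regions": [{"ref_beg_seq_id": 1, "entity_beg_seq_id": 1, "length": 141}],
    #   }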

    def __buildUpdate(self, assignRefD):
        #
        updateDL = []
        for entityKey, eD in assignRefD.items():
            selectD = {"rcsb_id": entityKey}
            try:
                updateD = {}
                authAsymIdL = []
                ersDL = (eD["rcsb_entity_container_identifiers"]
                         ["reference_sequence_identifiers"]
                         if "reference_sequence_identifiers"
                         in eD["rcsb_entity_container_identifiers"] else None)
                #
                #
                if ersDL:
                    authAsymIdL = eD["rcsb_entity_container_identifiers"][
                        "auth_asym_ids"]
                    isMapped, isMatched, updErsDL = self.__reMapAccessions(
                        ersDL,
                        referenceDatabaseName="UniProt",
                        provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAccessions(entityKey, authAsymIdL)
                        if tL:
                            logger.debug(
                                "Using SIFTS accession mapping for %s",
                                entityKey)
                        else:
                            logger.info(
                                "No alternative SIFTS accession mapping for %s",
                                entityKey)
                        updErsDL = tL if tL else []
                    #
                    if len(updErsDL) < len(ersDL):
                        logger.info(
                            "Incomplete reference sequence mapping update for %s",
                            entityKey)
                    updateD[
                        "rcsb_entity_container_identifiers.reference_sequence_identifiers"] = updErsDL
                #
                alignDL = eD[
                    "rcsb_polymer_entity_align"] if "rcsb_polymer_entity_align" in eD else None
                if alignDL and authAsymIdL:
                    isMapped, isMatched, updAlignDL = self.__reMapAlignments(
                        alignDL,
                        referenceDatabaseName="UniProt",
                        provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAlignments(entityKey, authAsymIdL)
                        if tL:
                            logger.debug(
                                "Using SIFTS alignment mapping for %s",
                                entityKey)
                        else:
                            logger.info(
                                "No alternative SIFTS alignment mapping for %s",
                                entityKey)
                        updAlignDL = tL if tL else updAlignDL
                    #
                    if len(updAlignDL) < len(alignDL):
                        logger.info(
                            "Incomplete alignment mapping update for %s",
                            entityKey)
                    updateD["rcsb_polymer_entity_align"] = updAlignDL
                #
                if updateD:
                    updateDL.append({"selectD": selectD, "updateD": updateD})
            except Exception as e:
                logger.exception("Mapping error for %s with %s", entityKey,
                                 str(e))
        #
        return updateDL
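
    # Each update record produced above pairs a selection filter with the replacement
    # content for the corresponding document, e.g. (illustrative values):
    #
    #   {
    #       "selectD": {"rcsb_id": "1ABC_1"},
    #       "updateD": {
    #           "rcsb_entity_container_identifiers.reference_sequence_identifiers": [...],
    #           "rcsb_polymer_entity_align": [...],
    #       },
    #   }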

    def __rebuildReferenceCache(self, refDbName, idList, **kwargs):
        """ """
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            # Check for completeness -
            missingS = set(dD["refDbCache"].keys()) - set(idList)
            if missingS:
                logger.info("Reference sequence cache missing %d accessions",
                            len(missingS))
                extraD = self.__fetchReferenceEntries(refDbName,
                                                      list(missingS),
                                                      saveText=saveText,
                                                      fetchLimit=fetchLimit)
                dD["refDbCache"].update(extraD["refDbCache"])
                dD["matchInfo"].update(extraD["matchInfo"])
                if cacheFilePath and cacheKwargs:
                    self.__mU.mkdir(dirPath)
                    ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                    logger.info("Cache updated with status %r", ok)
            #
        else:
            dD = self.__fetchReferenceEntries(refDbName,
                                              idList,
                                              saveText=saveText,
                                              fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)

        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self,
                                refDbName,
                                idList,
                                saveText=False,
                                fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList),
                        refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {
                    "refDbName": refDbName,
                    "refDbCache": refD,
                    "matchInfo": matchD
                }

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        abbreviated = kwargs.get("siftsAbbreviated", "PROD")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH",
                                             sectionName=configName)
        # logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(
            cachePath,
            cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath,
                                   cacheDirPath=cacheDirPath,
                                   useCache=useCache,
                                   abbreviated=abbreviated,
                                   cacheKwargs=cacheKwargs)
        logger.info("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __dumpEntries(self, refD):
        for (eId, eDict) in refD.items():
            logger.info("------ Reference id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def __getUpdateAssignmentCandidates(self, objD):
        totCount = 0
        difCount = 0
        pdbUnpIdD = defaultdict(list)
        siftsUnpIdD = defaultdict(list)
        assignIdDifD = defaultdict(list)
        #
        for entityKey, eD in objD.items():
            try:
                siftsS = set()
                pdbS = set()
                for tD in eD["rcsb_entity_container_identifiers"][
                        "reference_sequence_identifiers"]:
                    if tD["database_name"] == "UniProt":
                        if tD["provenance_source"] == "SIFTS":
                            siftsS.add(tD["database_accession"])
                            siftsUnpIdD[tD["database_accession"]].append(
                                entityKey)
                        elif tD["provenance_source"] == "PDB":
                            pdbS.add(tD["database_accession"])
                            pdbUnpIdD[tD["database_accession"]].append(
                                entityKey)
                    else:
                        logger.debug("No UniProt for %r",
                                     eD["rcsb_entity_container_identifiers"])
                logger.debug("PDB assigned sequence length %d", len(pdbS))
                logger.debug("SIFTS assigned sequence length %d", len(siftsS))

                if pdbS and siftsS:
                    totCount += 1
                    if pdbS != siftsS:
                        difCount += 1
                        for idV in pdbS:
                            assignIdDifD[idV].append(entityKey)

            except Exception as e:
                logger.warning("No identifiers for %s with %s", entityKey,
                               str(e))
        #
        logger.info("Total %d differences %d", totCount, difCount)
        logger.info("Unique UniProt accession assignments PDB %d  SIFTS %d",
                    len(pdbUnpIdD), len(siftsUnpIdD))
        logger.info("Current unique overalapping assignment differences %d ",
                    len(assignIdDifD))
        logger.info("Current unique overalapping assignment differences %r ",
                    assignIdDifD)
        return assignIdDifD, pdbUnpIdD, siftsUnpIdD

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary:  %d secondary: %d none %d", numPrimary,
                     numSecondary, numNone)
        return numPrimary, numSecondary, numNone

    def getLoadStatus(self):
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status,
                       startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
Beispiel #11
0
    def searchSmarts(self,
                     queryTargetId,
                     smarts,
                     resultPath,
                     maxHits=50,
                     suppressMetals=False):
        """Search the CCDC database for substructure matches for the input SMARTS pattern.

        Args:
            queryTargetId (str): query identifier
            smarts (str): SMARTS search pattern (non-stereo)
            resultPath (str): output path to match results
            maxHits (int, optional): maximum number of matches to return. Defaults to 50.
            suppressMetals (bool, optional): filter structures containing metals. Defaults to False.

        Returns:
            (int): number of matches
        """

        mU = MarshalUtil()
        logger.info("Start smarts search for target %s result path %s",
                    queryTargetId, resultPath)
        #
        ii = 1
        searchType = "substructure"
        summaryList = []
        dirPath = os.path.join(resultPath, queryTargetId)
        numHits = 0
        startTime = time.time()
        logger.info("(%d) begin %s search - query id %s", ii, searchType,
                    queryTargetId)

        if searchType == "substructure":
            hits = self.__smartsSubstructureSearch(
                smarts, suppressMetals=suppressMetals)
        else:
            hits = []
        logger.info("(%d) completed search query id %s in %.3f seconds", ii,
                    queryTargetId,
                    time.time() - startTime)

        if hits:
            numHits += len(hits)
            logger.info("(%d) search for %s matched %d: %r", ii, queryTargetId,
                        numHits, [targetHit.identifier for targetHit in hits])

            #
            for targetHit in hits[:maxHits]:
                #
                hI = CcdcMatchIndexInst()
                hI.setTargetId(queryTargetId)
                hI.setIdentifier(targetHit.identifier)
                hI.setMatchType(searchType)
                try:
                    hI.setRFactor(targetHit.entry.r_factor)
                    hI.setChemicalName(targetHit.entry.chemical_name)
                    hI.setTemperature(targetHit.entry.temperature)
                    hI.setRadiationSource(targetHit.entry.radiation_source)
                    hI.setHasDisorder("N")
                    cit = targetHit.entry.publication
                    if cit.doi is not None:
                        hI.setCitationDOI(cit.doi)
                    if searchType == "similarity":
                        hI.setSimilarityScore(targetHit.similarity)
                    elif searchType == "substructure":
                        hI.setMatchedAtomLength(len(targetHit.match_atoms()))
                except Exception as e:
                    logger.exception("Failing with %s", str(e))
                    #
                #
                mU.mkdir(dirPath)
                mol2L = []
                for jj, mc in enumerate(targetHit.molecule.components, 1):
                    fp = os.path.join(
                        dirPath, queryTargetId + "_" + targetHit.identifier +
                        "_%03d" % jj + ".mol2")
                    mol2L.append(fp)
                    with MoleculeWriter(fp) as ofh:
                        ofh.write(mc)
                    # Replace the title line
                    with open(fp) as fin:
                        lines = fin.readlines()
                    lines[1] = lines[1].replace("00", targetHit.identifier)
                    #
                    with open(fp, "w") as fout:
                        fout.write("".join(lines))
                    #
                    fp = os.path.join(
                        dirPath, queryTargetId + "_" + targetHit.identifier +
                        "_%03d" % jj + ".sdf")
                    with MoleculeWriter(fp) as ofh:
                        ofh.write(mc)
                    # Replace the title line
                    with open(fp) as fin:
                        lines = fin.readlines()
                    lines[0] = lines[0].replace("00", targetHit.identifier)
                    #
                    with open(fp, "w") as fout:
                        fout.write("".join(lines))
                #
                #  Check for multiple generated result files -
                #
                for jj, fp in enumerate(mol2L, 1):
                    logger.debug("(%d) adding component fp %s", jj, fp)
                    hI.setMatchNumber(jj)
                    hI.setMol2Path(fp)
                    tt = fp[:-4] + "sdf"
                    hI.setMolPath(tt)
                    summaryList.append(copy.deepcopy(hI.get()))
                    #
        else:
            logger.info("(%d) se sarch for %s returns no matches", ii,
                        queryTargetId)
            hits = None
        #
        if numHits > 0:
            mU.mkdir(dirPath)
            fp = os.path.join(dirPath, queryTargetId + "-index.json")
            cmI = CcdcMatchIndex(indexFilePath=fp, verbose=self.__verbose)
            cmI.load(summaryList)
            cmI.writeIndex()

        return numHits
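
    # A minimal usage sketch (illustrative query id, SMARTS pattern, and result path;
    # assumes a configured CCDC search instance):
    #
    #   numHits = self.searchSmarts("query-001", "c1ccccc1C(=O)O", "./results", maxHits=10)
    #   # ./results/query-001/ then holds per-hit mol2/sdf files and query-001-index.json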
Beispiel #12
0
class ObjectExtractor(object):
    """Utilities to extract document features from the document object server."""
    def __init__(self, cfgOb, **kwargs):
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__mU = MarshalUtil()
        #
        self.__objectD = self.__rebuildCache(**kwargs)
        self.__objPathD = {}
        self.__stringPathList = []
        self.__objValD = {}
        #

    def getObjects(self):
        return self.__objectD

    def getPathList(self, filterList=True):
        kL = []
        if filterList:
            tL = []
            for ky in self.__objPathD:
                if ky and (ky.find(".") != -1
                           or ky.startswith("_")) and ky not in [
                               "_id"
                           ] and not ky.endswith("[]"):
                    tL.append(ky)
            for ky in tL:
                for tky in tL:
                    ok = True
                    if ky in tky and ky != tky:
                        ok = False
                        break
                if ok:
                    kL.append(ky)
        else:
            kL = list(self.__objPathD.keys())
        #
        return sorted(kL)
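
    # Filtering sketch: given self.__objPathD keys such as
    #   {"_id": 3, "a": 3, "a.b": 3, "a.b[]": 6, "a.b.c": 2}
    # the filtered path list keeps only dotted (or underscore-prefixed) keys, drops
    # "_id" and "[]"-terminated keys, and removes keys that are prefixes of longer
    # kept keys, returning ["a.b.c"] in this example.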

    def getValues(self):
        return self.__objValD

    def setPathList(self, stringPathList):
        self.__objPathD = {k: True for k in stringPathList}
        return True

    def getCount(self):
        return len(self.__objectD)

    def __rebuildCache(self, **kwargs):
        cacheFilePath = kwargs.get("cacheFilePath", None)
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        keyAttribute = kwargs.get("keyAttribute", "entry")
        selectL = kwargs.get("selectionList", [])
        #
        cD = {keyAttribute: {}}
        try:
            if useCache and cacheFilePath and os.access(
                    cacheFilePath, os.R_OK):
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                if selectL:
                    objectD = self.__select(**kwargs)
                else:
                    objectD = self.__selectObjects(**kwargs)
                cD[keyAttribute] = objectD
                if cacheFilePath:
                    pth, _ = os.path.split(cacheFilePath)
                    ok = self.__mU.mkdir(pth)
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved object results (%d) status %r in %s",
                                len(objectD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD[keyAttribute]

    def __selectObjects(self, **kwargs):
        """Return a dictionary of objects satisfying the input conditions (e.g. method, resolution limit)"""
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        #
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        stripObjectId = kwargs.get("stripObjectId", False)
        logIncrement = kwargs.get("logIncrement", 10000)
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName,
                                collectionName,
                                mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName,
                                  collectionName,
                                  selectL,
                                  queryD=qD)
                    numDoc = len(dL) if dL else 0
                    logger.info("Selection %r fetch result count %d", selectL,
                                numDoc)
                    #
                    for ii, dD in enumerate(dL, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id",
                                           dD["_id"])
                        if stripObjectId and rObj and "_id" in rObj:
                            rObj.pop("_id")
                        else:
                            rObj["_id"] = str(rObj["_id"])
                        #
                        stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                        objectD[stKey] = copy.copy(rObj)
                        if objLimit and ii >= objLimit:
                            break
                        logger.debug("Saving %d %s", ii, stKey)
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Extracting object (%d of %d)", ii,
                                        numDoc)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD
        #

    def __select(self, **kwargs):
        """Return a dictionary of object content satisfying the input conditions
        (e.g. method, resolution limit) and selection options.
        """
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        selectL = kwargs.get("selectionList", [])
        stripObjectId = kwargs.get("stripObjectId", False)
        #
        tV = kwargs.get("objectLimit", None)
        objLimit = int(tV) if tV is not None else None
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName,
                                collectionName,
                                mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    dL = mg.fetch(databaseName,
                                  collectionName,
                                  selectL,
                                  queryD=qD,
                                  suppressId=True)
                    logger.info("Selection %r fetch result count %d", selectL,
                                len(dL))
                    #
                    for ii, rObj in enumerate(dL, 1):
                        stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                        if stripObjectId and rObj and "_id" in rObj:
                            rObj.pop("_id")
                        objectD[stKey] = copy.copy(rObj)
                        if objLimit and ii >= objLimit:
                            break
                        # logger.debug("Saving %d %s", ii, stKey)
                        # logger.debug("Current objectD keys %r", list(objectD.keys()))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD
        #

    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values of corresponding to the input dictionary key names expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notation

        Returns:
            tuple: tuple of values corresponding to the input key names

        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames,
                             str(e))

        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the input dictionary object (nested)."""
        try:
            kys = keyName.split(".")
            for key in kys:
                try:
                    dct = dct[key]
                except KeyError:
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))

        return None
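
    # Lookup sketch (illustrative): for dct = {"a": {"b": {"c": 1}}},
    # __getKeyValue(dct, "a.b.c") returns 1 and __getKeyValue(dct, "a.x") returns None.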

    def __toJsonPathString(self, path):
        pL = [ky if ky else "[]" for ky in path]
        sp = ".".join(pL)
        sp = sp.replace(".[", "[")
        return sp

    def __pathCallBack(self, path, value):
        sp = self.__toJsonPathString(path)
        self.__objPathD[
            sp] = self.__objPathD[sp] + 1 if sp in self.__objPathD else 1
        return value

    def __saveCallBack(self, path, value):
        sP = self.__toJsonPathString(path)
        if sP in self.__objPathD:
            ky = sP.replace("[]", "")
            if sP.find("[") != -1:  # multivalued
                if isinstance(value, list):
                    self.__objValD.setdefault(ky, []).extend(value)
                else:
                    self.__objValD.setdefault(ky, []).append(value)
            else:
                self.__objValD[ky] = value
        return value

    def genPathList(self, dObj, path=None):
        return self.__walk(dObj, jsonPath=path, funct=self.__pathCallBack)

    def genValueList(self, dObj, path=None, clear=True):
        self.__objValD = {} if clear else self.__objValD
        return self.__walk(dObj, jsonPath=path, funct=self.__saveCallBack)

    def __walk(self, jsonObj, jsonPath=None, funct=None):
        """Walk JSON data types. An optional funct() is called to mutate
        the value of each element. The jsonPath is updated at each element.
        """
        if jsonPath is None:
            jsonPath = []

        if isinstance(jsonObj, dict):
            value = {
                k: self.__walk(v, jsonPath + [k], funct)
                for k, v in jsonObj.items()
            }
        elif isinstance(jsonObj, list):
            value = [
                self.__walk(elem, jsonPath + [[]], funct) for elem in jsonObj
            ]
        else:
            value = jsonObj

        if funct is None:
            return value
        else:
            return funct(jsonPath, value)

    def __toPath(self, path):
        """Convert path strings into path lists."""
        if isinstance(path, list):
            return path  # already in list format

        def _iterPath(path):
            for parts in path.split("[]"):
                for part in parts.strip(".").split("."):
                    yield part
                yield []

        return list(_iterPath(path))[:-1]
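
# Path conversion sketch for the helpers above (illustrative):
#   __toPath("a.b[].c")                     -> ["a", "b", [], "c"]
#   __toJsonPathString(["a", "b", [], "c"]) -> "a.b[].c"
# The empty-list placeholder marks positions where the walker descends into list items.
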
class InterProProvider(object):
    """Manage mappings of InterPro identifiers to description and parent/child relationships"""

    def __init__(self, **kwargs):
        urlTargetInterPro = kwargs.get("urlTargetInterPro", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/entry.list")
        urlTargetInterProFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/entry.list"
        urlTargetInterProParent = kwargs.get("urlTargetInterPro", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/ParentChildTreeFile.txt")
        urlTargetInterProParentFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/ParentChildTreeFile.txt"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "interPro")
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__interProD, self.__interProParentD = self.__rebuildCache(urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache)

    def getDescription(self, interProId):
        ret = None
        try:
            ret = self.__interProD[interProId]["description"]
        except Exception:
            pass
        return ret

    def getType(self, interProId):
        ret = None
        try:
            ret = self.__interProD[interProId]["type"]
        except Exception:
            pass
        return ret

    def testCache(self):
        # Check length ...
        logger.info("Length InterPro %d", len(self.__interProD))
        return len(self.__interProD) > 1000

    #
    def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(interProDataPath):
            rD = self.__mU.doImport(interProDataPath, fmt=fmt)
            interProD = rD["index"]
            interProParentD = rD["parents"]
            logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
            ok = fU.get(urlTargetInterPro, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
                ok = fU.get(urlTargetInterProFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProD = self.__getInterProIndex(fp)

            logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
            # ------
            logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
            ok = fU.get(urlTargetInterProParent, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
                ok = fU.get(urlTargetInterProParentFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProParentD = self.__getInterProParents(fp)
            #
            ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
        #
        return interProD, interProParentD

    def getLineage(self, idCode):
        pList = []
        try:
            pList.append(idCode)
            pt = self.getParentId(idCode)
            while (pt is not None) and (pt != 1):
                pList.append(pt)
                pt = self.getParentId(pt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        pList.reverse()
        return pList
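
    # Lineage sketch (hypothetical identifiers): if the parent chain is
    # IPR018056 -> IPR013806 -> IPR000001, getLineage("IPR018056") returns
    # ["IPR000001", "IPR013806", "IPR018056"].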

    def getLineageWithNames(self, idCode):
        linL = []
        try:
            idCodeL = self.getLineage(idCode)
            for ii, idCode in enumerate(idCodeL, 1):
                linL.append((idCode, self.getDescription(idCode), ii))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return linL

    def getParentId(self, idCode):
        try:
            return self.__interProParentD[idCode]
        except Exception:
            pass
        return None

    def getTreeNodeList(self, filterD=None):
        dL = []
        try:
            for idCode, _ in self.__interProD.items():
                if filterD and idCode not in filterD:
                    continue
                displayName = self.getDescription(idCode)
                pId = self.getParentId(idCode)
                linL = self.getLineage(idCode)
                #
                if pId is None:
                    dD = {"id": idCode, "name": displayName, "depth": 0}
                else:
                    dD = {"id": idCode, "name": displayName, "parents": [pId], "depth": len(linL) - 1}
                dL.append(dD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL

    def __getInterProParents(self, filePath):
        """Read the InterPro parent hierarchy and return a dictionary parent ids.

        Args:
            filePath (str): path to the InterPro parent/child hierarchy file

        Returns:
            dict: {idCode: parentIdCode or None}
        """
        interProParentD = {}
        lineL = self.__mU.doImport(filePath, fmt="list")
        stack = []
        for line in lineL:
            content = line.rstrip()  # drop \n
            row = content.split("--")
            ff = row[-1].split("::")
            tS = ff[0].strip()
            # stack[:] = stack[: len(row) - 1] + [row[-1]]
            stack[:] = stack[: len(row) - 1] + [tS]
            for ii, idCode in enumerate(stack):
                if idCode not in interProParentD:  # prevents overwriting the parent of idCode, in case idCode has already been iterated over in ParentChildTreeFile.txt
                    interProParentD[idCode] = None if ii == 0 else stack[ii - 1]
                else:
                    # This will correct the parent of idCode from being None if it's later identified as having a parent at another point in ParentChildTreeFile.txt
                    if interProParentD[idCode] is None and ii != 0:
                        interProParentD[idCode] = stack[ii - 1]
            logger.debug("Lineage %r", "\t".join(stack))
        #
        return interProParentD
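
    # Worked sketch of the stack-based parse above, using hypothetical lines:
    #
    #   IPR000001::Kringle::
    #   --IPR013806::Kringle-like fold::
    #   ----IPR018056::Kringle, conserved site::
    #
    # The stack keeps one identifier per "--" depth level, yielding
    #   {"IPR000001": None, "IPR013806": "IPR000001", "IPR018056": "IPR013806"}.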

    def __getInterProIndex(self, filePath):
        """Read CSV file of InterPro accessions and descriptions

        Args:
            filePath (str): path to the InterPro accession/description file (tab-delimited)

        Returns:
            dict: {idCode: description}
        """

        interProD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **encodingD)
        for row in rowL:
            try:
                interProId = row[0].strip().upper()
                interProType = row[1].strip()
                descr = row[2].strip()
                interProD[interProId] = {"description": descr, "type": interProType}
            except Exception:
                pass
        #
        return interProD
Beispiel #14
0
class ChemCompDepictWrapper(SingletonClass):
    """Wrapper for chemical component depiction operations."""
    def __init__(self):
        self.__startTime = time.time()
        # ---
        self.__workPath = "."
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__configD = None
        self.__cachePath = None
        # ---
        self.__statusDescriptorError = -100
        self.__searchError = -200
        self.__searchSuccess = 0
        self.__imageCount = 0

    def readConfig(self, resetImagePath=True):
        #
        ok = False
        try:
            self.__cachePath = os.environ.get("CHEM_DEPICT_CACHE_PATH", ".")
            configFileName = os.environ.get("CHEM_DEPICT_CONFIG_FILE_NAME",
                                            "depict-config.json")
            #
            configFilePath = os.path.join(self.__cachePath, "config",
                                          configFileName)
            configD = {}
            if self.__mU.exists(configFilePath):
                configD = self.__mU.doImport(configFilePath, fmt="json")
            logger.debug("configD: %r", configD)
            if configD and (len(configD) >= 2) and float(
                    configD["versionNumber"]) > 0.1:
                logger.info("Read version %r sections %r from %s",
                            configD["versionNumber"], list(configD.keys()),
                            configFilePath)
                ok = True
                #
                if resetImagePath:
                    # Allow the configuration to be relocatable.
                    tS = configD[
                        "imageDir"] if "imageDir" in configD else "images"
                    configD["imageDirPath"] = os.path.join(
                        self.__cachePath, tS)
                    configD["versionNumber"] = "0.2"
            else:
                # Handle missing config for now
                configD["imageDir"] = "images"
                configD["imageDirPath"] = os.path.join(self.__cachePath,
                                                       configD["imageDir"])
                logger.warning("Reading config file fails from path %r",
                               configFilePath)
                logger.warning("Using config %r", configD)
                ok = True
            #
            self.__configD = configD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def setConfig(self, cachePath, **kwargs):
        """Provide dependencies for rebuilding depict file dependencies.

        Args:
            cachePath (str): path to cache data files.

            Other options are propagated to the bootstrap configuration created in __makeBootstrapDepictConfig().

        """
        self.__configD = self.__makeBootstrapDepictConfig(cachePath, **kwargs)
        return len(self.__configD) >= 2

    def __makeBootstrapDepictConfig(self, cachePath, **kwargs):
        """Create depict configuration bootstrap file"""
        configD = {}
        try:
            storeConfig = kwargs.get("storeConfig", True)
            os.environ["CHEM_DEPICT_CACHE_PATH"] = os.path.join(cachePath)
            configDirPath = os.path.join(cachePath, "config")
            configFilePath = os.path.join(configDirPath, "depict-config.json")
            #
            logger.info("Updating depict configuration using %s",
                        configFilePath)
            #
            imageDirPath = os.path.join(cachePath, "images")
            self.__mU.mkdir(imageDirPath)
            configD = {"versionNumber": 0.20, "imageDir": "images"}
            if storeConfig:
                self.__mU.mkdir(configDirPath)
                self.__mU.doExport(configFilePath,
                                   configD,
                                   fmt="json",
                                   indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return configD
        #

    def setImageCount(self, imageCount):
        self.__imageCount = imageCount

    def getImageCount(self):
        return self.__imageCount

    def __makeImagePath(self):
        imageDirPath = self.__configD[
            "imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD[
            "fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        imagePath = os.path.join(imageDirPath, "image-%s.svg" % ic)
        return imagePath
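
    # Rotation sketch: with the default increment of 50 and an image count of 123,
    # ic = 123 % 50 = 23, so the generated path is "<imageDirPath>/image-23.svg";
    # image files are reused in a rolling window rather than accumulating without bound.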

    def depictMolecule(self,
                       identifier,
                       identifierType,
                       imagePath=None,
                       **kwargs):
        """Create depiction from InChI, SMILES descriptors or PDB identifier."""
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__depictOne(oeMol, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None
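
    # A minimal usage sketch (illustrative identifiers; "wrapper" is a hypothetical
    # instance of this class):
    #
    #   svgPath = wrapper.depictMolecule("c1ccccc1O", "SMILES", labelAtomName=True)
    #   svgPath = wrapper.depictMolecule("ATP", "identifierPdb")
    #
    # Each call returns the generated SVG image path on success or None on failure.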

    def __depictOne(self, oeMol, imagePath, **kwargs):
        """Single

        Args:
            oeMol (object): instance of an OE graph molecule
            imagePath (string): file path for image

        Returns:
            bool: True for success or False otherwise
        """
        try:
            title = kwargs.get("title", None)
            oed = OeDepict()
            oed.setMolTitleList([("Target", oeMol, title)])

            # ---
            bondDisplayWidth = 10.0
            numAtoms = oeMol.NumAtoms()
            if numAtoms > 100 and numAtoms <= 200:
                bondDisplayWidth = 6.0
            elif numAtoms > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                imageSizeY=kwargs.get("imageSizeX", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
            )
            oed.setGridOptions(rows=1, cols=1, cellBorders=False)
            oed.prepare()
            oed.write(imagePath)
            self.__imageCount += 1
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def status(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6,
                    unitS)
        endTime = time.time()
        logger.info("Status at %s (up %.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def alignMoleculePair(self,
                          refIdentifier,
                          refIdentifierType,
                          fitIdentifier,
                          fitIdentifierType,
                          imagePath=None,
                          **kwargs):
        """Create aligned depiction for a target molecule InChI, SMILES descriptors or PDB identifier."""
        try:
            imagePath = imagePath if imagePath else self.__makeImagePath()
            oeio = OeIoUtils()
            ccsw = ChemCompSearchWrapper()
            oesmP = ccsw.getSearchMoleculeProvider()
            # ---
            if refIdentifierType.lower() in ["smiles"]:
                oeMolRef = oeio.smilesToMol(refIdentifier)
            elif refIdentifierType.lower() in ["inchi"]:
                oeMolRef = oeio.inchiToMol(refIdentifier)
            elif refIdentifierType.lower() in ["identifierpdb"]:
                oeMolRef = oesmP.getMol(refIdentifier)
            #
            if fitIdentifierType.lower() in ["smiles"]:
                oeMolFit = oeio.smilesToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["inchi"]:
                oeMolFit = oeio.inchiToMol(fitIdentifier)
            elif fitIdentifierType.lower() in ["identifierpdb"]:
                oeMolFit = oesmP.getMol(fitIdentifier)
            # ---
            logger.info("oeMolRef atoms %r", oeMolRef.NumAtoms())
            logger.info("oeMolFit atoms %r", oeMolFit.NumAtoms())

            displayIdRef = "Ref"
            displayIdFit = "Fit"
            ok = self.__depictAlignedPair(oeMolRef, displayIdRef, oeMolFit,
                                          displayIdFit, imagePath, **kwargs)
            return imagePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __depictAlignedPair(self, oeMolRef, displayIdRef, oeMolFit,
                            displayIdFit, imagePath, **kwargs):
        """Depict pairwise MCSS alignment"""
        try:
            #
            oed = OeDepictMCSAlignPage()
            oed.setSearchType(sType="relaxed")
            #
            oed.setRefMol(oeMolRef, displayIdRef)
            oed.setFitMol(oeMolFit, displayIdFit)
            #
            # imagePath = self.__makeImagePath()
            # ---
            bondDisplayWidth = 10.0
            numAtomsRef = oeMolRef.NumAtoms()
            if numAtomsRef > 100 and numAtomsRef <= 200:
                bondDisplayWidth = 6.0
            elif numAtomsRef > 200:
                bondDisplayWidth = 4.0
            # ---
            oed.setDisplayOptions(
                imageSizeX=kwargs.get("imageSizeX", 2500),
                imageSizeY=kwargs.get("imageSizeX", 2500),
                labelAtomName=kwargs.get("labelAtomName", False),
                labelAtomCIPStereo=kwargs.get("labelAtomCIPStereo", True),
                labelAtomIndex=kwargs.get("labelAtomIndex", False),
                labelBondIndex=kwargs.get("labelBondIndex", False),
                labelBondCIPStereo=kwargs.get("labelBondCIPStereo", True),
                cellBorders=kwargs.get("cellBorders", True),
                bondDisplayWidth=bondDisplayWidth,
                highlightStyleFit=kwargs.get("highlightStyleFit",
                                             "ballAndStickInverse"),
            )
            #
            aML = oed.alignPair(imagePath=imagePath)
            logger.info("Aligned atom count %d", len(aML))
            #
            # self.assertGreater(len(aML), 1)
            # if aML:
            #    for (rCC, rAt, tCC, tAt) in aML:
            #        logger.debug("%5s %-5s %5s %-5s", rCC, rAt, tCC, tAt)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def toMolFile(self,
                  identifier,
                  identifierType,
                  molfilePath=None,
                  fmt="mol",
                  **kwargs):
        """Create molfile (fmt) from InChI, SMILES descriptors or PDB identifier."""
        try:
            molfilePath = molfilePath if molfilePath else self.__makeMolfilePath(
                fmt=fmt)
            oeio = OeIoUtils()
            if identifierType.lower() in ["smiles"]:
                oeMol = oeio.smilesToMol(identifier)
                oeMol.SetTitle("From SMILES")
            elif identifierType.lower() in ["inchi"]:
                oeMol = oeio.inchiToMol(identifier)
                oeMol.SetTitle("From InChI")
            elif identifierType.lower() in ["identifierpdb"]:
                ccsw = ChemCompSearchWrapper()
                oesmP = ccsw.getSearchMoleculeProvider()
                oeMol = oesmP.getMol(identifier)
            #
            ok = self.__toMolFile(oeMol, molfilePath, **kwargs)
            return molfilePath if ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def __toMolFile(self, oeMol, molfilePath, **kwargs):
        """Write the

        Args:
            oeMol (object): instance of an OE graph molecule
            molfilePath (string): file path for molfile (type determined by extension)

        Returns:
            bool: True for success or False otherwise
        """
        try:
            _ = kwargs
            oeio = OeIoUtils()
            oeio.write(molfilePath, oeMol, constantMol=True)
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __makeMolfilePath(self, fmt="mol"):
        imageDirPath = self.__configD[
            "imageDirPath"] if self.__configD and "imageDirPath" in self.__configD else "."
        fileRotateIncrement = self.__configD[
            "fileRotateIncrement"] if self.__configD and "fileRotateIncrement" in self.__configD else 50
        ic = self.__imageCount % fileRotateIncrement
        molPath = os.path.join(imageDirPath, "molfile-%s.%s" % (ic, fmt))
        return molPath
Beispiel #15
0
class OeIoUtils(object):
    """Utility methods to manage OE specific IO and format conversion operations."""
    def __init__(self, **kwargs):
        self.__dirPath = kwargs.get("dirPath", ".")
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__oeErrorLevel = oechem.OEErrorLevel_Info
        if kwargs.get("quietFlag", False):
            self.setQuiet()
        #

    def setQuiet(self):
        """Suppress OE warnings and processing errors"""
        oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Quiet)
        self.__oeErrorLevel = oechem.OEErrorLevel_Quiet

    def getComponentDefinitions(self, ccdFilePath):
        rdCcObjL = []
        try:
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            logger.info("Read %s with %d definitions", ccdFilePath,
                        len(rdCcObjL))
        except Exception as e:
            logger.exception("Loading %s failing with %s", ccdFilePath, str(e))
        return rdCcObjL

    def suppressHydrogens(self, oeMol):
        tMol = oechem.OEMol(oeMol) if oeMol else None
        if tMol:
            oechem.OESuppressHydrogens(tMol)
        return tMol

    def chemCompToMol(self,
                      ccdFilePath,
                      molBuildType="model-xyz",
                      quietFlag=False):
        retMolL = []
        try:
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            logger.info("Read %s with %d definitions", ccdFilePath,
                        len(rdCcObjL))
            oemf = OeMoleculeFactory()
            if quietFlag:
                oemf.setQuiet()
            for ccObj in rdCcObjL:
                ccId = oemf.setChemCompDef(ccObj)
                if ccId:
                    ok = oemf.build(molBuildType=molBuildType)
                    if ok:
                        oeMol = oemf.getMol()
                        retMolL.append(oeMol)
        except Exception as e:
            logger.exception("Loading %s failing with %s", ccdFilePath, str(e))
        return retMolL

    def descriptorToSmiles(self,
                           descr,
                           descrType,
                           limitPerceptions=False,
                           messageTag=None):
        """Parse the input descriptor string and return an OE smiles.

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            str: SMILES string
        """
        try:
            if "SMILES" in descrType.upper() and "ISO" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateIsoSmiString(oeMol)
                else:
                    return None
            if "SMILES" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateCanSmiString(oeMol)
                else:
                    return None
            elif "INCHI" in descrType.upper():
                oeMol = self.inchiToMol(descr,
                                        limitPerceptions=limitPerceptions,
                                        messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateIsoSmiString(oeMol)
            else:
                return None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def descriptorToMol(self,
                        descr,
                        descrType,
                        limitPerceptions=False,
                        messageTag=None):
        """Parse the input descriptor string and return a molecule object (OeGraphMol/OeQMol).

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            object: OeGraphMol()/OeQmol() object or None for failure

        """
        try:
            if "SMILES" in descrType.upper() and "ISO" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    isoSmiles = oechem.OECreateIsoSmiString(oeMol)
                    return self.smilesToMol(isoSmiles,
                                            limitPerceptions=limitPerceptions,
                                            messageTag=messageTag)
                else:
                    return None
            if "SMILES" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    smiles = oechem.OECreateCanSmiString(oeMol)
                    return self.smilesToMol(smiles,
                                            limitPerceptions=limitPerceptions,
                                            messageTag=messageTag)
                else:
                    return None
            elif "INCHI" in descrType.upper():
                oeMol = self.inchiToMol(descr,
                                        limitPerceptions=limitPerceptions,
                                        messageTag=messageTag)
                if oeMol:
                    isoSmiles = oechem.OECreateIsoSmiString(oeMol)
                    return self.smilesToMol(isoSmiles,
                                            limitPerceptions=limitPerceptions,
                                            messageTag=messageTag)
            elif "SMARTS" in descrType.upper():
                return self.smartsToQmol(descr, messageTag=messageTag)
            else:
                return None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None
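
    # Illustrative sketch only: exampleDescriptorRoundTripSketch() is a hypothetical helper,
    # not part of the original class, showing the intended use of descriptorToMol(); the
    # default InChI (methanol) is an assumption chosen for demonstration.
    def exampleDescriptorRoundTripSketch(self, inchi="InChI=1S/CH4O/c1-2/h2H,1H3"):
        """Parse a hypothetical InChI descriptor and return the corresponding isomeric SMILES."""
        oeMol = self.descriptorToMol(inchi, "InChI", limitPerceptions=False)
        return oechem.OECreateIsoSmiString(oeMol) if oeMol else None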

    def smilesToMol(self, smiles, limitPerceptions=False, messageTag=None):
        """Parse the input SMILES string and return a molecule object (OeGraphMol).

        Args:
            smiles (str): SMILES string
            limitPerceptions (bool): flag to limit the perceptions/transformations of input SMILES

        Returns:
            object: OeGraphMol() object or None for failure
        """
        try:
            label = messageTag if messageTag else ""
            mol = oechem.OEGraphMol()
            smiles = smiles.strip()
            if limitPerceptions:
                # convert the SMILES string into a molecule
                if oechem.OEParseSmiles(mol, smiles, False, False):
                    return mol
                else:
                    logger.debug(
                        "%s parsing failed for input SMILES string %s", label,
                        smiles)
                    logger.error("%s parsing failed for input SMILES string",
                                 label)
            else:
                if oechem.OESmilesToMol(mol, smiles):
                    return mol
                else:
                    logger.debug(
                        "%s converting failed for input SMILES string %s",
                        label, smiles)
                    logger.error(
                        "%s converting failed for input SMILES string", label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def inchiToMol(self, inchi, limitPerceptions=False, messageTag=None):
        """Parse the input InChI string and return a molecule object (OeGraphMol).

        Args:
            inchi (str): InChI string

        Returns:
            object: OeGraphMol() object or None for failure

        """
        try:
            label = messageTag if messageTag else ""
            mol = oechem.OEGraphMol()
            inchi = inchi.strip()
            if limitPerceptions:
                if oechem.OEParseInChI(mol, inchi):
                    return mol
                else:
                    logger.debug("%s parsing failed for InChI string %r",
                                 label, inchi)
                    logger.error("%s parsing failed for InChI string", label)
            else:
                if oechem.OEInChIToMol(mol, inchi):
                    return mol
                else:
                    logger.debug("%s converting failed for InChI string %r",
                                 label, inchi)
                    logger.error("%s converting failed for InChI string",
                                 label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def descriptorToQMol(self,
                         descr,
                         descrType,
                         limitPerceptions=False,
                         messageTag=None):
        """Parse the input descriptor string and return a query molecule object (OeQMol).

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            object: OeQmol() object or None for failure

        """
        oeQMol = label = None
        try:
            label = messageTag if messageTag else ""
            tMol = self.descriptorToMol(descr,
                                        descrType,
                                        limitPerceptions=limitPerceptions,
                                        messageTag=messageTag)
            if tMol:
                oeQMol = oechem.OEQMol(tMol)

        except Exception as e:
            logger.error("%s Failing for with %s", label, str(e))
        return oeQMol if oeQMol else None

    def smartsToQmol(self, smarts, messageTag=None):
        """Parse the input SMARTS query string and return a query molecule object (OeQMol).

        Args:
            smarts (str): SMARTS query string

        Returns:
            object : OeQMol() object or None for failure
        """
        try:
            label = messageTag if messageTag else ""
            qmol = oechem.OEQMol()
            if oechem.OEParseSmarts(qmol, smarts):
                return qmol
            else:
                logger.debug("%s parsing failed for SMARTS string %s", label,
                             smarts)
                logger.error("%s parsing failed for SMARTS string", label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None
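
    # Illustrative sketch only: exampleSmartsMatchSketch() is a hypothetical helper, not part of
    # the original class, combining smilesToMol() and smartsToQmol() for a simple substructure
    # test. The default SMILES/SMARTS values are assumptions, and it assumes OESubSearch can be
    # initialized directly from the parsed OEQMol.
    def exampleSmartsMatchSketch(self, targetSmiles="c1ccccc1O", querySmarts="[OX2H]"):
        """Return True when the hypothetical SMARTS pattern matches the target SMILES."""
        oeMol = self.smilesToMol(targetSmiles)
        qMol = self.smartsToQmol(querySmarts)
        if not (oeMol and qMol):
            return False
        subSearch = oechem.OESubSearch(qMol)
        return bool(subSearch.SingleMatch(oeMol))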

    def fileToMols(self, filePath, use3D=False, largestPart=False):
        """Parse the input path returning a list of molecule objects (OeGraphMol).

        Args:
            filePath (str): file path with a standard recognized extension ('mol', 'sdf', 'smi', 'oeb').

        Returns:
            list : list of OeGraphMol() objects

        """
        mL = []
        oemf = OeMoleculeFactory()
        try:
            ifs = oechem.oemolistream()
            if ifs.open(filePath):
                for tMol in ifs.GetOEGraphMols():
                    oeMol = oechem.OEGraphMol(tMol)
                    # if oechem.OEReadMolecule(ifs, oeMol):
                    if largestPart:
                        molL = oemf.getParts(oeMol)
                        if len(molL) > 0:
                            oeMol = molL[0]
                            logger.info(
                                "Using largest bonded molecule part (%d/%d)",
                                len(molL), oeMol.NumAtoms())
                    if use3D:
                        mL.append(
                            oemf.updateOePerceptions3D(
                                oeMol, aromaticModel=oechem.OEAroModelOpenEye))
                    else:
                        mL.append(
                            oemf.updateOePerceptions2D(
                                oeMol, aromaticModel=oechem.OEAroModelOpenEye))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mL
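
    # Illustrative sketch only: exampleFileConversionSketch() is a hypothetical helper, not part
    # of the original class, chaining fileToMols() with write(); the input and output paths are
    # placeholders and output formats are inferred from the file extensions.
    def exampleFileConversionSketch(self, inFilePath="./example.sdf", outDirPath="./converted"):
        """Read molecules from a hypothetical SDF file and rewrite each one as a molfile."""
        outPathL = []
        for ii, oeMol in enumerate(self.fileToMols(inFilePath, use3D=True), start=1):
            outPath = os.path.join(outDirPath, "molecule-%d.mol" % ii)
            if self.write(outPath, oeMol, constantMol=True):
                outPathL.append(outPath)
        return outPathL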

    def stringToMols(self, txt, sType="mol2", use3D=False):
        """Parse the input string as input format type (sType) returning a list of
        molecule objects (OeGraphMol)

        Args:
            txt (str): string text of molecule data
            sType (str, optional): string data format (mol2, sdf, smiles) . Defaults to "mol2".

        Returns:
            list: list of OeGraphMol() objects
        """
        #
        mL = []
        oemf = OeMoleculeFactory()
        try:
            if sType not in ["mol2", "sdf", "smiles"]:
                logger.error("Unsupported string data format")
                return None
            fD = {
                "mol2": oechem.OEFormat_MOL2,
                "sdf": oechem.OEFormat_SDF,
                "smiles": oechem.OEFormat_SMI
            }
            ifs = oechem.oemolistream()
            ifs.SetFormat(fD["sType"])
            if not ifs.openstring(txt):
                logger.error("Unable open string data for molecule reader")
                return None
            for tMol in ifs.GetOEGraphMols():
                oeMol = oechem.OEGraphMol(tMol)
                if use3D:
                    mL.append(
                        oemf.updateOePerceptions3D(
                            oeMol, aromaticModel=oechem.OEAroModelOpenEye))
                else:
                    mL.append(
                        oemf.updateOePerceptions2D(
                            oeMol, aromaticModel=oechem.OEAroModelOpenEye))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mL

    def readOeBinaryMolCache(self, filePath):
        """Return a list of OeGraphMol() objects read from the cached binary file.

        Args:
            filePath (str): file path for the binary OeMol cache

        Returns:
            dict: dictionary of OeGraphMol()'s {<ccId>: OeGraphMol(), ... }
        """
        retD = {}
        startTime = time.time()
        try:
            ifs = oechem.oemolistream()
            if ifs.open(filePath):
                for oeMol in ifs.GetOEGraphMols():
                    tMol = oechem.OEGraphMol(oeMol)
                    retD[tMol.GetTitle()] = tMol
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return retD
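
    # Illustrative sketch only: exampleCachedMolLookupSketch() is a hypothetical helper, not part
    # of the original class, showing how readOeBinaryMolCache() is expected to be used to fetch a
    # cached molecule by identifier; the cache path and the "ATP" identifier are placeholders.
    def exampleCachedMolLookupSketch(self, filePath="./chem-comp-cache.oeb", ccId="ATP"):
        """Return the cached OEGraphMol for a hypothetical chemical component identifier."""
        oeMolD = self.readOeBinaryMolCache(filePath)
        return oeMolD.get(ccId)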

    def createOeFingerPrintDatabase(self,
                                    oeMolDbFilePath,
                                    oeFpDbFilePath,
                                    fpType="TREE",
                                    dbType="FAST"):
        if dbType == "FAST":
            return self.__createOeFastFingerPrintDatabase(oeMolDbFilePath,
                                                          oeFpDbFilePath,
                                                          fpType=fpType)
        else:
            return True

    def __createOeFastFingerPrintDatabase(self,
                                          oeMolDbFilePath,
                                          oeFpDbFilePath,
                                          fpType="TREE"):
        """Create fast search fingerprint database from the input molecular database.

        Args:
            oeMolDbFilePath (str): path to the input molecular database
            oeFpDbFilePath (str): path to the output fingerprint database
            fpType (str):  finger print type

        Returns:
            bool: True for success or False otherwise

        Supports:
            OEFPType_Circular
            OEFPType_Path
            OEFPType_Tree

        Not currently supported by OE fp search -
            OEFPType_MACCS166
            OEFPType_Lingo
        """
        startTime = time.time()
        ok = False
        try:
            fpD = {
                "TREE": oegraphsim.OEFPType_Tree,
                "CIRCULAR": oegraphsim.OEFPType_Circular,
                "PATH": oegraphsim.OEFPType_Path
            }
            myFpType = fpD[
                fpType] if fpType in fpD else oegraphsim.OEFPType_Tree
            opts = oegraphsim.OECreateFastFPDatabaseOptions(
                oegraphsim.OEGetFPType(myFpType))
            ok = oegraphsim.OECreateFastFPDatabaseFile(oeFpDbFilePath,
                                                       oeMolDbFilePath, opts)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return ok

    def loadOeFingerPrintDatabase(self,
                                  oeMolDbFilePath,
                                  oeFpDbFilePath,
                                  inMemory=False,
                                  fpType="TREE",
                                  fpDbType="FAST"):
        if fpDbType == "FAST":
            return self.__loadOeFastFingerPrintDatabase(oeFpDbFilePath,
                                                        inMemory=inMemory,
                                                        fpType=fpType)
        else:
            return self.__loadOeFingerPrintDatabase(oeMolDbFilePath,
                                                    fpType=fpType)

    def __loadOeFingerPrintDatabase(self, oeMolDbFilePath, fpType="TREE"):
        """Create conventional search fingerprint database from the input molecular database.

        Args:
            oeMolDbFilePath (str): path to the input molecular database
            oeFpDbFilePath (str): path to the output fingerprint database
            fpType (str):  finger print type

        Returns:
            bool: True for success or False otherwise

        Supports:
            OEFPType_Circular
            OEFPType_Path
            OEFPType_Tree
            OEFPType_MACCS166
            OEFPType_Lingo
        """
        fpDb = None
        ok = False
        startTime = time.time()
        try:
            fpD = {
                "TREE": oegraphsim.OEFPType_Tree,
                "CIRCULAR": oegraphsim.OEFPType_Circular,
                "PATH": oegraphsim.OEFPType_Path,
                "MACCS": oegraphsim.OEFPType_MACCS166,
                "LINGO": oegraphsim.OEFPType_Lingo,
            }
            fpType = fpType if fpType and fpType in fpD else "TREE"
            tag = "FP_" + fpType
            oeFpType = fpD[
                fpType] if fpType in fpD else oegraphsim.OEFPType_Tree
            oeMolDb = self.loadOeBinaryDatabaseAndIndex(oeMolDbFilePath)
            #
            fpDb = oegraphsim.OEFPDatabase(oeFpType)
            numMols = oeMolDb.GetMaxMolIdx()
            logger.debug("fpType %r tag %r oeFpType %r", fpType, tag, oeFpType)
            oeMol = oechem.OEGraphMol()
            for idx in range(0, numMols):
                if oeMolDb.GetMolecule(oeMol, idx):
                    if oeMol.HasData(tag):
                        tfp = oeMol.GetData(tag)
                        fpDb.AddFP(tfp)
                    else:
                        fpDb.AddFP(oeMol)
                else:
                    logger.info("Missing molecule at index %r", idx)

            numFp = fpDb.NumFingerPrints()
            ok = numMols == numFp
            logger.info(
                "Loaded molecules  %d %s fingerprints %d (%.4f seconds)",
                numMols, fpType, numFp,
                time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            fpDb = None
        endTime = time.time()
        logger.debug("Completed with status %r operation at %s (%.4f seconds)",
                     ok, time.strftime("%Y %m %d %H:%M:%S",
                                       time.localtime()), endTime - startTime)
        return fpDb

    def __loadOeFastFingerPrintDatabase(self,
                                        oeFpDbFilePath,
                                        inMemory=False,
                                        fpType="TREE"):
        #
        _ = fpType
        startTime = time.time()
        if inMemory:
            memType = oegraphsim.OEFastFPDatabaseMemoryType_InMemory
        else:
            memType = oegraphsim.OEFastFPDatabaseMemoryType_MemoryMapped
        if not self.__mU.exists(oeFpDbFilePath):
            logger.error("Missing fingerprint database file %r",
                         oeFpDbFilePath)
        fpDb = oegraphsim.OEFastFPDatabase(oeFpDbFilePath, memType)
        if not fpDb.IsValid():
            logger.error("Cannot open fingerprint database %r", oeFpDbFilePath)
        #
        lenFp = fpDb.NumFingerPrints()
        memTypeStr = fpDb.GetMemoryTypeString()
        endTime = time.time()
        logger.info(
            "Read fingerprint database length %d loaded %s (%.4f seconds)",
            lenFp, memTypeStr, endTime - startTime)
        return fpDb
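
    # Illustrative sketch only: exampleFingerPrintDatabaseSketch() is a hypothetical helper, not
    # part of the original class, chaining createOeFingerPrintDatabase() with
    # loadOeFingerPrintDatabase(); the database file paths are placeholders and a pre-built OE
    # molecule database is assumed to exist.
    def exampleFingerPrintDatabaseSketch(self, oeMolDbFilePath="./chem-comp-mols.oeb", oeFpDbFilePath="./chem-comp-fp.fpbin"):
        """Create a fast TREE fingerprint database and reload it in memory."""
        ok = self.createOeFingerPrintDatabase(oeMolDbFilePath, oeFpDbFilePath, fpType="TREE", dbType="FAST")
        if not ok:
            return None
        return self.loadOeFingerPrintDatabase(oeMolDbFilePath, oeFpDbFilePath, inMemory=True, fpType="TREE", fpDbType="FAST")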

    def loadOeBinaryDatabaseAndIndex(self, oeMolDbFilePath):
        molDb = None
        try:
            molDb = oechem.OEMolDatabase()
            if not molDb.Open(oeMolDbFilePath):
                logger.error("Unable to open %r", oeMolDbFilePath)
            molCount = molDb.NumMols()
            logger.info("Loaded OE database file containing %d molecules",
                        molCount)
        except Exception as e:
            logger.exception("Loading %r failing with %s", oeMolDbFilePath,
                             str(e))
        return molDb

    def createOeBinaryDatabaseAndIndex(self, oebMolFilePath, oeMolDbFilePath):
        """Create OE binary database file and associated index from the input serial
        binary data file.

        Args:
            oebMolFilePath (str): input OeMol stream binary file path
            oeMolDbFilePath (str): output OeMolDatabase file path

        Returns:
           int:  number of molecules processed in the database.
        """
        molCount = 0
        try:
            startTime = time.time()
            moldb = oechem.OEMolDatabase()
            if not moldb.Open(oebMolFilePath):
                logger.error("Read fails for %r", oebMolFilePath)
                return molCount
            #
            logger.info(
                "Opened database in format %r num mols %d max index %d",
                moldb.GetFormat(), moldb.NumMols(), moldb.GetMaxMolIdx())
            moldb.Save(oeMolDbFilePath)
            tL = list(moldb.GetTitles())
            logger.info("First and last titles: %r %r", tL[0], tL[-1])
            molCount = moldb.NumMols()
            endTime = time.time()
            logger.info("Completed operation at %s (%.4f seconds)",
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        endTime - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return molCount

    def buildOeBinaryMolCache(self,
                              filePath,
                              ccObjD,
                              molBuildType="model-xyz",
                              quietFlag=False,
                              fpTypeList=None,
                              limitPerceptions=False,
                              suppressHydrogens=False):
        """Build cache of OEMol() objects from the input chemical component definition list.

        Args:
            filePath (str): output cache file path
            ccObjD (dict):  chemical component object dictionary
            molBuildType (str, optional): molecule build type (e.g. "model-xyz", "oe-iso-smiles"). Defaults to "model-xyz".
            quietFlag (bool, optional): suppress OE output. Defaults to False.
            fpTypeList (list, optional): fingerprint type list. Defaults to None.
            limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
            suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

        Returns:
            (int, int, list): chem comp success count, error count, chem comp identifier failure list

        """
        ok = False
        startTime = time.time()
        failIdList = []
        ccCount = 0
        errCount = 0
        try:
            ofs = oechem.oemolostream()
            ofs.SetFormat(oechem.OEFormat_OEB)
            if ofs.open(filePath):
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for ccId, ccObj in ccObjD.items():
                    tId = oemf.setChemCompDef(ccObj)
                    if tId and tId == ccId:
                        ok = oemf.build(molBuildType=molBuildType,
                                        limitPerceptions=limitPerceptions)
                        if ok and fpTypeList:
                            fpOk = oemf.addFingerPrints(fpTypeList)
                            if not fpOk:
                                logger.info(
                                    "Fingerprint generation fails for %r",
                                    ccId)
                        if ok:
                            oeMol = oemf.getMol(
                                suppressHydrogens=suppressHydrogens)
                            oechem.OEWriteMolecule(ofs, oeMol)
                            ccCount += 1
                    if not ok or not tId:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(ccId)
            else:
                logger.error("Unable to open cache database %s", filePath)
                errCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return ccCount, errCount, failIdList
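
    # Illustrative sketch only: exampleBuildDatabaseSketch() is a hypothetical helper, not part of
    # the original class, outlining the expected pipeline from a chemical component dictionary file
    # to an indexed OE binary database via getComponentDefinitions(), buildOeBinaryMolCache() and
    # createOeBinaryDatabaseAndIndex(). All file paths are placeholders.
    def exampleBuildDatabaseSketch(self, ccdFilePath="./components.cif", oebFilePath="./chem-comp.oeb", oeMolDbFilePath="./chem-comp-mols.oeb"):
        """Build an OE binary molecule cache and an indexed database from a hypothetical CCD file."""
        ccObjD = {}
        oemf = OeMoleculeFactory()
        for ccObj in self.getComponentDefinitions(ccdFilePath):
            ccId = oemf.setChemCompDef(ccObj)
            if ccId:
                ccObjD[ccId] = ccObj
        ccCount, errCount, failIdL = self.buildOeBinaryMolCache(oebFilePath, ccObjD, molBuildType="model-xyz", quietFlag=True)
        logger.info("Cached %d components (%d errors) failures %r", ccCount, errCount, failIdL)
        return self.createOeBinaryDatabaseAndIndex(oebFilePath, oeMolDbFilePath)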

    #
    def buildOeBinaryMolCacheFromIndex(self,
                                       filePath,
                                       ccIdxD,
                                       quietFlag=False,
                                       fpTypeList=None,
                                       limitPerceptions=False,
                                       suppressHydrogens=False):
        """Build cache of OEGraphMol() objects from the input chemical component search index.

        Args:
            filePath (str): output cache file path
            ccIdxD (dict): search index dictionary
            quietFlag (bool, optional): suppress OE output. Defaults to False.
            fpTypeList (list, optional): list of fingerprint types. Defaults to None.
            limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
            suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

        Returns:
            (int, int, list): chem comp success count, error count, chem comp identifier failure list
        """
        failIdList = []
        ccCount = 0
        errCount = 0
        startTime = time.time()
        try:
            ofs = oechem.oemolostream()
            ofs.SetFormat(oechem.OEFormat_OEB)
            if ofs.open(filePath):
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for searchCcId, ccIdx in ccIdxD.items():
                    oemf.setDescriptor(ccIdx["smiles"], "oe-iso-smiles",
                                       searchCcId)
                    ok = oemf.build(molBuildType="oe-iso-smiles",
                                    limitPerceptions=limitPerceptions)
                    if ok and fpTypeList:
                        fpOk = oemf.addFingerPrints(fpTypeList)
                        if not fpOk:
                            logger.info("Fingerprint generation fails for %r",
                                        searchCcId)
                    if ok:
                        if not suppressHydrogens:
                            oemf.addExplicitHydrogens()
                            oemf.setSimpleAtomNames()
                        oeMol = oemf.getMol(
                            suppressHydrogens=suppressHydrogens)
                        oechem.OEWriteMolecule(ofs, oeMol)
                        ccCount += 1
                    if not ok:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(searchCcId)
            else:
                logger.error("Unable to open cache database %s", filePath)
                errCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return ccCount, errCount, failIdList

    def createOeSubSearchDatabase(self,
                                  oebMolFilePath,
                                  oeSubSearchFilePath,
                                  screenType="SMARTS",
                                  numProc=2):
        sort = True
        keepTitle = True
        myScreenType = None
        if screenType == "MOLECULE":
            myScreenType = oechem.OEGetSubSearchScreenType(
                oechem.OESubSearchScreenType_Molecule)
        elif screenType == "MDL":
            myScreenType = oechem.OEGetSubSearchScreenType(
                oechem.OESubSearchScreenType_MDL)
        elif screenType == "SMARTS":
            myScreenType = oechem.OEGetSubSearchScreenType(
                oechem.OESubSearchScreenType_SMARTS)

        opts = oechem.OECreateSubSearchDatabaseOptions(myScreenType)
        opts.SetSortByBitCounts(sort)
        opts.SetKeepTitle(keepTitle)
        opts.SetNumProcessors(numProc)

        screenStr = myScreenType.GetName()
        logger.info("Using %d processor(s) to generate database with %s",
                    numProc, screenStr)

        tracer = oechem.OEConsoleProgressTracer()
        ok = oechem.OECreateSubSearchDatabaseFile(oeSubSearchFilePath,
                                                  oebMolFilePath, opts, tracer)
        return ok

    def loadOeSubSearchDatabase(self,
                                oeSubSearchFilePath,
                                screenType=None,
                                numProc=1):
        ssDb = None
        try:
            _ = screenType
            ssDb = oechem.OESubSearchDatabase(
                oechem.OESubSearchDatabaseType_Default, numProc)
            tracer = oechem.OEConsoleProgressTracer()
            if not ssDb.Open(oeSubSearchFilePath, tracer):
                logger.error("Unable to open %r", oeSubSearchFilePath)
            logger.info("Opened %r with %r molecules", oeSubSearchFilePath,
                        ssDb.NumMolecules())
        except Exception as e:
            logger.exception("Loading %r failing with %s", oeSubSearchFilePath,
                             str(e))
        return ssDb
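
    # Illustrative sketch only: exampleSubSearchDatabaseSketch() is a hypothetical helper, not part
    # of the original class, pairing createOeSubSearchDatabase() with loadOeSubSearchDatabase();
    # both file paths are placeholders.
    def exampleSubSearchDatabaseSketch(self, oebMolFilePath="./chem-comp.oeb", oeSubSearchFilePath="./chem-comp-ss.oeb"):
        """Create and reload a SMARTS-screened substructure search database."""
        ok = self.createOeSubSearchDatabase(oebMolFilePath, oeSubSearchFilePath, screenType="SMARTS", numProc=2)
        return self.loadOeSubSearchDatabase(oeSubSearchFilePath, screenType="SMARTS", numProc=2) if ok else None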

    def write(self, filePath, oeMol, constantMol=False, addSdTags=True):
        """Write an oeMol with format type inferred from the filePath extension (e.g. .mol)

        Args:
            filePath (str): file path with a chemical type extension
            oeMol (object): OE molecule object to write
            constantMol (bool, optional): copy the molecule before performing format specific perceptions. Defaults to False.
            addSdTags (bool, optional): add SD data tags to the output. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            molId = os.path.splitext(os.path.basename(filePath))[0]
            fmt = os.path.splitext(os.path.basename(filePath))[1][1:].lower()
            #
            if addSdTags:
                oemf = OeMoleculeFactory()
                oemf.setOeMol(oeMol, molId)
                oemf.addSdTags()
                oeMol = oemf.getMol()
            #
            self.__mU.mkdir(os.path.dirname(filePath))
            ofs = oechem.oemolostream()
            ofs.open(filePath)
            logger.debug("Writing (fmt=%s) molId %s path %s title %s", fmt,
                         molId, filePath, oeMol.GetTitle())
            #
            if constantMol:
                oechem.OEWriteConstMolecule(ofs, oeMol)
            else:
                oechem.OEWriteMolecule(ofs, oeMol)
            #
            # If this is a mol2 file, we need to replace the resname
            if fmt.startswith("mol2"):
                # If this is a mol2/mol2h substitute the default substructure id
                with open(filePath, "r", encoding="utf-8") as ifh:
                    lines = ifh.readlines()
                lines = [line.replace("<0>", molId) for line in lines]
                with open(filePath, "w", encoding="utf-8") as ofh:
                    ofh.writelines(lines)
            return True
        except Exception as e:
            logger.exception("Failing for %s with %s", filePath, str(e))
        return False

    def serializeOe(self, oeMol):
        """Create a string representing the content of the current OE molecule.   This
        serialization uses the OE internal binary format.
        """
        try:
            oms = oechem.oemolostream()
            oms.SetFormat(oechem.OEFormat_OEB)
            oms.openstring()
            oechem.OEWriteMolecule(oms, oeMol)
            logger.debug("SMILES %s", oechem.OECreateCanSmiString(oeMol))
            logger.debug("Atoms = %d", oeMol.NumAtoms())
            return oms.GetString()
        except Exception as e:
            logger.exception("Failing with %s", str(e))

    def deserializeOe(self, oeS):
        """Reconstruct an OE molecule from the input string serialization (OE binary).

        The deserialized molecule is used to initialize the internal OE molecule
        within this object.

        Returns:
            list:  OE GraphMol list
        """
        molList = []
        try:
            ims = oechem.oemolistream()
            ims.SetFormat(oechem.OEFormat_OEB)
            ims.openstring(oeS)
            for mol in ims.GetOEGraphMols():
                logger.debug("SMILES %s", oechem.OECreateCanSmiString(mol))
                logger.debug("title  %s", mol.GetTitle())
                logger.debug("atoms  %d", mol.NumAtoms())
                molList.append(oechem.OEGraphMol(mol))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return molList
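

# Illustrative usage sketch only (not part of the original module): exampleOeSerializationRoundTrip()
# exercises the OeIoUtils parsing and serialization round trip with a hypothetical SMILES input and
# assumes an OpenEye-licensed environment.
def exampleOeSerializationRoundTrip(smiles="CC(=O)Oc1ccccc1C(=O)O"):
    """Parse a SMILES string, serialize it to the OE binary format, and read it back."""
    oeioU = OeIoUtils(quietFlag=True)
    oeMol = oeioU.smilesToMol(smiles)
    if not oeMol:
        return None
    oeS = oeioU.serializeOe(oeMol)
    molL = oeioU.deserializeOe(oeS)
    return molL[0] if molL else None
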
class EcodClassificationProvider(StashableBase):
    """Extract ECOD domain assignments, term descriptions and ECOD classification hierarchy
    from ECOD flat files.

    http://prodata.swmed.edu/ecod/

    See:
    H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H. Kim, N. V. Grishin. (2014)
    ECOD: An evolutionary classification of protein domains. PLoS Comput Biol 10(12): e1003926.

    Linking details:  http://prodata.swmed.edu/ecod/complete/domain/<domainId>

                      http://prodata.swmed.edu/ecod/complete/domain/e6sl5G1
    """

    #
    # --
    def __init__(self, cachePath, useCache, **kwargs):
        self.__cachePath = cachePath
        self.__useCache = useCache
        dirName = "ecod"
        super(EcodClassificationProvider, self).__init__(self.__cachePath, [dirName])
        self.__dirPath = os.path.join(cachePath, "ecod")
        self.__version = None
        #
        urlTarget = kwargs.get("ecodTargetUrl", "http://prodata.swmed.edu/ecod/distributions/ecod.latest.domains.txt")
        urlBackup = kwargs.get("ecodUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/ECOD/ecod.latest.domains.txt.gz")
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__pD, self.__nD, self.__ntD, self.__pdbD = self.__reload(urlTarget, urlBackup, self.__dirPath, useCache=useCache)

    def testCache(self):
        logger.info("ECOD Lengths nD %d pdbD %d", len(self.__nD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pdbD) > 5000):
            return True
        return False

    def getVersion(self):
        return self.__version

    # --
    def getFamilyIds(self, pdbId, authAsymId):
        try:
            return list(set([tup[1] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getDomainIds(self, pdbId, authAsymId):
        try:
            return list(set([tup[0] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyNames(self, pdbId, authAsymId):
        try:
            return list(set([self.getName(tup[1]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyResidueRanges(self, pdbId, authAsymId):
        try:
            # pdbD.setdefault((pdbId, authAsymId), []).append((domId, fId, authAsymId, authSeqBeg, authSeqEnd))
            return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getName(self, domId):
        try:
            return self.__nD[domId].split("|")[0]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getNameType(self, domId):
        qD = {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"}
        try:
            return qD[self.__ntD[domId]]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getIdLineage(self, domId):
        pList = []
        try:
            pList.append(domId)
            if domId == 0:
                return pList
            pt = self.__pD[domId]
            while (pt is not None) and (pt != 0):
                pList.append(pt)
                pt = self.__pD[pt]
        except Exception as e:
            logger.exception("Failing for %r with %s", domId, str(e))
        #
        pList.reverse()
        return pList

    def getNameLineage(self, domId):
        try:
            nL = []
            for dId in self.getIdLineage(domId):
                tN = self.getName(dId)
                tN = tN if tN else "Unnamed"
                nL.append(tN)
            return nL
        except Exception as e:
            logger.exception("Failing for %r with %s", domId, str(e))
        return None

    def getTreeNodeList(self):
        return self.__exportTreeNodeList(self.__pD)

    def __getDomainFileName(self):
        pyVersion = sys.version_info[0]
        fn = "ecod_domains-py%s.pic" % str(pyVersion)
        return fn

    def __reload(self, urlTarget, urlBackup, ecodDirPath, useCache=True):
        pD = nD = ntD = pdbD = {}
        fn = self.__getDomainFileName()
        ecodDomainPath = os.path.join(ecodDirPath, fn)
        self.__mU.mkdir(ecodDirPath)
        #
        if useCache and self.__mU.exists(ecodDomainPath):
            sD = self.__mU.doImport(ecodDomainPath, fmt="pickle")
            logger.debug("ECOD domain length %d", len(sD))
            nD = sD["names"]
            ntD = sD["nametypes"]
            pD = sD["parents"]
            pdbD = sD["assignments"]
            self.__version = sD["version"]
        elif not useCache:
            minLen = 1000
            logger.info("Fetch ECOD name and domain assignment data from primary data source %s", urlTarget)
            nmL = self.__fetchFromSource(urlTarget)
            if not nmL:
                nmL = self.__fetchFromSource(urlBackup)
            #
            logger.info("ECOD raw file length (%d)", len(nmL))
            ok = False
            pD, nD, ntD, pdbD = self.__extractDomainHierarchy(nmL)
            #
            tS = datetime.datetime.now().isoformat()
            vS = self.__version
            sD = {"version": vS, "created": tS, "names": nD, "nametypes": ntD, "parents": pD, "assignments": pdbD}
            if (len(nD) > minLen) and (len(pD) > minLen):
                ok = self.__mU.doExport(ecodDomainPath, sD, fmt="pickle")
            logger.debug("Cache save status %r", ok)
            #
        return pD, nD, ntD, pdbD

    def __fetchFromSource(self, urlTarget):
        """Fetch the classification names and domain assignments from the ECOD repo."""
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(self.__dirPath, fn)
        if not fU.exists(fp):
            fU.get(urlTarget, fp)
        #
        with open(fp, "r", encoding="utf-8") as ifh:
            line = ifh.readline()
            line = ifh.readline()
            line = ifh.readline()
            ff = line[:-1].split()
            self.__version = ff[-1]
        #
        nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
        fU.remove(fp)
        #
        return nmL

    def __extractDomainHierarchy(self, nmL):
        """
        #/data/ecod/database_versions/v280/ecod.develop280.domains.txt
        #ECOD version develop280
        #Domain list version 1.6
        #Grishin lab (http://prodata.swmed.edu/ecod)
        #uid	ecod_domain_id	manual_rep	f_id	pdb	chain	pdb_range	seqid_range	unp_acc	arch_name	x_name	h_name	t_name	f_name	asm_status	ligand
        002728551	e7d2xA1	AUTO_NONREP	1.1.1	7d2x	A	A:-3-183	A:20-206	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        002728572	e7d5aA2	AUTO_NONREP	1.1.1	7d5a	A	A:-3-183	A:20-206	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        002726563	e7b1eA1	AUTO_NONREP	1.1.1	7b1e	A	A:46P-183	A:14-199	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        002726573	e7b1pA2	AUTO_NONREP	1.1.1	7b1p	A	A:47P-183	A:15-199	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        """
        assignD = {}
        pD = {}
        ntD = {}
        hD = {}
        pIdD = {}
        nmD = {}
        #
        logger.info("Length of input ECOD name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split("\t")
            # uId = ff[0]
            # ecodId is the linkable identifier -
            ecodId = ff[1]
            entryId = ff[4].lower()
            authAsymId = ff[5]
            resRange = ff[6]
            #
            #  There are no unique identifiers published for the internal elements of the hierarchy,
            #   so these are assigned here (similar to SCOP). There are also many unnamed nodes
            #   that are conventionally filled in from the leaf levels of the tree.
            #  {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"}
            aGroupOrg = "A: " + ff[9].replace('"', "")
            xGroupOrg = "X: " + ff[10].replace('"', "")
            hGroupOrg = "H: " + ff[11].replace('"', "")
            tGroupOrg = "T: " + ff[12].replace('"', "")
            fGroupOrg = "F: " + ff[13].replace('"', "")
            if hGroupOrg == "H: NO_H_NAME":
                # hGroupOrg = tGroupOrg  + "|(NO_H)"
                hGroupOrg = "H: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_H)"
            if xGroupOrg == "X: NO_X_NAME":
                if ff[11].replace('"', "") == "NO_H_NAME":
                    # xGroupOrg = hGroupOrg + "|(NO_X)"
                    xGroupOrg = "X: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_X)"
                else:
                    xGroupOrg = "X: " + ff[11].replace('"', "") + " (From Homology)" + "|(NO_X)"
                #
            fGroupOrg = fGroupOrg if fGroupOrg != "F_UNCLASSIFIED" else "Unmapped domain of " + tGroupOrg
            #
            # Remove redundancy in names and assign unique ids
            #
            aGroup = aGroupOrg
            xGroup = xGroupOrg + "|" + aGroupOrg
            hGroup = hGroupOrg + "|" + xGroupOrg + "|" + aGroupOrg
            tGroup = tGroupOrg + "|" + hGroupOrg + "|" + xGroupOrg
            fGroup = fGroupOrg + "|" + tGroupOrg
            #
            hD.setdefault("A", set()).add(aGroup)
            hD.setdefault("X", set()).add(xGroup)
            hD.setdefault("H", set()).add(hGroup)
            hD.setdefault("T", set()).add(tGroup)
            hD.setdefault("F", set()).add(fGroup)
            aId = 100000 + len(hD["A"])
            xId = 200000 + len(hD["X"])
            hId = 300000 + len(hD["H"])
            tId = 400000 + len(hD["T"])
            fId = 500000 + len(hD["F"])
            #
            #
            if xGroup in pD and pD[xGroup] != aGroup:
                logger.error("skipping %r multiple parents for xGroup %r  %r and %r ", ecodId, xGroup, pD[xGroup], aGroup)
                continue
            #
            if hGroup in pD and pD[hGroup] != xGroup:
                logger.error("skipping %r multiple parents for hGroup %r  %r and %r ", ecodId, hGroup, pD[hGroup], xGroup)
                continue
            #
            if tGroup in pD and pD[tGroup] != hGroup:
                logger.error("skipping %r multiple parents for tGroup %r  %r and %r ", ecodId, tGroup, pD[tGroup], hGroup)
                continue
            #
            if fGroup in pD and pD[fGroup] != tGroup:
                logger.error("skipping %r multiple parents for fGroup %r  %r and %r ", ecodId, fGroup, pD[fGroup], tGroup)
                continue

            if xId in pIdD and pIdD[xId] != aId:
                logger.error("skipped %r multiple parents for xId %r  %r and %r ", ecodId, xId, pIdD[xId], aId)
            #
            if hId in pIdD and pIdD[hId] != xId:
                logger.error("skipped %r multiple parents for hId %r  %r and %r ", ecodId, hId, pIdD[hId], xId)
            #
            if tId in pIdD and pIdD[tId] != hId:
                logger.error("skipped %r multiple parents for tId %r  %r and %r ", ecodId, tId, pIdD[tId], hId)
            #
            if fId in pIdD and pIdD[fId] != tId:
                logger.error("skipped %r multiple parents for fId %r  %r and %r ", ecodId, fId, pIdD[fId], tId)

            #
            pIdD[aId] = 0
            pIdD[xId] = aId
            pIdD[hId] = xId
            pIdD[tId] = hId
            pIdD[fId] = tId
            #
            nmD[aId] = aGroupOrg
            nmD[xId] = xGroupOrg
            nmD[hId] = hGroupOrg
            nmD[tId] = tGroupOrg
            nmD[fId] = fGroupOrg
            #
            ntD[aId] = "A"
            ntD[xId] = "X"
            ntD[hId] = "H"
            ntD[tId] = "T"
            ntD[fId] = "F"
            rL = self.__parseRanges(resRange)
            if (entryId, authAsymId) not in assignD:
                assignD[(entryId, authAsymId)] = [(ecodId, fId, t[0], t[1], t[2]) for t in rL]
            else:
                for t in rL:
                    assignD[(entryId, authAsymId)].append((ecodId, fId, t[0], t[1], t[2]))
            #
        return pIdD, nmD, ntD, assignD

    def __parseRanges(self, rS):
        rL = []
        authAsymId = authSeqBeg = authSeqEnd = None
        try:
            tSL = rS.split(",")
            for tS in tSL:
                fL = tS.split(":")
                authAsymId = fL[0]
                rS = fL[1]
                if rS[0] == "-":
                    authSeqBeg = -int(rS[1:].split("-")[0])
                    authSeqEnd = int(rS[1:].split("-")[1])
                else:
                    authSeqBeg = int(rS.split("-")[0])
                    authSeqEnd = int(rS.split("-")[1])
                rL.append((authAsymId, authSeqBeg, authSeqEnd))
        except Exception:
            pass
        return rL

    def __exportTreeNodeList(self, pD):
        """Create node list from name dictionary and lineage dictionaries."""
        #
        rootId = 0
        pL = [rootId]
        #
        logger.info("pD %d pL %r", len(pD), pL)
        # --
        #
        # create child dictionary
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.info("cD %d", len(cD))
        #
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.debug("No children for Ecod tId %s", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = self.getName(tId)
            ptId = pD[tId] if tId in pD else None
            lL = self.getIdLineage(tId)[1:]
            #
            if tId == rootId:
                continue
            elif ptId == rootId:
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                dD = {"id": str(tId), "name": displayName, "parents": [str(ptId)], "depth": len(lL)}
            dL.append(dD)

        return dL
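

# Illustrative usage sketch only (not part of the original module): exampleEcodLookup() shows the
# expected EcodClassificationProvider lookups for a hypothetical PDB entry and chain; the cache
# path and identifiers are placeholders, and construction with useCache=False fetches the ECOD
# domain assignment file from the primary or backup source.
def exampleEcodLookup(cachePath="./CACHE", pdbId="4hrt", authAsymId="A"):
    """Return ECOD family names and residue ranges for a hypothetical entry/chain."""
    ecodP = EcodClassificationProvider(cachePath, False)
    if not ecodP.testCache():
        return None
    return {
        "version": ecodP.getVersion(),
        "familyNames": ecodP.getFamilyNames(pdbId, authAsymId),
        "residueRanges": ecodP.getFamilyResidueRanges(pdbId, authAsymId),
    }
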
class ReferenceSequenceAssignmentProvider(object):
    """Utilities to cache content required to update referencence sequence assignments."""

    def __init__(
        self,
        cfgOb,
        databaseName="pdbx_core",
        collectionName="pdbx_core_polymer_entity",
        polymerType="Protein",
        referenceDatabaseName="UniProt",
        provSource="PDB",
        maxChunkSize=100,
        fetchLimit=None,
        **kwargs
    ):
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__maxChunkSize = maxChunkSize
        self.__statusList = []
        #
        self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)

    def goIdExists(self, goId):
        try:
            return self.__goP.exists(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return False

    def getGeneOntologyLineage(self, goIdL):
        # "id"     "name"
        gL = []
        try:
            gTupL = self.__goP.getUniqueDescendants(goIdL)
            for gTup in gTupL:
                gL.append({"id": gTup[0], "name": gTup[1]})
        except Exception as e:
            logger.exception("Failing for %r with %s", goIdL, str(e))
        return gL

    def getPfamProvider(self):
        return self.__pfP

    def getInterProProvider(self):
        return self.__ipP

    def getEcProvider(self):
        return self.__ecP

    def getSiftsSummaryProvider(self):
        return self.__ssP

    def getMatchInfo(self):
        return self.__matchD

    def getRefData(self):
        return self.__refD

    def getDocuments(self, formatType="exchange"):
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefIdMap(self):
        return self.__refIdMapD

    def getRefDataCount(self):
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False):
        okC = True
        logger.info("Reference cache lengths: refIdMap %d matchD %d refD %d", len(self.__refIdMapD), len(self.__matchD), len(self.__refD))
        ok = bool(self.__refIdMapD and self.__matchD and self.__refD)
        #
        numRef = len(self.__refIdMapD)
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for _, mD in self.__matchD.items():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                okC = False
            logger.info("Primary reference match percent test status %r", okC)
        #
        if logSizes:
            logger.info(
                "Pfam %.2f InterPro %.2f SIFTS %.2f GO %.2f EC %.2f RefIdMap %.2f RefMatchD %.2f RefD %.2f",
                getObjSize(self.__pfP) / 1000000.0,
                getObjSize(self.__ipP) / 1000000.0,
                getObjSize(self.__ssP) / 1000000.0,
                getObjSize(self.__goP) / 1000000.0,
                getObjSize(self.__ecP) / 1000000.0,
                getObjSize(self.__refIdMapD) / 1000000.0,
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC
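
    # Illustrative sketch only: exampleReferenceSummarySketch() is a hypothetical helper, not part
    # of the original class, summarizing the cached reference data via the public accessors; it
    # assumes the provider was constructed successfully with a valid configuration object.
    def exampleReferenceSummarySketch(self):
        """Return a small summary of the cached reference sequence assignment data."""
        return {
            "referenceCount": self.getRefDataCount(),
            "assignedAccessionCount": len(self.getRefIdMap()),
            "exchangeDocumentCount": len(self.getDocuments(formatType="exchange")),
        }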

    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs):
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
        refIdMapD, _ = self.__getAssignmentMap(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
        #
        entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
        siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID")
        logger.info("Incorporating %d SIFTS accessions for %d entries", len(siftsUniProtL), len(entryIdL))
        unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
        #
        logger.info("Rebuild cache for %d UniProt accessions (consolidated)", len(unpIdList))
        #
        matchD, refD = self.__rebuildReferenceCache(unpIdList, referenceDatabaseName, **kwargs)
        return refIdMapD, matchD, refD

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
            (dict): {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                                "rcsb_polymer_entity_align": [],
                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
        """
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                    # "rcsb_polymer_entity_align",
                    # "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    # "rcsb_polymer_entity_container_identifiers.related_annotation_identifiers",
                    # "rcsb_polymer_entity_annotation",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count type %s is %d", polymerType, eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity count %d ref accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getAssignmentMap(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        numMissingTaxons = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                            numMissingTaxons += 1

                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        numMultipleTaxons = 0
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.debug("Multitple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
                numMultipleTaxons += 1

        logger.info("Entities with missing taxonomy %d", numMissingTaxons)
        logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
        logger.info("Unique %s accession assignments by %s %d (entities missing archive accession assignments %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD
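
    # A minimal worked example of the inversion performed above, assuming a single entity
    # carrying one UniProt accession assigned with PDB provenance (values are hypothetical):
    #
    #   objD = {"1ABC_1": {... accession "P12345", provenance "PDB", taxonomy 9606 ...}}
    #   refIdD, taxIdD = self.__getAssignmentMap(objD)
    #   # refIdD -> {"P12345": ["1ABC_1"]}
    #   # taxIdD -> {"P12345": [9606]}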

    #
    def __rebuildReferenceCache(self, idList, refDbName, **kwargs):
        """ """
        fetchLimit = None
        doMissing = True
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        # cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = refDbName + "-ref-sequence-data-cache" + "." + ext
        dataCacheFilePath = os.path.join(dirPath, fn)
        #
        fn = refDbName + "-ref-sequence-id-cache" + ".json"
        accCacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [dataCacheFilePath, accCacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and accCacheFilePath and self.__mU.exists(accCacheFilePath) and dataCacheFilePath and self.__mU.exists(dataCacheFilePath):
            dD = self.__mU.doImport(dataCacheFilePath, **cacheKwargs)
            idD = self.__mU.doImport(accCacheFilePath, fmt="json")
            logger.info("Reading cached reference sequence ID and data cache files - cached match reference length %d", len(idD["matchInfo"]))
            idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
            # Check for completeness -
            if doMissing:
                missingS = set(idList) - set(idD["matchInfo"].keys())
                if missingS:
                    logger.info("Reference sequence cache missing %d accessions", len(missingS))
                    extraD, extraIdD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                    dD["refDbCache"].update(extraD["refDbCache"])
                    idD["matchInfo"].update(extraIdD["matchInfo"])
                    #
                    idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
                    #
                    if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                        self.__mU.mkdir(dirPath)
                        ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                        ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                        logger.info("Cache updated with missing references with status %r", ok1 and ok2)
            #
        else:
            logger.info("Rebuilding reference cache for %s for %d accessions with limit %r", refDbName, len(idList), fetchLimit)
            dD, idD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                logger.info("Cache save status %r", ok1 and ok2)

        return idD["matchInfo"], dD["refDbCache"]

    def __rebuildReferenceMatchIndex(self, idList, referenceD):
        fobj = UniProtUtils()
        logger.info("Rebuilding match index on idList (%d) using reference data (%d) %r", len(idList), len(referenceD), type(referenceD))
        matchD = fobj.rebuildMatchResultIndex(idList, referenceD)
        return matchD

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}}
        idD = {"matchInfo": {}, "refIdMap": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                logger.info("Maximum reference chunk size %d", self.__maxChunkSize)
                refD, matchD = fobj.fetchList(idList, maxChunkSize=self.__maxChunkSize)
                dD = {"refDbName": refDbName, "refDbCache": refD}
                idD = {"matchInfo": matchD}
            #
            # Check the coverage -
            #
            countD = defaultdict(int)
            logger.info("Match dictionary length %d", len(matchD))
            for _, mD in matchD.items():
                if "matched" in mD:
                    countD[mD["matched"]] += 1
            logger.info("Reference length %d match length %d coverage %r", len(refD), len(matchD), countD.items())
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD, idD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        abbreviated = kwargs.get("siftsAbbreviated", "TEST")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        # logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        ok = ssP.testCache()
        logger.debug("SIFTS cache status %r", ok)
        logger.debug("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __fetchGoProvider(self, cfgOb, configName, **kwargs):
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        logger.debug("goP %r %r", cacheDirPath, useCache)
        goP = GeneOntologyProvider(goDirPath=cacheDirPath, useCache=useCache)
        ok = goP.testCache()
        logger.debug("Gene Ontology (%r) root node count %r", ok, goP.getRootNodes())
        return goP

    def __fetchEcProvider(self, cfgOb, configName, **kwargs):
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
        logger.debug("ecP %r %r", cacheDirPath, useCache)
        ecP = EnzymeDatabaseProvider(enzymeDirPath=cacheDirPath, useCache=useCache)
        ok = ecP.testCache()
        logger.debug("Enzyme cache status %r", ok)
        return ecP

    def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        pfP = PfamProvider(cachePath=cachePath, useCache=useCache)
        ok = pfP.testCache()
        return pfP if ok else None

    def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ipP = InterProProvider(cachePath=cachePath, useCache=useCache)
        ok = ipP.testCache()
        return ipP if ok else None


class ChemCompSearchWrapper(SingletonClass):
    """Wrapper for chemical component search operations."""
    def __init__(self, **kwargs):
        """Wrapper class for chemical search/depiction operations.

        Path and prefix data for this wrapper class may be set as keyword arguments
        or as environment variables.

        Args:
            cachePath (str): path to top-level cache directory used to store search index file dependencies
                             (default environment variable CHEM_SEARCH_CACHE_PATH or ".")
            ccFileNamePrefix (str): prefix code used to distinguish different subsets of chemical definitions
                                    (default environment variable CHEM_SEARCH_CC_PREFIX or "cc-full")

        """
        self.__startTime = time.time()
        #
        self.__cachePath = kwargs.get(
            "cachePath", os.environ.get("CHEM_SEARCH_CACHE_PATH", "."))
        self.__ccFileNamePrefix = kwargs.get(
            "ccFileNamePrefix",
            os.environ.get("CHEM_SEARCH_CC_PREFIX", "cc-full"))
        #
        self.__dependFileName = "ChemCompSearchWrapperData.tar.gz"
        self.__dependTarFilePath = os.path.join(self.__cachePath,
                                                self.__dependFileName)
        # ---
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        # ---
        self.__configD = {}
        self.__ccIdxP = None
        self.__siIdxP = None
        self.__siIdx = {}
        self.__oesmP = None
        self.__oesU = None
        self.__oesubsU = None
        # ---
        self.__statusDescriptorError = -100
        self.__searchError = -200
        self.__searchSuccess = 0

    def setConfig(self, ccUrlTarget, birdUrlTarget, **kwargs):
        """Provide the chemical definition source path details for rebuilding search
           index file dependencies.

        Args:
            ccUrlTarget (str): path to concatenated chemical component definition file
            birdUrlTarget (str): path to the concatenated BIRD definition file

            Other options are propagated to configurations of the wrapped classes in __bootstrapConfig()

        """
        kwargs["ccUrlTarget"] = ccUrlTarget
        kwargs["birdUrlTarget"] = birdUrlTarget
        kwargs["cachePath"] = self.__cachePath
        kwargs["ccFileNamePrefix"] = self.__ccFileNamePrefix
        self.__configD = self.__bootstrapConfig(**kwargs)
        return len(self.__configD) >= 3
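
    # A hedged usage sketch (paths and prefix are placeholders, not shipped defaults):
    #
    #   ccsw = ChemCompSearchWrapper(cachePath="./CACHE", ccFileNamePrefix="cc-abbrev")
    #   ok = ccsw.setConfig(
    #       ccUrlTarget="./data/components-abbrev.cif",
    #       birdUrlTarget="./data/prdcc-abbrev.cif",
    #   )
    #   # ok is True when the bootstrap configuration holds its three expected sections
    #   # (versionNumber, ccsiKwargs, oesmpKwargs).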

    def __bootstrapConfig(self, **kwargs):
        """Build on-the-fly default configuration for this wrapper class."""
        # The following few options have no defaults -- and should be specified.
        ccUrlTarget = kwargs.get("ccUrlTarget", None)
        birdUrlTarget = kwargs.get("birdUrlTarget", None)
        cachePath = kwargs.get("cachePath", None)
        ccFileNamePrefix = kwargs.get("ccFileNamePrefix", None)
        logger.info("Bootstrap configuration for prefix %r cc %r bird %r",
                    ccFileNamePrefix, ccUrlTarget, birdUrlTarget)
        # ---
        #  Reasonable values are selected for the remaining options...
        oeFileNamePrefix = "oe-" + ccFileNamePrefix
        try:
            storeConfig = kwargs.get("storeConfig", True)
            molLimit = kwargs.get("molLimit", None)
            useCache = kwargs.get("useCache", False)
            logSizes = kwargs.get("logSizes", False)
            #
            numProc = kwargs.get("numProc", 12)
            maxProc = os.cpu_count()
            numProc = min(numProc, maxProc)
            maxChunkSize = kwargs.get("maxChunkSize", 50)

            #
            logger.debug("+++ >>> Assigning numProc as %d", numProc)
            #
            limitPerceptions = kwargs.get("limitPerceptions", False)
            quietFlag = kwargs.get("quietFlag", True)
            #
            # fpTypeCuttoffD = {"TREE": 0.6, "MACCS": 0.9, "PATH": 0.6, "CIRCULAR": 0.6, "LINGO": 0.9}
            fpTypeCuttoffD = kwargs.get("fpTypeCuttoffD", {
                "TREE": 0.6,
                "MACCS": 0.9
            })
            buildTypeList = kwargs.get("buildTypeList", [
                "oe-iso-smiles", "oe-smiles", "cactvs-iso-smiles",
                "cactvs-smiles", "inchi"
            ])
            #
            oesmpKwargs = {
                "ccUrlTarget": ccUrlTarget,
                "birdUrlTarget": birdUrlTarget,
                "cachePath": cachePath,
                "useCache": useCache,
                "ccFileNamePrefix": ccFileNamePrefix,
                "oeFileNamePrefix": oeFileNamePrefix,
                "limitPerceptions": limitPerceptions,
                "minCount": None,
                "maxFpResults": 50,
                "fpTypeCuttoffD": fpTypeCuttoffD,
                "buildTypeList": buildTypeList,
                "screenTypeList": None,
                "quietFlag": quietFlag,
                "numProc": numProc,
                "maxChunkSize": maxChunkSize,
                "molLimit": molLimit,
                "logSizes": logSizes,
                "suppressHydrogens": True,
            }
            ccsiKwargs = {
                "ccUrlTarget": ccUrlTarget,
                "birdUrlTarget": birdUrlTarget,
                "cachePath": cachePath,
                "useCache": useCache,
                "ccFileNamePrefix": ccFileNamePrefix,
                "oeFileNamePrefix": oeFileNamePrefix,
                "limitPerceptions": limitPerceptions,
                "minCount": None,
                "numProc": numProc,
                "quietFlag": quietFlag,
                "maxChunkSize": maxChunkSize,
                "molLimit": None,
                "logSizes": False,
            }
            configD = {
                "versionNumber": 0.30,
                "ccsiKwargs": ccsiKwargs,
                "oesmpKwargs": oesmpKwargs
            }
            #
            if storeConfig:
                configDirPath = os.path.join(cachePath, "config")
                configFilePath = os.path.join(
                    configDirPath, ccFileNamePrefix + "-config.json")
                logger.info("Saving configuration bootstrap in %r",
                            configFilePath)
                self.__mU.mkdir(configDirPath)
                self.__mU.doExport(configFilePath,
                                   configD,
                                   fmt="json",
                                   indent=3)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return configD

    def readConfig(self, resetCachePath=True):
        """Read a prepared configuration file for the search wrapper class. This will override
        any default configuration settings.

        Args:
             resetCachePath (bool): update cachePath configuration option with the current cachePath setting.

        Returns:
            bool : True for success or False otherwise
        """
        #
        #
        ok = False
        try:
            #
            configFilePath = os.path.join(
                self.__cachePath, "config",
                self.__ccFileNamePrefix + "-config.json")
            configD = self.__mU.doImport(configFilePath, fmt="json")
            logger.debug("ConfigD: %r", configD)
            if configD and (len(configD) > 2) and float(
                    configD["versionNumber"]) > 0.2:
                logger.info("Read version %r sections %r from %s",
                            configD["versionNumber"], list(configD.keys()),
                            configFilePath)
                ok = True
                self.__configD = configD
                if resetCachePath:
                    # Allow the configuration to be relocatable.
                    configD["ccsiKwargs"]["cachePath"] = self.__cachePath
                    configD["oesmpKwargs"]["cachePath"] = self.__cachePath
            else:
                logger.error("Reading config file fails from %r",
                             configFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    def buildDependenices(self, ccUrlTarget, birdUrlTarget, **kwargs):
        """Convenience method to build configuration and static dependencies for the chemical search services.

        Args:
            ccUrlTarget (str): path to source concatenated chemical component definition file
            birdUrlTarget (str): path to the source concatenated BIRD definition file

            Other options are propagated to configurations of the wrapped classes in __bootstrapConfig()

        """
        try:
            okT = False
            ok1 = self.setConfig(ccUrlTarget=ccUrlTarget,
                                 birdUrlTarget=birdUrlTarget,
                                 **kwargs)
            useCache = kwargs.get("useCache", False)
            ok2 = self.updateChemCompIndex(useCache=useCache)
            ok3 = self.updateSearchIndex(useCache=useCache)
            ok4 = self.updateSearchMoleculeProvider(useCache=useCache)
            okBuild = ok1 and ok2 and ok3 and ok4
            if okBuild:
                fileU = FileUtil()
                dirPathList = [
                    os.path.join(self.__cachePath, subDir)
                    for subDir in ["chem_comp", "oe_mol", "config"]
                ]
                okT = fileU.bundleTarfile(self.__dependTarFilePath,
                                          dirPathList,
                                          mode="w:gz",
                                          recursive=True)
            #
            return okT and okBuild
        except Exception as e:
            logger.exception("Failing build with %r and %r with %s",
                             ccUrlTarget, birdUrlTarget, str(e))
        return False
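
    # Possible end-to-end build flow (a sketch; the file paths are hypothetical):
    #
    #   ccsw = ChemCompSearchWrapper(cachePath="./CACHE", ccFileNamePrefix="cc-full")
    #   ok = ccsw.buildDependenices(
    #       ccUrlTarget="./data/components.cif.gz",
    #       birdUrlTarget="./data/prdcc.cif.gz",
    #   )
    #   # On success the chem_comp, oe_mol and config cache directories are bundled into
    #   # ChemCompSearchWrapperData.tar.gz beneath cachePath, ready to be stashed remotely.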

    def stashDependencies(self,
                          url,
                          dirPath,
                          bundleLabel="A",
                          userName=None,
                          pw=None):
        """Store a copy of the bundled search dependencies remotely -

        Args:
            url (str): URL string for the destination host (e.g. sftp://myserver.net or None for a local file)
            dirPath (str): directory path on the remote resource
            bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            pw (str, optional): optional access information. Defaults to None.

        Returns:
          bool:  True for success or False otherwise

        """
        try:
            ok = False
            fn = self.__makeBundleFileName(self.__dependFileName,
                                           bundleLabel=bundleLabel)
            if url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                hostName = url[7:]
                ok = sftpU.connect(hostName, userName, pw=pw, port=22)
                if ok:
                    remotePath = os.path.join("/", dirPath, fn)
                    ok = sftpU.put(self.__dependTarFilePath, remotePath)
            elif not url:
                fileU = FileUtil()
                remotePath = os.path.join(dirPath, fn)
                ok = fileU.put(self.__dependTarFilePath, remotePath)
            else:
                logger.error("Unsupported stash protocol %r", url)
            return ok
        except Exception as e:
            logger.exception("For %r %r failing with %s", url, dirPath, str(e))
        return False

    def __makeBundleFileName(self, rootName, bundleLabel="A"):
        fn = rootName
        try:
            fn = rootName
            fn = "%s-%s" % (bundleLabel.upper(),
                            rootName) if bundleLabel else rootName
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return fn

    def restoreDependencies(self,
                            url,
                            dirPath,
                            bundleLabel="A",
                            userName=None,
                            pw=None):
        """Restore bundled dependencies from remote storage and unbundle these in the
           current local cache directory.

        Args:
            url (str): remote URL
            dirPath (str): directory path on the remote resource
            bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            pw (str, optional): optional access information. Defaults to None.
        """
        try:
            ok = False
            fileU = FileUtil()
            fn = self.__makeBundleFileName(self.__dependFileName,
                                           bundleLabel=bundleLabel)
            if not url:
                remotePath = os.path.join(dirPath, fn)
                ok = fileU.get(remotePath, self.__dependTarFilePath)

            elif url and url.startswith("http://"):
                remotePath = url + os.path.join("/", dirPath, fn)
                ok = fileU.get(remotePath, self.__dependTarFilePath)

            elif url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                ok = sftpU.connect(url[7:], userName, pw=pw, port=22)
                if ok:
                    remotePath = os.path.join(dirPath, fn)
                    ok = sftpU.get(remotePath, self.__dependTarFilePath)
            else:
                logger.error("Unsupported protocol %r", url)
            if ok:
                ok = fileU.unbundleTarfile(self.__dependTarFilePath,
                                           dirPath=self.__cachePath)
            return ok
        except Exception as e:
            logger.exception("For %r %r Failing with %s", url, dirPath, str(e))
            ok = False
        return ok
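
    # Stash/restore round-trip sketch (host, path and credentials are placeholders):
    #
    #   ccsw.stashDependencies("sftp://myserver.net", "stash/chem", userName="user", pw="secret")
    #   ...
    #   ccsw.restoreDependencies("sftp://myserver.net", "stash/chem", userName="user", pw="secret")
    #   # The retrieved bundle (prefixed with the bundle label, e.g. A-ChemCompSearchWrapperData.tar.gz)
    #   # is unpacked beneath the current cachePath.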

    def updateChemCompIndex(self, useCache=False):
        """Rebuild the basic index of source chemical component and BIRD definitions.
           Update the internal state of this index in the current object instance.

            Resource requirements: 94 sec 1 proc 7GB memory macbook pro

        Args:
            useCache (bool): False to rebuild the chemical component index and True to reload it

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            kwargs = copy.deepcopy(
                self.__configD["ccsiKwargs"]
            ) if "ccsiKwargs" in self.__configD else None
            if kwargs:
                kwargs["useCache"] = useCache
                ccIdxP = ChemCompIndexProvider(**kwargs)
                ok = ccIdxP.testCache()
                self.__ccIdxP = ccIdxP if ok else None
                logger.info("Chemical component index status %r", ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getChemCompIndex(self):
        return self.__ccIdxP.getIndex() if self.__ccIdxP else {}

    def getSearchMoleculeProvider(self):
        return self.__oesmP if self.__oesmP else None

    def updateSearchIndex(self, useCache=False):
        """Rebuild the search index from source chemical component and BIRD definitions.
           Update the internal state of this index in the current object instance.

            Resource requirements 771 secs 6 proc macbook pro 7GB memory.

        Args:
            useCache (bool): False to rebuild search index and True to reload

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            kwargs = copy.deepcopy(
                self.__configD["ccsiKwargs"]
            ) if "ccsiKwargs" in self.__configD else None
            if kwargs:
                kwargs["useCache"] = useCache
                siIdxP = ChemCompSearchIndexProvider(**kwargs)
                ok = siIdxP.testCache()
                self.__siIdxP = siIdxP if siIdxP else None
                self.__siIdx = siIdxP.getIndex() if siIdxP and ok else {}
                logger.info("Search index status %r index len %d", ok,
                            len(self.__siIdx) if self.__siIdx else 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def updateSearchMoleculeProvider(self, useCache=False):
        """Rebuild the search molecule provider.
           Update the internal state of this object reference in the current object instance.

           Resource requirements: 151 seconds 1 proc  0.5GB memory macbook pro

        Args:
            useCache (bool): False to rebuild molecule store and True to reload

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            kwargs = copy.deepcopy(
                self.__configD["oesmpKwargs"]
            ) if "oesmpKwargs" in self.__configD else None
            if kwargs:
                kwargs["useCache"] = useCache
                oesmP = OeSearchMoleculeProvider(**kwargs)
                ok = oesmP.testCache()
                self.__oesmP = oesmP if oesmP and ok else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reloadSearchDatabase(self):
        """Reload the in-memory search databases from the OE molecule provider.
           Resource requirements: ~90sec load time 0.35 GB memory

        Returns:
            bool: True for success or False otherwise
        """
        ok = ok1 = ok2 = False
        try:
            okmp = self.updateSearchMoleculeProvider(useCache=True)
            if not okmp:
                return ok
            fpTypeCuttoffD = self.__configD["oesmpKwargs"][
                "fpTypeCuttoffD"] if "fpTypeCuttoffD" in self.__configD[
                    "oesmpKwargs"] else {}
            fpTypeList = [k for k, v in fpTypeCuttoffD.items()]
            oesU = OeSearchUtils(self.__oesmP, fpTypeList=fpTypeList)
            ok1 = oesU.testCache()
            self.__oesU = oesU if ok1 else None
            #
            oesubsU = OeSubStructSearchUtils(self.__oesmP)
            ok2 = oesubsU.testCache()
            self.__oesubsU = oesubsU if ok2 else None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok1 and ok2
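
    # Typical service start-up sequence once the dependencies are in place (a sketch):
    #
    #   ccsw.readConfig()
    #   ccsw.updateChemCompIndex(useCache=True)
    #   ccsw.updateSearchIndex(useCache=True)
    #   ccsw.reloadSearchDatabase()
    #
    # Reusing the cached artifacts (useCache=True) avoids the multi-minute rebuild costs
    # quoted in the method docstrings above.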

    def searchByDescriptor(self,
                           descriptor,
                           descriptorType,
                           matchOpts="graph-relaxed",
                           searchId=None):
        """Wrapper method for descriptor match and descriptor substructure search methods.

        Args:
            descriptor (str):  molecular descriptor (SMILES, InChI)
            descriptorType (str): descriptor type (SMILES, InChI)
            matchOpts (str, optional): graph match criteria (graph-relaxed, graph-relaxed-stereo, graph-strict,
                                       fingerprint-similarity, sub-struct-graph-relaxed, sub-struct-graph-relaxed-stereo,
                                       sub-struct-graph-strict). Defaults to "graph-relaxed".
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list, list): status, graph match and finger match lists of type (MatchResults)
                                      -100 descriptor processing error
                                      -200 search execution error
                                         0 search execution success
        """
        if matchOpts.startswith("sub-struct-"):
            return self.subStructSearchByDescriptor(descriptor,
                                                    descriptorType,
                                                    matchOpts=matchOpts,
                                                    searchId=searchId)
        else:
            return self.matchByDescriptor(descriptor,
                                          descriptorType,
                                          matchOpts=matchOpts,
                                          searchId=searchId)

    def matchByDescriptor(self,
                          descriptor,
                          descriptorType,
                          matchOpts="graph-relaxed",
                          searchId=None):
        """Return graph match (w/  finger print pre-filtering) and finger print search results for the
           input desriptor.

        Args:
            descriptor (str):  molecular descriptor (SMILES, InChI)
            descriptorType (str): descriptor type (SMILES, InChI
            matchOpts (str, optional): graph match criteria (graph-relaxed, graph-relaxed-stereo, graph-strict,
                                       fingerprint-similarity, Defaults to "graph-relaxed")
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list, list): status, graph match and finger match lists of type (MatchResults)
                                      -100 descriptor processing error
                                      -200 search execution error
                                         0 search execution success
        """
        ssL, fpL = [], []
        retStatus = False
        statusCode = -200
        try:
            fpTypeCuttoffD = self.__configD["oesmpKwargs"][
                "fpTypeCuttoffD"] if "fpTypeCuttoffD" in self.__configD[
                    "oesmpKwargs"] else {}
            maxFpResults = self.__configD["oesmpKwargs"][
                "maxFpResults"] if "maxFpResults" in self.__configD[
                    "oesmpKwargs"] else 50
            limitPerceptions = self.__configD["oesmpKwargs"][
                "limitPerceptions"] if "limitPerceptions" in self.__configD[
                    "oesmpKwargs"] else False
            #
            searchId = searchId if searchId else "query"
            messageTag = searchId + ":" + descriptorType
            oeioU = OeIoUtils()
            oeMol = oeioU.descriptorToMol(descriptor,
                                          descriptorType,
                                          limitPerceptions=limitPerceptions,
                                          messageTag=messageTag)
            oeMol = oeioU.suppressHydrogens(oeMol)
            if not oeMol:
                logger.warning("descriptor type %r molecule build fails: %r",
                               descriptorType, descriptor)
                return self.__statusDescriptorError, ssL, fpL
            #
            retStatus, ssL, fpL = self.__oesU.searchSubStructureAndFingerPrint(
                oeMol,
                list(fpTypeCuttoffD.items())[:2],
                maxFpResults,
                matchOpts=matchOpts)
            statusCode = 0 if retStatus else self.__searchError
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            #
        return statusCode, ssL, fpL
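
    # Example descriptor match call, continuing the wrapper sketch above (benzene SMILES;
    # results depend on the loaded index):
    #
    #   retCode, ssL, fpL = ccsw.matchByDescriptor("c1ccccc1", "SMILES", matchOpts="graph-relaxed")
    #   # retCode: 0 on success, -100 for a descriptor build error, -200 for a search error;
    #   # ssL holds graph match results and fpL fingerprint match results (MatchResults).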

    def subStructSearchByDescriptor(self,
                                    descriptor,
                                    descriptorType,
                                    matchOpts="sub-struct-graph-relaxed",
                                    searchId=None):
        """Return graph match (w/  finger print pre-filtering) and finger print search results for the
           input desriptor.

        Args:
            descriptor (str):  molecular descriptor (SMILES, InChI)
            descriptorType (str): descriptor type (SMILES, InChI)
            matchOpts (str, optional): graph match criteria (sub-struct-graph-relaxed, sub-struct-graph-relaxed-stereo,
                                       sub-struct-graph-strict). Defaults to "sub-struct-graph-relaxed".
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (statusCode, list, list): status, substructure search results of type (MatchResults), empty list placeholder
                                      -100 descriptor processing error
                                      -200 search execution error
                                         0 search execution success
        """
        ssL = []
        retStatus = False
        statusCode = -200
        try:
            limitPerceptions = self.__configD["oesmpKwargs"][
                "limitPerceptions"] if "limitPerceptions" in self.__configD[
                    "oesmpKwargs"] else False
            numProc = self.__configD["oesmpKwargs"][
                "numProc"] if "numProc" in self.__configD["oesmpKwargs"] else 4
            #
            searchId = searchId if searchId else "query"
            messageTag = searchId + ":" + descriptorType
            oeioU = OeIoUtils()
            oeMol = oeioU.descriptorToMol(descriptor,
                                          descriptorType,
                                          limitPerceptions=limitPerceptions,
                                          messageTag=messageTag)
            oeMol = oeioU.suppressHydrogens(oeMol)
            if not oeMol:
                logger.warning("descriptor type %r molecule build fails: %r",
                               descriptorType, descriptor)
                return self.__statusDescriptorError, ssL, []
            #
            ccIdL = self.__oesubsU.prefilterIndex(oeMol,
                                                  self.__siIdxP,
                                                  matchOpts=matchOpts)
            retStatus, ssL = self.__oesubsU.searchSubStructure(
                oeMol, ccIdList=ccIdL, matchOpts=matchOpts, numProc=numProc)
            statusCode = 0 if retStatus else self.__searchError
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            #
        return statusCode, ssL, []

    def matchByFormulaRange(self,
                            elementRangeD,
                            matchSubset=False,
                            searchId=None):
        """Return formula match results for input element range dictionary.

        Args:
            elementRangeD (dict): {'<element_name>': {'min': <int>, 'max': <int>}, ... }
            matchSubset (bool, optional): query for formula subset (default: False)
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (bool, list): status flag, list of matching chemical component identifiers
        """
        ok = False
        rL = []
        try:
            startTime = time.time()
            searchId = searchId if searchId else "query"
            rL = self.__ccIdxP.matchMolecularFormulaRange(
                elementRangeD, matchSubset=matchSubset)
            ok = True
            logger.info("%s formula %r matched %d (%.4f seconds)", searchId,
                        elementRangeD, len(rL),
                        time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok, rL
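
    # Example element range query (a sketch): match components containing 5-7 carbon,
    # 5-8 hydrogen and exactly one nitrogen atom.
    #
    #   ok, ccIdL = ccsw.matchByFormulaRange(
    #       {"C": {"min": 5, "max": 7}, "H": {"min": 5, "max": 8}, "N": {"min": 1, "max": 1}}
    #   )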

    def matchByFormula(self, formula, matchSubset=False, searchId=None):
        """Return formula match results for input molecular formula.

        Args:
            formula (str): molecular formula  (ex. 'C6H6')
            matchSubset (bool, optional): query for formula subset (default: False)
            searchId (str, optional): search identifier for logging. Defaults to None.

        Returns:
            (bool, list): status flag, list of matching chemical component identifiers
        """
        ok = False
        rL = []
        try:
            startTime = time.time()
            searchId = searchId if searchId else "query"
            mf = MolecularFormula()
            eD = mf.parseFormula(formula)
            elementRangeD = {
                k.upper(): {
                    "min": v,
                    "max": v
                }
                for k, v in eD.items()
            }
            rL = self.__ccIdxP.matchMolecularFormulaRange(
                elementRangeD, matchSubset=matchSubset)
            ok = True
            logger.info("%s formula %r matched %d (%.4f seconds)", searchId,
                        elementRangeD, len(rL),
                        time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok, rL

    def status(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6,
                    unitS)
        endTime = time.time()
        logger.info("Status at %s (up %.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)


class CurrentHoldingsProvider(object):
    """Provide inventory of current repository content."""
    def __init__(self, **kwargs):
        self.__dirPath = kwargs.get("holdingsDirPath", ".")
        useCache = kwargs.get("useCache", True)
        baseUrl = kwargs.get(
            "baseUrl",
            "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/"
        )
        urlTargetContent = kwargs.get(
            "currentTargetUrl",
            os.path.join(baseUrl, "current_holdings.json.gz"))
        urlFallbackTargetContent = kwargs.get(
            "currentTargetUrl",
            os.path.join(baseUrl, "current_holdings.json.gz"))
        #
        urlTargetIds = kwargs.get(
            "currentTargetUrl", os.path.join(baseUrl,
                                             "current_pdb_ids.json.gz"))
        urlFallbackTargetIds = kwargs.get(
            "currentTargetUrl", os.path.join(baseUrl,
                                             "current_pdb_ids.json.gz"))
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__invD = self.__reloadEntryContent(urlTargetContent,
                                                urlFallbackTargetContent,
                                                self.__dirPath,
                                                useCache=useCache)
        self.__idD = self.__reloadEntryIds(urlTargetIds,
                                           urlFallbackTargetIds,
                                           self.__dirPath,
                                           useCache=useCache)

    def testCache(self, minCount=170000):
        logger.info("Inventory length cD (%d) id list (%d)", len(self.__invD),
                    len(self.__idD))
        # JDW - restore consistency checks
        # if len(self.__invD) > minCount and len(self.__idD) > minCount and len(self.__invD) == len(self.__idD):
        if len(self.__invD) > minCount and len(self.__idD) > minCount:
            return True

        return False

    def getEntryContentTypes(self, entryId):
        """Return the current content types for the input entry identifier"""
        try:
            return sorted(self.__invD[entryId.upper()].keys())
        except Exception as e:
            logger.exception("Failing for %r with %s", entryId, str(e))
        return []

    def getEntryContentTypePathList(self, entryId, contentType):
        """Return the current content types for the input entry identifier"""
        try:
            return self.__invD[entryId.upper()][contentType]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", entryId, contentType,
                         str(e))
        return []

    def getEntryInventory(self):
        """Return the current inventory dictionary"""
        try:
            return self.__invD
        except Exception as e:
            logger.debug("Failing with %s", str(e))
        return {}

    def getEntryIdList(self, afterDateTimeStamp=None):
        """Return the ID code list or optionally IDs changed after the input time stamp.

        Args:
            afterDateTimeStamp (str, optional): ISO format date time stamp. Defaults to None.
        """
        try:
            if afterDateTimeStamp:
                dt = datetime.datetime.fromisoformat(
                    afterDateTimeStamp).replace(tzinfo=pytz.utc)
                return [k for k, v in self.__idD.items() if v > dt]
            else:
                return list(self.__idD.keys())
        except Exception as e:
            logger.error("Failing with %s", str(e))
        return []
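
    # Example (a sketch; the directory path is a placeholder): list entries updated after
    # a given ISO date.
    #
    #   chP = CurrentHoldingsProvider(holdingsDirPath="./CACHE/holdings")
    #   recentIdL = chP.getEntryIdList(afterDateTimeStamp="2021-01-01")
    #   # With no timestamp the full current ID code list is returned.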

    def __reloadEntryContent(self,
                             urlTarget,
                             urlFallbackTarget,
                             dirPath,
                             useCache=True):
        invD = {}
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            invD = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached inventory (%d)", len(invD))
        else:
            logger.info("Fetch inventory from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                ok = fU.get(urlFallbackTarget, fp)
            #
            if ok:
                invD = self.__mU.doImport(fp, fmt="json")
        return invD

    def __reloadEntryIds(self,
                         urlTarget,
                         urlFallbackTarget,
                         dirPath,
                         useCache=True):
        idD = {}
        tdL = []
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            tdL = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached IDs list (%d)", len(tdL))
        else:
            logger.info("Fetch ID list from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                ok = fU.get(urlFallbackTarget, fp)
            #
            if ok:
                tdL = self.__mU.doImport(fp, fmt="json")
        #
        for td in tdL:
            for k, v in td.items():
                try:
                    idD[k] = datetime.datetime.fromisoformat(v)
                except Exception as e:
                    logger.error("Date processing failing for %r %r with %s",
                                 k, v, str(e))
        #
        sTupL = sorted(idD.items(), key=lambda item: item[1])
        return {k: v for k, v in sTupL}


Example #20
class ScopClassificationProvider(StashableBase):
    """Extract SCOPe assignments, term descriptions and SCOP classifications
    from SCOP flat files.

    """
    def __init__(self, **kwargs):
        #
        self.__dirName = "scop"
        if "cachePath" in kwargs:
            self.__cachePath = os.path.abspath(kwargs.get("cachePath", None))
            self.__scopDirPath = os.path.join(self.__cachePath, self.__dirName)
        else:
            self.__scopDirPath = kwargs.get("scopDirPath", ".")
            self.__cachePath, self.__dirName = os.path.split(
                os.path.abspath(self.__scopDirPath))
        super(ScopClassificationProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        #
        useCache = kwargs.get("useCache", True)
        # urlTarget = kwargs.get("scopTargetUrl", "http://scop.berkeley.edu/downloads/update")
        # self.__version = kwargs.get("scopVersion", "2.07-2019-07-23")
        # self.__version = kwargs.get("scopVersion", "2.07-2020-01-23")
        # self.__version = kwargs.get("scopVersion", "2.07-2020-05-07")
        # self.__version = kwargs.get("scopVersion", "2.07-2021-07-07")
        urlTarget = kwargs.get("scopTargetUrl",
                               "http://scop.berkeley.edu/downloads/parse")
        self.__version = kwargs.get("scopVersion", "2.08-stable")
        #
        urlBackupPath = kwargs.get(
            "scopUrlBackupPath",
            "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP"
        )
        #
        self.__mU = MarshalUtil(workPath=self.__scopDirPath)
        self.__nD, self.__pD, self.__pdbD = self.__reload(
            urlTarget,
            self.__scopDirPath,
            useCache=useCache,
            version=self.__version)
        #
        if not useCache and not self.testCache():
            ok = self.__fetchFromBackup(urlBackupPath, self.__scopDirPath)
            if ok:
                self.__nD, self.__pD, self.__pdbD = self.__reload(
                    urlTarget,
                    self.__scopDirPath,
                    useCache=True,
                    version=self.__version)

    def testCache(self):
        logger.info("SCOP lengths nD %d pD %d pdbD %d", len(self.__nD),
                    len(self.__pD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pD) > 100) and (len(
                self.__pdbD) > 100):
            return True
        return False

    def __fetchFromBackup(self, urlBackupPath, scopDirPath):
        pyVersion = sys.version_info[0]
        fn = "scop_domains-py%s.pic" % str(pyVersion)
        scopDomainPath = os.path.join(scopDirPath, fn)
        self.__mU.mkdir(scopDirPath)
        #
        backupUrl = urlBackupPath + "/" + fn
        logger.info("Using backup URL %r", backupUrl)
        fU = FileUtil()
        ok = fU.get(backupUrl, scopDomainPath)
        return ok

    def getScopVersion(self):
        return self.__version

    def getScopSunIds(self, pdbId, authAsymId):
        """
        Get the sunid of the domain assignment for the assignment -

        aD[(pdbId, authAsymId)] = [(sunId, domainId, (authAsymId, resBeg, resEnd))]

        aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))]
        """
        try:
            return list(
                set([tup[0] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopDomainNames(self, pdbId, authAsymId):
        try:
            return list(
                set([tup[1] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopSccsNames(self, pdbId, authAsymId):
        try:
            return list(
                set([tup[2] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopResidueRanges(self, pdbId, authAsymId):
        try:
            return [(tup[0], tup[1], tup[2], tup[3][0], tup[3][1], tup[3][2])
                    for tup in self.__pdbD[(pdbId, authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopName(self, sunId):
        try:
            return self.__nD[sunId]
        except Exception:
            logger.debug("Undefined SCOP sunId %r", sunId)
        return None

    def getIdLineage(self, sunId):
        pList = []
        try:
            pList.append(sunId)
            pt = self.__pD[sunId]
            while (pt is not None) and (pt != 0):
                pList.append(pt)
                pt = self.__pD[pt]
        except Exception as e:
            logger.exception("Failing for %r with %s", sunId, str(e))
        #
        pList.reverse()
        return pList
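
    # Lineage walk sketch using the SCOPe 2.07 sample rows quoted in the parsing helpers
    # below (class 46456 -> fold 46457 -> superfamily 46458 -> family 46459):
    #
    #   scopP.getIdLineage(46459)
    #   # -> [46456, 46457, 46458, 46459]   (the root sunid 0 is excluded)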

    def getNameLineage(self, sunId):
        try:
            return [self.getScopName(cId) for cId in self.getIdLineage(sunId)]
        except Exception as e:
            logger.exception("Failing for %r with %s", sunId, str(e))
        return None

    def getTreeNodeList(self):
        return self.__exportTreeNodeList(self.__nD, self.__pD)

    #
    ###
    ###
    #
    def __reload(self, urlTarget, scopDirPath, useCache=True, version=None):
        nD, pD, pdbD = {}, {}, {}
        pyVersion = sys.version_info[0]
        scopDomainPath = os.path.join(scopDirPath,
                                      "scop_domains-py%s.pic" % str(pyVersion))
        self.__mU.mkdir(scopDirPath)
        #
        # scopDomainPath = os.path.join(scopDirPath, "scop_domains.json")
        #
        if useCache and self.__mU.exists(scopDomainPath):
            sD = self.__mU.doImport(scopDomainPath, fmt="pickle")
            logger.debug(
                "SCOPe name length %d parent length %d assignments %d",
                len(sD["names"]), len(sD["parents"]), len(sD["assignments"]))
            nD = sD["names"]
            pD = sD["parents"]
            pdbD = sD["assignments"]

        elif not useCache:
            ok = False
            minLen = 1000
            logger.info(
                "Fetch SCOPe name and domain assignment data using target URL %s",
                urlTarget)
            desL, claL, hieL = self.__fetchFromSource(urlTarget,
                                                      version=version)
            #
            nD = self.__extractDescription(desL)
            dmD = self.__extractAssignments(claL)
            pD = self.__extractHierarchy(hieL, nD)
            pdbD = self.__buildAssignments(dmD)
            logger.info("nD %d dmD %d pD %d", len(nD), len(dmD), len(pD))
            scopD = {"names": nD, "parents": pD, "assignments": pdbD}
            if (len(nD) > minLen) and (len(pD) > minLen) and (len(pdbD) >
                                                              minLen):
                ok = self.__mU.doExport(scopDomainPath, scopD, fmt="pickle")
            logger.debug("Cache save status %r", ok)
            #
        return nD, pD, pdbD

    def __fetchFromSource(self, urlTarget, version="2.07-2019-07-23"):
        """Fetch the classification names and domain assignments from SCOPe repo.
        #
                dir.des.scope.2.07-2019-03-07.txt
                dir.cla.scope.2.07-2019-03-07.txt
                dir.hie.scope.2.07-2019-03-07.txt
        """
        encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
        fn = "dir.des.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        desL = self.__mU.doImport(url,
                                  fmt="tdd",
                                  rowFormat="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(desL))
        #
        fn = "dir.cla.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        claL = self.__mU.doImport(url,
                                  fmt="tdd",
                                  rowFormat="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(claL))
        #
        fn = "dir.hie.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        hieL = self.__mU.doImport(url,
                                  fmt="tdd",
                                  rowFormat="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(hieL))
        #
        return desL, claL, hieL

    def __extractDescription(self, desL):
        """
        From  dir.des.scope.2.07-2019-03-07.txt:

        # dir.des.scope.txt
        # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07)  [File format version 1.02]
        # http://scop.berkeley.edu/
        # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about
        46456   cl      a       -       All alpha proteins
        46457   cf      a.1     -       Globin-like
        46458   sf      a.1.1   -       Globin-like
        46459   fa      a.1.1.1 -       Truncated hemoglobin
        46460   dm      a.1.1.1 -       Protozoan/bacterial hemoglobin

        116748  sp      a.1.1.1 -       Bacillus subtilis [TaxId: 1423]
        113449  px      a.1.1.1 d1ux8a_ 1ux8 A:
        46461   sp      a.1.1.1 -       Ciliate (Paramecium caudatum) [TaxId: 5885]
        14982   px      a.1.1.1 d1dlwa_ 1dlw A:
        100068  px      a.1.1.1 d1uvya_ 1uvy A:
        46462   sp      a.1.1.1 -       Green alga (Chlamydomonas eugametos) [TaxId: 3054]
        14983   px      a.1.1.1 d1dlya_ 1dly A:
        100067  px      a.1.1.1 d1uvxa_ 1uvx A:
        63437   sp      a.1.1.1 -       Mycobacterium tuberculosis, HbN [TaxId: 1773]
        164742  px      a.1.1.1 d2gkma_ 2gkm A:
        164743  px      a.1.1.1 d2gkmb_ 2gkm B:

        """
        nD = {}

        for fields in desL:
            if fields[1] in ["cl", "cf", "sf", "fa", "dm"]:
                nD[int(fields[0])] = str(fields[4]).strip()
        logger.debug("Length of name dictionary %d", len(nD))
        nD[0] = "root" if 0 not in nD else nD[0]

        return nD
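
    # Applied to the sample rows quoted above (only cl/cf/sf/fa/dm records are retained):
    #
    #   nD[46456] -> "All alpha proteins"
    #   nD[46459] -> "Truncated hemoglobin"
    #   nD[0]     -> "root"   (added explicitly when no root record is present)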

    def __extractAssignments(self, claL):
        """
        returns:

            aD[sunId] = [(), ... ]
        From dir.cla.scope.2.07-2019-03-07.txt:

        # dir.cla.scope.txt
        # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07)  [File format version 1.02]
        # http://scop.berkeley.edu/
        # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about
        #
        old_sunId                  sccs  sunid
        d1ux8a_ 1ux8    A:      a.1.1.1 113449  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=116748,px=113449
        d1dlwa_ 1dlw    A:      a.1.1.1 14982   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46461,px=14982
        d1uvya_ 1uvy    A:      a.1.1.1 100068  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46461,px=100068
        d1dlya_ 1dly    A:      a.1.1.1 14983   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=14983
        d1uvxa_ 1uvx    A:      a.1.1.1 100067  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=100067
        d2gkma_ 2gkm    A:      a.1.1.1 164742  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164742
        d2gkmb_ 2gkm    B:      a.1.1.1 164743  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164743
        d2gl3a_ 2gl3    A:      a.1.1.1 164754  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164754
        d2gl3b_ 2gl3    B:      a.1.1.1 164755  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164755
        d1idra_ 1idr    A:      a.1.1.1 62301   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=62301
        d1idrb_ 1idr    B:      a.1.1.1 62302   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=62302
        d1rtea_ 1rte    A:      a.1.1.1 105096  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=105096

        """
        dmD = {}
        logger.info("Length of class list %d", len(claL))
        rng = rngL = tL = None
        for fields in claL:
            try:
                rngL = str(fields[2]).strip().split(",")
                # dmTupL = [(tt[0], tt[1]) for tt in for rng.split(":") in rngL]
                #
                dmTupL = []
                for rng in rngL:
                    tL = [t for t in str(rng).strip().split(":") if len(t)]
                    if len(tL) > 1:
                        rL = tL[1].split("-")
                        tt = (tL[0], rL[0], rL[1])
                    else:
                        tt = (tL[0], None, None)

                    dmTupL.append(tt)
                #
                # Get the sid of the domain  -
                #
                sfL = str(fields[5]).strip().split(",")
                dmfL = sfL[4].split("=")
                dmf = int(dmfL[1])

                #                                         old domid      sccs    sunid for domain assignment
                dmD[int(fields[4])] = (fields[1], dmTupL, fields[0], fields[3],
                                       dmf)
                #
            except Exception as e:
                logger.exception(
                    "Failing fields %r rngL %r rng %r tL %r with %s", fields,
                    rngL, rng, tL, str(e))

        #
        #
        logger.info("Length of domain assignments %d", len(dmD))
        return dmD

    def __buildAssignments(self, dmD):
        """
        Input internal data structure with domain assignments -

        dmD[sunId] = (pdbId, [(authAsymId, begRes, endRes), ...], domain_name, sccs, sid_domain_assigned)

        Returns:

           aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))]


        """
        pdbD = {}
        for _, dTup in dmD.items():
            for rTup in dTup[1]:
                pdbD.setdefault((dTup[0], rTup[0]), []).append(
                    (dTup[4], dTup[2], dTup[3], rTup))
        return pdbD

    def __extractHierarchy(self, hieL, nD):
        """
        From dir.hie.scope.2.07-2019-03-07.txt:

        # dir.hie.scope.txt
        # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07)  [File format version 1.01]
        # http://scop.berkeley.edu/
        # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about
        0       -       46456,48724,51349,53931,56572,56835,56992,57942,58117,58231,58788,310555
        46456   0       46457,46556,46625,46688,46928,46954,46965,46996,47004,47013,47026,47039,47044,47049,47054,47059,47071,...,...
        46457   46456   46458,46548
        46458   46457   46459,46463,46532,74660,191420
        46459   46458   46460,190322

        """
        pD = {}
        logger.debug("Length of input hierarchy list %d", len(hieL))
        for fields in hieL:
            chId = int(fields[0])
            #
            if chId not in nD:
                continue
            pId = int(fields[1]) if fields[1].isdigit() else None
            pD[chId] = pId
        #
        logger.info("Length of domain parent dictionary %d", len(pD))
        return pD

    def __exportTreeNodeList(self, nD, pD):
        """Create node list from the SCOPe (sunid) parent and name/description dictionaries.

        Exclude the root node from the tree.

        """
        #
        rootId = 0
        pL = [rootId]
        logger.info("nD %d pD %d", len(nD), len(pD))
        # create child dictionary
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.debug("cD %d", len(cD))
        #
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.warning("No children for scop tId %r", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = nD[tId] if tId in nD else None
            ptId = pD[tId] if tId in pD else None
            lL = self.getIdLineage(tId)[1:]
            #
            # d = {'id': str(tId), 'name': displayName, 'lineage': [str(t) for t in lL], 'parents': [str(ptId)], 'depth': len(lL)}
            if tId == rootId:
                continue
            elif ptId == rootId:
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                dD = {
                    "id": str(tId),
                    "name": displayName,
                    "parents": [str(ptId)],
                    "depth": len(lL)
                }
            dL.append(dD)

        return dL
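

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): a standalone rework of the
# dir.cla.scope record parsing done by __extractAssignments() and the
# per-(pdbId, authAsymId) inversion done by __buildAssignments(), applied to one
# of the lines quoted in the docstrings above.  The helper name is hypothetical.
def _demo_parse_scope_cla(line):
    fields = line.split()
    # fields: sid, pdbId, region(s), sccs, sunId, ancestor assignments
    dmTupL = []
    for rng in fields[2].split(","):
        tL = [t for t in rng.strip().split(":") if t]
        if len(tL) > 1:
            rL = tL[1].split("-")
            dmTupL.append((tL[0], rL[0], rL[1]))
        else:
            dmTupL.append((tL[0], None, None))
    # sunId of the domain node (dm=...) taken from the ancestor list
    dmf = int(fields[5].split(",")[4].split("=")[1])
    return {int(fields[4]): (fields[1], dmTupL, fields[0], fields[3], dmf)}


claLine = "d1ux8a_ 1ux8 A: a.1.1.1 113449 cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=116748,px=113449"
dmD = _demo_parse_scope_cla(claLine)
print(dmD)  # {113449: ('1ux8', [('A', None, None)], 'd1ux8a_', 'a.1.1.1', 46460)}
# Invert to per-(pdbId, authAsymId) assignments as __buildAssignments() does:
pdbD = {}
for _, dTup in dmD.items():
    for rTup in dTup[1]:
        pdbD.setdefault((dTup[0], rTup[0]), []).append((dTup[4], dTup[2], dTup[3], rTup))
print(pdbD)  # {('1ux8', 'A'): [(46460, 'd1ux8a_', 'a.1.1.1', ('A', None, None))]}
# ---------------------------------------------------------------------------
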
class ReferenceSequenceUtils(object):
    """Selected utilities to integrate reference sequence information with PDB polymer entity data."""
    def __init__(self, cfgOb, refDbName, **kwargs):
        self.__cfgOb = cfgOb
        self.__refDbName = refDbName
        self.__mU = MarshalUtil()
        #
        self.__refIdList = self.__getReferenceAssignments(refDbName, **kwargs)
        self.__refD, self.__matchD = self.__rebuildCache(
            refDbName, self.__refIdList, **kwargs)

    def __getReferenceAssignments(self, refDbName, **kwargs):
        """Get all accessions assigned to input reference sequence database"""
        rL = []
        exdbDirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", None)
        useCache = kwargs.get("useCache", True)
        entryLimit = kwargs.get("entryLimit", None)

        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=exdbDirPath,
                                         useCache=useCache,
                                         cacheKwargs=cacheKwargs,
                                         entryLimit=entryLimit)
            eCount = epe.getEntryCount()
            rL = epe.getRefSeqAccessions(refDbName)
            logger.info(
                "Reading polymer entity cache with repository entry count %d ref accession length %d ",
                eCount, len(rL))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rL

    def __rebuildCache(self, refDbName, idList, **kwargs):
        """ """
        dD = {}
        dirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)

        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
        else:
            dD = self.__fetchReferenceEntries(refDbName,
                                              idList,
                                              saveText=saveText,
                                              fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)

        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self,
                                refDbName,
                                idList,
                                saveText=False,
                                fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList),
                        refDbName)
            if refDbName == "UNP":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {
                    "refDbName": refDbName,
                    "refDbCache": refD,
                    "matchInfo": matchD
                }

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD

    def __dumpEntries(self, refD):
        for (eId, eDict) in refD.items():
            logger.info("------ Entry id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary:  %d secondary: %d none %d", numPrimary,
                     numSecondary, numNone)
        return numPrimary, numSecondary, numNone
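

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the class above).  It assumes a
# ConfigUtil-style configuration object `cfgOb` and a populated entity polymer
# cache are available, neither of which is defined in this excerpt:
#
#   rsu = ReferenceSequenceUtils(cfgOb, "UNP", exdbDirPath="./CACHE", cacheKwargs={"fmt": "pickle"})
#   numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary()
#
# The primary/secondary/unmatched tally computed by getReferenceAccessionAlignSummary()
# can also be expressed with collections.Counter over the match dictionary:
import collections


def summarize_matches(matchD):
    # matchD values carry a "matched" flag of "primary", "secondary", or something else
    counts = collections.Counter(mD.get("matched") for mD in matchD.values())
    numOther = sum(counts.values()) - counts["primary"] - counts["secondary"]
    return counts["primary"], counts["secondary"], numOther


print(summarize_matches({"P69905": {"matched": "primary"}, "P68871": {"matched": "secondary"}, "P00000": {"matched": None}}))
# (1, 1, 1)
# ---------------------------------------------------------------------------
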
Example #22
class PfamProvider(StashableBase):
    """Manage an index of Pfam identifier to description mappings."""
    def __init__(self, **kwargs):
        urlTargetPfam = kwargs.get(
            "urlTargetPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
        )
        urlTargetPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
        self.__version = "34.0"
        dirName = "pfam"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, dirName)
        super(PfamProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__pfamD = self.__rebuildCache(urlTargetPfam, urlTargetPfamFB,
                                           dirPath, useCache)

        urlTargetMapPfam = kwargs.get(
            "urlTargetMapPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz"
        )
        urlTargetMapPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
        self.__pfamMapD = self.__rebuildMappingCache(urlTargetMapPfam,
                                                     urlTargetMapPfamFB,
                                                     dirPath, useCache)

    def getVersion(self):
        return self.__version

    def getDescription(self, pfamId):
        """Return the description for the input Pfam identifier

        Args:
            pfamId (str): Pfam identifier

        Returns:
            str: text description of the Pfam domain
        """
        descr = None
        try:
            descr = self.__pfamD[pfamId]
        except Exception:
            pass
        return descr

    def getMapping(self, pdbId):
        """Return the list of Pfam domain assignments for the input PDB identifer along with
        residue level mapping information

        Args:
            pdbId (str): PDB identifier

        Returns:
            list: [{'pfamId': , 'authAsymId":  , 'authSeqBeg': , 'authSeqEnd': 'insertBeg': , 'insertEnd': }, {}, ]
        """
        mapL = []
        try:
            mapL = self.__pfamMapD[pdbId.upper()]
        except Exception:
            pass
        return mapL

    def testCache(self):
        # Check length ...
        logger.info("Length PfamD %d", len(self.__pfamD))
        return (len(self.__pfamD) > 19000) and (len(self.__pfamMapD) > 150000)

    #
    def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                       useCache):
        pfamD = {}
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam data length %d", len(pfamD))
        elif not useCache:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamIndex(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                        ok)
            # ------
        #
        return pfamD

    def __getPfamIndex(self, filePath):
        """Parse annotation classifications
        #
        """
        pfamD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath,
                                  fmt="tdd",
                                  rowFormat="list",
                                  **encodingD)
        for row in rowL:
            try:
                pfamId = row[0].strip().upper()
                idCode = row[3].strip()
                descr = row[4].strip()
                pfamD[pfamId] = descr + " (" + idCode + ")"
            except Exception:
                pass
        #
        return pfamD
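
    # For example, a hypothetical Pfam-A.clans.tsv row
    # ["PF00042", "CL0090", "Globin", "Globin", "Globin"] would be indexed above
    # as pfamD["PF00042"] = "Globin (Globin)".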

    def __rebuildMappingCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                              useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        pfamDataPath = os.path.join(dirPath, "pfam-mapping-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(pfamDataPath):
            pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
            logger.debug("Pfam mapping data length %d", len(pfamD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetPfam,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
            ok = fU.get(urlTargetPfam, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
                ok = fU.get(urlTargetPfamFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            pfamD = self.__getPfamMapping(fp)
            ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
            logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                        ok)
            # ------
        #
        return pfamD

    def __getPfamMapping(self, filePath):
        """Parse mapping data"""
        pFamMapD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath,
                                  fmt="tdd",
                                  rowFormat="list",
                                  **encodingD)
        for row in rowL:
            try:
                pdbId = row[2].strip().upper()
                pfamId = row[3].strip().upper()
                authAsymId = row[5].strip()
                authSeqBeg = int(row[6].strip())
                insertBeg = row[7].strip() if row[7].strip() != "NULL" else None
                authSeqEnd = int(row[8].strip())
                insertEnd = row[9].strip() if row[9].strip() != "NULL" else None
                pFamMapD.setdefault(pdbId, []).append({
                    "pfamId": pfamId,
                    "authAsymId": authAsymId,
                    "authSeqBeg": authSeqBeg,
                    "authSeqEnd": authSeqEnd,
                    "insertBeg": insertBeg,
                    "insertEnd": insertEnd,
                })
            except Exception as e:
                logger.exception("Failing with %r %s", row, str(e))
        #
        logger.info("Pfam mapping data for (%d) entries", len(pFamMapD))
        return pFamMapD
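

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of PfamProvider above): how a single row of the
# pdb_pfamA_reg.txt mapping file becomes the per-entry record returned by
# getMapping().  Column positions (2=pdbId, 3=pfamId, 5=authAsymId, 6/8=author
# residue limits, 7/9=insertion codes) follow the indexing in __getPfamMapping();
# the row values below are invented for illustration.
def _demo_pfam_mapping_row(row):
    insertBeg = row[7].strip() if row[7].strip() != "NULL" else None
    insertEnd = row[9].strip() if row[9].strip() != "NULL" else None
    return row[2].strip().upper(), {
        "pfamId": row[3].strip().upper(),
        "authAsymId": row[5].strip(),
        "authSeqBeg": int(row[6].strip()),
        "authSeqEnd": int(row[8].strip()),
        "insertBeg": insertBeg,
        "insertEnd": insertEnd,
    }


demoRow = ["1", "12345", "1abc", "PF00042", "1", "A", "7", "NULL", "146", "NULL"]
pdbId, mapD = _demo_pfam_mapping_row(demoRow)
print(pdbId, mapD)
# 1ABC {'pfamId': 'PF00042', 'authAsymId': 'A', 'authSeqBeg': 7, 'authSeqEnd': 146, 'insertBeg': None, 'insertEnd': None}
# ---------------------------------------------------------------------------
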
class Scop2ClassificationProvider(StashableBase):
    """Extract SCOP2 domain assignments, term descriptions and SCOP classification hierarchy
    from SCOP and SCOP2B flat files.
    """
    def __init__(self, cachePath, useCache, **kwargs):
        #
        _ = kwargs
        self.__cachePath = cachePath
        dirName = "scop2"
        self.__dirPath = os.path.join(self.__cachePath, dirName)
        self.__useCache = useCache
        super(Scop2ClassificationProvider,
              self).__init__(self.__cachePath, [dirName])
        #
        self.__version = "latest"
        self.__fmt = "pickle"
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload(
            useCache=self.__useCache, fmt=self.__fmt)
        #
        if not useCache and not self.testCache():
            ok = self.__fetchFromBackup()
            if ok:
                self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload(
                    useCache=True, fmt=self.__fmt)
        #

    def testCache(self):
        logger.info(
            "SCOP2 lengths nD %d pAD %d pBD %d pBRootD %d fD %d sfD %d sf2bD %d",
            len(self.__nD), len(self.__pAD), len(self.__pBD),
            len(self.__pBRootD), len(self.__fD), len(self.__sfD),
            len(self.__sf2bD))
        if (len(self.__nD) > 9000) and (len(self.__pAD) > 70000):
            return True
        return False

    def getVersion(self):
        """Returns the SCOP2 version"""
        return self.__version

    def getFamilyIds(self, pdbId, authAsymId):
        try:
            return list(
                set([tup[1]
                     for tup in self.__fD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyIds(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    tup[1] for tup in self.__sfD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getFamilyNames(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    self.__nD[tup[1]]
                    for tup in self.__fD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyNames(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    self.__nD[tup[1]]
                    for tup in self.__sfD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getFamilyResidueRanges(self, pdbId, authAsymId):
        try:
            # s/fD.setdefault((pdbId, authAsymId), []).append((domSuperFamilyId, authAsymId, authSeqBeg, authSeqEnd))
            return [(tup[0], tup[1], tup[2], tup[3], tup[4])
                    for tup in self.__fD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyResidueRanges(self, pdbId, authAsymId):
        try:
            return [(tup[0], tup[1], tup[2], tup[3], tup[4])
                    for tup in self.__sfD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyNames2B(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    self.__nD[tup[1]]
                    for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyIds2B(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    tup[1] for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyResidueRanges2B(self, pdbId, authAsymId):
        try:
            return [(tup[0], tup[1], tup[2], tup[3], tup[4])
                    for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getName(self, domId):
        try:
            return self.__nD[domId]
        except Exception:
            logger.debug("Undefined SCOP2 id %r", domId)
        return None

    def getNameType(self, domId):
        qD = {
            "TP": "Protein Type",
            "CL": "Protein Class",
            "CF": "Fold",
            "SF": "Superfamily",
            "FA": "Family"
        }
        try:
            return qD[self.__ntD[domId]]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getIdLineage(self, domId):
        pS = set()
        try:
            pS.add(domId)
            pt = self.__pAD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pAD[pt]
            #
            pt = self.__pBD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pBD[pt]
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        #
        return sorted(pS)
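
    # For example, with the assignment record quoted in __extractDomainHierarchy()
    # below (TP=1, CL=1000003, CF=2001470, SF=3002524, FA=4004627),
    # getIdLineage("8045703") returns
    # ['1', '1000003', '2001470', '3002524', '4004627', '8045703'].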

    def getNameLineage(self, domId):
        try:
            nL = []
            for dId in self.getIdLineage(domId):
                tN = self.getName(dId)
                tN = tN if tN else "Unnamed"
                nL.append(tN)
            return nL
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        return None

    def getTreeNodeList(self):
        tnL = self.__exportTreeNodeList(self.__nD, self.__pAD, self.__pBRootD)
        return tnL

    def __getAssignmentFileName(self, fmt="json"):
        ext = "json" if fmt == "json" else "pic"
        fn = "scop2_domain_assignments.%s" % ext
        return fn

    def __reload(self, useCache=True, fmt="json"):
        nD = ntD = pAD = pBD = pBRootD = fD = sfD = sf2bD = {}
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        self.__mU.mkdir(self.__dirPath)
        #
        if useCache and self.__mU.exists(assignmentPath):
            sD = self.__mU.doImport(assignmentPath, fmt=fmt)
            logger.debug("Domain name count %d", len(sD["names"]))
            self.__version = sD["version"]
            nD = sD["names"]
            ntD = sD["nametypes"]
            pAD = sD["parentsType"]
            pBD = sD["parentsClass"]
            pBRootD = sD["parentsClassRoot"]
            fD = sD["families"]
            sfD = sD["superfamilies"]
            sf2bD = sD["superfamilies2b"]

        elif not useCache:
            nmL, dmL, scop2bL, _ = self.__fetchFromSource()
            #
            ok = False
            nD = self.__extractNames(nmL)
            logger.info("Domain name dictionary (%d)", len(nD))
            pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD = self.__extractDomainHierarchy(
                dmL)
            #
            logger.info("Domain node parent hierarchy (protein type) (%d)",
                        len(pAD))
            logger.info("Domain node parent hierarchy (structural class) (%d)",
                        len(pBD))
            logger.info(
                "Domain node parent hierarchy (structural class root) (%d)",
                len(pBRootD))
            logger.info("SCOP2 core domain assignments (family %d) (sf %d)",
                        len(fD), len(sfD))
            #
            sf2bD = self.__extractScop2bSuperFamilyAssignments(
                scop2bL, domToSfD)
            logger.info("SCOP2B SF domain assignments (%d)", len(sf2bD))
            #
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            vS = self.__version
            sD = {
                "version": vS,
                "created": tS,
                "names": nD,
                "nametypes": ntD,
                "parentsType": pAD,
                "parentsClass": pBD,
                "parentsClassRoot": pBRootD,
                "families": fD,
                "superfamilies": sfD,
                "superfamilies2b": sf2bD
            }
            ok = self.__mU.doExport(assignmentPath, sD, fmt=fmt, indent=3)
            logger.info("Cache save status %r", ok)
            #
        return nD, ntD, pAD, pBD, pBRootD, fD, sfD, sf2bD

    def __fetchFromBackup(self, fmt="json"):
        urlTarget = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP2"
        #
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        urlPath = os.path.join(urlTarget, fn)
        self.__mU.mkdir(self.__dirPath)
        #
        logger.info("Using backup URL %r", urlPath)
        fU = FileUtil()
        ok = fU.get(urlPath, assignmentPath)
        return ok

    def __fetchFromSource(self):
        """Fetch the classification names and domain assignments from SCOP2 and SCOP2B resources.

        SCOP2 domain names:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-des-latest.txt

        SCOP2 domain hierarchy:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt

        SIFTS extrapolated SCOP2 and SCOP2B assignments:
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2b_sf_uniprot.tsv.gz
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2_uniprot.tsv.gz

        """
        urlTargetScop2 = "https://scop.mrc-lmb.cam.ac.uk/files"
        encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
        fn = "scop-des-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        desL = self.__mU.doImport(url,
                                  fmt="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(desL))
        #
        fn = "scop-cla-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        claL = self.__mU.doImport(url,
                                  fmt="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(claL))
        #
        headerLines = self.__mU.doImport(url,
                                         fmt="list",
                                         uncomment=False,
                                         encoding=encoding)
        self.__version = headerLines[0].split(
            " ")[3] if headerLines else "2021-05-27"
        # JDW note cert issues with this site
        urlTargetSifts = "http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv"
        fn = "pdb_chain_scop2b_sf_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2bL = self.__mU.doImport(url,
                                     fmt="tdd",
                                     rowFormat="dict",
                                     uncomment=True,
                                     encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(scop2bL))
        #
        fn = "pdb_chain_scop2_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2L = self.__mU.doImport(url,
                                    fmt="tdd",
                                    rowFormat="dict",
                                    uncomment=True,
                                    encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(scop2bL))
        #
        return desL, claL, scop2bL, scop2L

    def __extractNames(self, nmL):
        """ """
        rD = {}
        logger.info("Length of input name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split(" ")
            rD[ff[0]] = " ".join(ff[1:])
        # self.__mU.doExport(os.path.join(self.__dirPath, "scop2-names.json"), rD, fmt="json", indent=3)
        return rD

    def __extractDomainHierarchy(self, dmL):
        """Extract the domain node identifier hierarchy from the SCOP2 representative assignment file ...

        Returns:
            dict, dict, dict, dict, dict, dict, dict: parent dictionaries (type-rooted, class-rooted, class-root-only),
                                          name type dictionary, family and superfamily assignments, and
                                          domain to superfamily mapping

            ntD[domainId] = name type TP=protein type, CL=protein class, CF=fold, SF=superfamily, FA=family
            pD[child domain identifier] = parent domain identifier
            fD[(pdbId, authAsymId)] = [(faDomId, faId, authAsymId, resBeg, resEnd),]
            sfD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),]
            domToSfD[domSfid] = sfId

        Example assignment file:

        # SCOP release 2021-05-27
        # http://scop.mrc-lmb.cam.ac.uk
        # based on PDB release 2021-05-14
        # based on UniProt realese 2021-04-08
        # based on SIFTS release 2021-05-19
        # FA-DOMID FA-PDBID FA-PDBREG FA-UNIID FA-UNIREG SF-DOMID SF-PDBID SF-PDBREG SF-UNIID SF-UNIREG SCOPCLA
        8045703 3H8D C:1143-1264 Q64331 1143-1264 8091604 3H8D C:1143-1264 Q64331 1143-1264 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627
        8094330 6J56 A:1158-1282 Q9UM54 1167-1291 8094331 6J56 A:1158-1282 Q9UM54 1167-1291 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627
        #

        """
        # Build the parent dictionary and name node type
        ntD = {}
        pAD = {}
        pBD = {}
        pBRootD = {}
        fD = {}
        sfD = {}
        domToSfD = {}
        #
        logger.info("Length of input domain assignment list %d", len(dmL))
        for dm in dmL:
            try:
                ff = dm.split(" ")
                domFamilyId = ff[0]
                domSuperFamilyId = ff[5]
                rngL = ff[10].split(",")
                tD = {}
                for rng in rngL:
                    tL = rng.split("=")
                    tD[tL[0]] = tL[1]
                #
                # -
                # pD[tD["TP"]] = 0
                # pD[tD["CL"]] = tD["TP"]
                # pD[tD["CF"]] = tD["CL"]
                # pD[tD["SF"]] = tD["CF"]
                # pD[tD["FA"]] = tD["SF"]
                # pD[domFamilyId] = tD["FA"]
                # pD[domSuperFamilyId] = tD["SF"]
                #
                #  Represent as two trees separately rooted in protein type  and structural class
                pAD[tD["TP"]] = 0
                pAD[tD["CF"]] = tD["TP"]
                pAD[tD["SF"]] = tD["CF"]
                pAD[tD["FA"]] = tD["SF"]
                pAD[domFamilyId] = tD["FA"]
                pAD[domSuperFamilyId] = tD["SF"]
                #
                # Use this complete pBD here only for generating ID lineages, but NOT for merging with pAD
                pBD[tD["CL"]] = 0
                pBD[tD["CF"]] = tD["CL"]
                pBD[tD["SF"]] = tD["CF"]
                pBD[tD["FA"]] = tD["SF"]
                pBD[domFamilyId] = tD["FA"]
                pBD[domSuperFamilyId] = tD["SF"]
                #
                # Use pBRootD for creating tree node lists; Don't capture any lower branches to avoid re-creating redundant key:values already in pAD
                pBRootD[tD["CL"]] = 0
                pBRootD[tD["CF"]] = tD["CL"]
                #
                ntD[tD["FA"]] = "FA"
                ntD[tD["SF"]] = "SF"
                ntD[tD["CF"]] = "CF"
                ntD[tD["CL"]] = "CL"
                ntD[tD["TP"]] = "TP"
                #
                pdbId = ff[1]
                authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment(
                    ff[2])
                if authAsymId is not None:
                    fD.setdefault((pdbId, authAsymId), []).append(
                        (domFamilyId, tD["FA"], authAsymId, authSeqBeg,
                         authSeqEnd))
                pdbId = ff[6]
                authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment(
                    ff[7])
                if authAsymId is not None:
                    sfD.setdefault((pdbId, authAsymId), []).append(
                        (domSuperFamilyId, tD["SF"], authAsymId, authSeqBeg,
                         authSeqEnd))
                #
                domToSfD[domSuperFamilyId] = tD["SF"]
            except Exception as e:
                logger.exception("Failing for case %r: %s", dm, str(e))
        #
        logger.info("pAD (%d) pBD (%d) pBRootD (%d) ntD (%d)", len(pAD),
                    len(pBD), len(pBRootD), len(ntD))
        logger.info("fD (%d) sfD (%d)", len(fD), len(sfD))
        return pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD
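
    # Applied to the first assignment record quoted in the docstring above, this yields:
    #   fD[("3H8D", "C")]  = [("8045703", "4004627", "C", 1143, 1264)]
    #   sfD[("3H8D", "C")] = [("8091604", "3002524", "C", 1143, 1264)]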

    def __parseAssignment(self, tS):
        authAsymId = authSeqBeg = authSeqEnd = None
        try:
            fL = tS.split(":")
            authAsymId = fL[0]
            rS = fL[1]
            if rS[0] == "-":
                authSeqBeg = -int(rS[1:].split("-")[0])
                authSeqEnd = int(rS[1:].split("-")[1])
            else:
                authSeqBeg = int(rS.split("-")[0])
                authSeqEnd = int(rS.split("-")[1])
        except Exception:
            pass
        return authAsymId, authSeqBeg, authSeqEnd

    def __extractScop2bSuperFamilyAssignments(self, scop2bL, domToSfD):
        """
        Extract the SCOP2B SIFTS superfamily domain assignments for PDB structure entries.

        Returns:

         aD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),]

        Example:

        # 2021/06/12 - 05:52 | PDB: 23.21 | UniProt: 2021.03
          PDB     CHAIN   SF_DOMID        SP_PRIMARY      RES_BEG RES_END PDB_BEG PDB_END SP_BEG  SP_END
          5id7    B       8033045 P02768  197     388     197     388     221     412
          1o9x    A       8033045 P02768  197     388     197     388     221     412
        """
        sfD = {}
        try:
            for rowD in scop2bL:
                if rowD["SF_DOMID"] in domToSfD:
                    sfD.setdefault(
                        (rowD["PDB"].upper(), rowD["CHAIN"]), []).append(
                            (rowD["SF_DOMID"], domToSfD[rowD["SF_DOMID"]],
                             rowD["CHAIN"], rowD["PDB_BEG"], rowD["PDB_END"]))
                else:
                    logger.warning("Missing SCOP2B SF ID mapping for %r",
                                   rowD["SF_DOMID"])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return sfD

    def __exportTreeNodeList(self, nD, pAD, pBRootD):
        """Create node list from the SCOP2 parent and name/description dictionaries.

        Exclude the root node from the tree.

        """
        #
        rootId = 0
        pL = [rootId]
        #
        logger.info("nD %d pAD %d pBRootD %d pL %r", len(nD), len(pAD),
                    len(pBRootD), pL)
        # create child dictionary
        cD = {}
        for ctId, ptId in pAD.items():
            cD.setdefault(ptId, []).append(ctId)
        for ctId, ptId in pBRootD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.debug("cD %d", len(cD))
        #
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.warning("No children for scop tId %r", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = nD[tId] if tId in nD else None
            ptIdL = []
            if tId in pAD:
                ptIdL.append(pAD[tId])
            if tId in pBRootD:
                ptIdL.append(pBRootD[tId])
            lL = self.getIdLineage(tId)[1:]
            #
            # d = {'id': str(tId), 'name': displayName, 'lineage': [str(t) for t in lL], 'parents': [str(ptId)], 'depth': len(lL)}
            if tId == rootId:
                continue
            elif any([ptId == rootId for ptId in ptIdL]):
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                displayName = displayName if displayName else "Domain %s" % str(
                    tId)
                dD = {
                    "id": str(tId),
                    "name": displayName,
                    "parents": ptIdL,
                    "depth": len(lL)
                }
            dL.append(dD)

        return dL
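

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of Scop2ClassificationProvider above): a
# standalone version of the "CHAIN:BEG-END" region parsing performed by
# __parseAssignment(), including the special case of a negative author begin
# residue.  The helper name is hypothetical.
def _demo_parse_scop2_region(tS):
    authAsymId = authSeqBeg = authSeqEnd = None
    try:
        fL = tS.split(":")
        authAsymId = fL[0]
        rS = fL[1]
        if rS[0] == "-":
            authSeqBeg = -int(rS[1:].split("-")[0])
            authSeqEnd = int(rS[1:].split("-")[1])
        else:
            authSeqBeg = int(rS.split("-")[0])
            authSeqEnd = int(rS.split("-")[1])
    except Exception:
        pass
    return authAsymId, authSeqBeg, authSeqEnd


print(_demo_parse_scop2_region("C:1143-1264"))  # ('C', 1143, 1264)
print(_demo_parse_scop2_region("A:-5-120"))     # ('A', -5, 120)
print(_demo_parse_scop2_region("B:"))           # ('B', None, None)
# ---------------------------------------------------------------------------
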
Example #24
class CitationExtractor(object):
    """Utilities to extract citation related data from the core_entry collection."""

    def __init__(self, cfgOb, **kwargs):
        self.__cfgOb = cfgOb
        self.__resourceName = "MONGO_DB"
        self.__databaseName = "pdbx_core"
        self.__collectionName = "pdbx_core_entry"
        #
        self.__mU = MarshalUtil()
        #
        self.__entryD = self.__rebuildCache(**kwargs)
        self.__idxD = self.__buildIndices(self.__entryD)
        #

    def __rebuildCache(self, **kwargs):
        useCache = kwargs.get("useCache", True)
        dirPath = kwargs.get("exdbDirPath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "entry-citation-extracted-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)

        cD = {"entryD": {}}
        try:
            if useCache and cacheFilePath and os.access(cacheFilePath, os.R_OK):
                logger.info("Using cached entry citation file %s", cacheFilePath)
                cD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            else:
                entryD = self.__extractCitations()
                cD["entryD"] = entryD
                if cacheFilePath:
                    ok = self.__mU.mkdir(dirPath)
                    ok = self.__mU.doExport(cacheFilePath, cD, **cacheKwargs)
                    logger.info("Saved entry citation results (%d) status %r in %s", len(entryD), ok, cacheFilePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return cD["entryD"]

    def __buildIndices(self, entryD):
        """
        Example:
         "entryD": {
                      "5KAL": {
                         "citation": [
                            {
                               "country": "UK",
                               "id": "primary",
                               "journal_abbrev": "Nucleic Acids Res.",
                               "journal_id_ASTM": "NARHAD",
                               "journal_id_CSD": "0389",
                               "journal_id_ISSN": "1362-4962",
                               "journal_volume": "44",
                               "page_first": "10862",
                               "page_last": "10878",
                               "title": "RNA Editing TUTase 1: structural foundation of substrate recognition, complex interactions and drug targeting.",
                               "year": 2016,
                               "pdbx_database_id_DOI": "10.1093/nar/gkw917",
                               "pdbx_database_id_PubMed": 27744351,
                               "rcsb_authors": [
                                  "Rajappa-Titu, L.",
                                  "Suematsu, T.",
                                  "Munoz-Tello, P.",
                                  "Long, M.",
                                  "Demir, O.",
                                  "Cheng, K.J.",
                                  "Stagno, J.R.",
                                  "Luecke, H.",
                                  "Amaro, R.E.",
                                  "Aphasizheva, I.",
                                  "Aphasizhev, R.",
                                  "Thore, S."
                               ]
                            }
                         ],
                         "_entry_id": "5KAL"
                      },
        """
        indD = {}
        missingCitationCount = 0
        missingJournalName = 0
        numPubMed = 0
        numDOI = 0
        numCitations = 0
        mD = {}
        issnD = {}
        missingISSNCount = 0
        missingPubMedCount = 0
        try:
            for entryId, eD in entryD.items():
                cDL = eD["citation"] if "citation" in eD else None
                if cDL:
                    for cD in cDL[:1]:
                        if cD and "journal_abbrev" in cD:
                            indD[cD["journal_abbrev"]] = indD[cD["journal_abbrev"]] + 1 if cD["journal_abbrev"] in indD else 1
                        else:
                            logger.info("Missing journal name in entryId %s %r ", entryId, cD)
                            missingJournalName += 1
                        if cD and "pdbx_database_id_DOI" in cD:
                            numDOI += 1

                        if cD and "pdbx_database_id_PubMed" in cD:
                            numPubMed += 1
                        else:
                            mD[cD["journal_abbrev"]] = mD[cD["journal_abbrev"]] + 1 if cD["journal_abbrev"] in mD else 1
                            missingPubMedCount += 1

                        if "journal_id_ISSN" in cD and len(cD["journal_id_ISSN"]) > 7:
                            issnD[cD["journal_id_ISSN"]] = issnD[cD["journal_id_ISSN"]] + 1 if cD["journal_id_ISSN"] in issnD else 1
                        else:
                            missingISSNCount += 1

                        if cD:
                            numCitations += 1
                else:
                    missingCitationCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        logger.info("Number of citatons %d", numCitations)
        logger.info("Number of PubMed ids %d", numPubMed)
        logger.info("Number of DOIs %d", numDOI)
        logger.info("No citation category count %d missing journal name %d", missingCitationCount, missingJournalName)
        #
        logger.info("Journal index name length %d", len(indD))
        # logger.info("Journal name length %r",indD.items())
        #
        logger.info("Missing pubmed index length %d", len(mD))
        logger.info("Missing pubmed length %d", missingPubMedCount)
        logger.info("Missing PubMed %r", mD.items())
        #
        logger.info("ISSN dictionary length %d", len(issnD))
        logger.info("ISSN missing length %d", missingISSNCount)
        #
        return indD

    def getEntryCount(self):
        return len(self.__entryD)

    def __extractCitations(self):
        """Test case - extract unique entity source and host taxonomies"""
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=self.__databaseName,
                collectionName=self.__collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entry",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=None,
                selectionQuery={},
                selectionList=["rcsb_id", "citation"],
            )
            eCount = obEx.getCount()
            logger.info("Entry count is %d", eCount)
            objD = obEx.getObjects()
            # for ky, eD in objD.items():
            #    logger.info("%s: %r", ky, eD)
            return objD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return {}
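

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of CitationExtractor above): the per-journal
# tally built by __buildIndices() over the primary citation of each entry can
# be written with collections.Counter.  The entry dictionary below is a made-up
# stand-in for the documents returned by ObjectExtractor.
import collections


def journal_counts(entryD):
    counter = collections.Counter()
    for eD in entryD.values():
        for cD in (eD.get("citation") or [])[:1]:  # primary citation only, as in __buildIndices()
            if cD and "journal_abbrev" in cD:
                counter[cD["journal_abbrev"]] += 1
    return counter


demoEntryD = {
    "5KAL": {"citation": [{"journal_abbrev": "Nucleic Acids Res."}]},
    "1ABC": {"citation": [{"journal_abbrev": "Nucleic Acids Res."}]},
    "2XYZ": {"citation": [{"journal_abbrev": "J.Mol.Biol."}]},
}
print(journal_counts(demoEntryD))
# Counter({'Nucleic Acids Res.': 2, 'J.Mol.Biol.': 1})
# ---------------------------------------------------------------------------
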
Example #25
class GlyGenProvider(StashableBase):
    """Fetch glycans and glycoproteins available in the GlyGen.org resource.

    GlyGen glycan link template -
          https://glygen.org/glycan/G28882EF

    Glycoprotein link template -
          https://www.glygen.org/protein/Q658T7
    """
    def __init__(self, **kwargs):
        #
        dirName = "glygen"
        cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(cachePath, dirName)
        super(GlyGenProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        baseUrl = kwargs.get(
            "glygenBasetUrl",
            "https://data.glygen.org/ln2data/releases/data/v-1.12.3/reviewed/")
        fallbackUrl = kwargs.get(
            "glygenFallbackUrl",
            "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/glygen/"
        )
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__glycanD = self.__reloadGlycans(baseUrl,
                                              fallbackUrl,
                                              self.__dirPath,
                                              useCache=useCache)
        self.__glycoproteinD = self.__reloadGlycoproteins(baseUrl,
                                                          fallbackUrl,
                                                          self.__dirPath,
                                                          useCache=useCache)

    def testCache(self, minGlycanCount=20000, minGlycoproteinCount=64000):
        #
        logger.info("GlyGen glycan list (%d) glycoprotein list (%d)",
                    len(self.__glycanD), len(self.__glycoproteinD))
        if (self.__glycanD and len(self.__glycanD) > minGlycanCount
                and self.__glycoproteinD and len(self.__glycoproteinD) > minGlycoproteinCount):
            return True
        return False

    def hasGlycan(self, glyTouCanId):
        try:
            return glyTouCanId in self.__glycanD
        except Exception:
            return False

    def hasGlycoprotein(self, uniProtId):
        try:
            return uniProtId in self.__glycoproteinD
        except Exception:
            return False

    def getGlycans(self):
        return self.__glycanD

    def getGlycoproteins(self):
        return self.__glycoproteinD

    def __reloadGlycans(self, baseUrl, fallbackUrl, dirPath, useCache=True):
        gD = {}
        logger.debug("Using dirPath %r", dirPath)
        self.__mU.mkdir(dirPath)
        #
        myDataPath = os.path.join(dirPath, "glygen-glycan-list.json")
        if useCache and self.__mU.exists(myDataPath):
            gD = self.__mU.doImport(myDataPath, fmt="json")
            logger.debug("GlyGen glycan data length %d", len(gD))
        elif not useCache:
            logger.debug(
                "Fetch GlyGen glycan data from primary data source %s",
                baseUrl)
            endPoint = os.path.join(baseUrl, "glycan_masterlist.csv")
            #
            logger.info("Fetch GlyGen glycan data from primary data source %s",
                        endPoint)
            rawPath = os.path.join(dirPath, "glycan_masterlist.csv")
            fU = FileUtil()
            ok = fU.get(endPoint, rawPath)
            logger.debug("Fetch GlyGen glycan data status %r", ok)
            if not ok:
                endPoint = os.path.join(fallbackUrl, "glycan_masterlist.csv")
                ok = fU.get(endPoint, rawPath)
                logger.info("Fetch fallback GlyGen glycan data status %r", ok)
            #
            if ok:
                gD = self.__parseGlycanList(rawPath)
                ok = self.__mU.doExport(myDataPath, gD, fmt="json")
                logger.info("Exported GlyGen glycan list (%d) (%r) %s",
                            len(gD), ok, myDataPath)
            #
        return gD

    def __parseGlycanList(self, filePath):
        gD = {}
        row = None
        try:
            rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list")
            logger.debug("Glycan list length (%d)", len(rowL))
            logger.debug("Row 0 %r", rowL[0])
            for row in rowL[1:]:
                gD[row[0]] = row[1]
        except Exception as e:
            logger.exception("Failing for %r (%r) with %s", filePath, row,
                             str(e))
        return gD

    def __reloadGlycoproteins(self,
                              baseUrl,
                              fallbackUrl,
                              dirPath,
                              useCache=True):
        gD = {}
        logger.debug("Using dirPath %r", dirPath)
        self.__mU.mkdir(dirPath)
        #
        myDataPath = os.path.join(dirPath, "glygen-glycoprotein-list.json")
        if useCache and self.__mU.exists(myDataPath):
            gD = self.__mU.doImport(myDataPath, fmt="json")
            logger.debug("GlyGen glycoprotein data length %d", len(gD))
        else:
            for fn in [
                    "sarscov1_protein_masterlist.csv",
                    "sarscov2_protein_masterlist.csv",
                    "hcv1b_protein_masterlist.csv",
                    "hcv1a_protein_masterlist.csv",
                    "human_protein_masterlist.csv",
                    "mouse_protein_masterlist.csv",
                    "rat_protein_masterlist.csv",
            ]:
                logger.debug(
                    "Fetch GlyGen glycoprotein data from primary data source %s",
                    baseUrl)
                endPoint = os.path.join(baseUrl, fn)
                #
                logger.debug(
                    "Fetch GlyGen glycoprotein data from primary data source %s",
                    endPoint)
                rawPath = os.path.join(dirPath, fn)
                fU = FileUtil()
                ok = fU.get(endPoint, rawPath)
                logger.debug("Fetch GlyGen glycoprotein data status %r", ok)
                if not ok:
                    endPoint = os.path.join(fallbackUrl, fn)
                    ok = fU.get(endPoint, rawPath)
                    logger.info("Fetch fallback GlyGen data status %r", ok)
                #
                if ok:
                    tD = self.__parseGlycoproteinList(rawPath)
                    gD.update(tD)
            #
            ok = self.__mU.doExport(myDataPath, gD, fmt="json")
            logger.info("Exported GlyGen glycoprotein list (%d) (%r) %s",
                        len(gD), ok, myDataPath)
        #
        return gD

    def __parseGlycoproteinList(self, filePath):
        gD = {}
        try:
            rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list")
            for row in rowL[1:]:
                ff = row[0].split("-")
                gD[ff[0]] = ff[1]
        except Exception as e:
            logger.exception("Failing for %r with %s", filePath, str(e))
        return gD
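

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of GlyGenProvider above): __parseGlycoproteinList()
# splits the first CSV column on "-", which implies that column holds an
# accession-isoform token (e.g. "Q658T7-1").  The header name and rows below are
# invented for illustration only.
def _demo_parse_glycoprotein_rows(rowL):
    gD = {}
    for row in rowL[1:]:  # skip the header row, as __parseGlycoproteinList() does
        ff = row[0].split("-")
        gD[ff[0]] = ff[1]
    return gD


demoRows = [["accession_isoform"], ["Q658T7-1"], ["P02768-1"]]
print(_demo_parse_glycoprotein_rows(demoRows))
# {'Q658T7': '1', 'P02768': '1'}
# ---------------------------------------------------------------------------
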
Example #26
class CathClassificationProvider(StashableBase):
    """Extract CATH domain assignments, term descriptions and CATH classification hierarchy
    from CATH flat files.
    """

    def __init__(self, **kwargs):
        #
        self.__dirName = "cath"
        if "cachePath" in kwargs:
            self.__cachePath = os.path.abspath(kwargs.get("cachePath", None))
            self.__cathDirPath = os.path.join(self.__cachePath, self.__dirName)
        else:
            self.__cathDirPath = kwargs.get("cathDirPath", ".")
            self.__cachePath, self.__dirName = os.path.split(os.path.abspath(self.__cathDirPath))
        super(CathClassificationProvider, self).__init__(self.__cachePath, [self.__dirName])
        #
        useCache = kwargs.get("useCache", True)
        urlTarget = kwargs.get("cathTargetUrl", "http://download.cathdb.info/cath/releases/daily-release/newest")
        urlFallbackTarget = kwargs.get("cathTargetUrl", "http://download.cathdb.info/cath/releases/daily-release/archive")
        # no trailing /
        urlBackupPath = kwargs.get("cathUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/CATH")
        #
        self.__mU = MarshalUtil(workPath=self.__cathDirPath)
        self.__nD, self.__pdbD = self.__reload(urlTarget, urlFallbackTarget, self.__cathDirPath, useCache=useCache)
        if not self.testCache() and not useCache:
            ok = self.__fetchFromBackup(urlBackupPath, self.__cathDirPath)
            if ok:
                self.__nD, self.__pdbD = self.__reload(urlTarget, urlFallbackTarget, self.__cathDirPath, useCache=True)
        #

    def testCache(self):
        logger.info("CATH lengths nD %d pdbD %d", len(self.__nD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pdbD) > 5000):
            return True
        return False

    def getCathVersions(self, pdbId, authAsymId):
        """aD[(pdbId, authAsymId)] = [(cathId, domainId, (authAsymId, resBeg, resEnd), version)]"""
        try:
            return list(set([tup[3] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))

        return []

    def getCathIds(self, pdbId, authAsymId):
        try:
            return list(set([tup[0] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))

        return []

    def getCathDomainNames(self, pdbId, authAsymId):
        try:
            return list(set([tup[1] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))

        return []

    def getCathResidueRanges(self, pdbId, authAsymId):
        try:
            return [(tup[0], tup[1], tup[2][0], tup[2][1], tup[2][2]) for tup in self.__pdbD[(pdbId, authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))

        return []

    def getCathName(self, cathId):
        try:
            return self.__nD[cathId]
        except Exception:
            logger.debug("Undefined CATH id %r", cathId)
        return None

    def getIdLineage(self, cathId):
        try:
            ff = cathId.split(".")
            return [".".join(ff[0:jj]) for jj in range(1, len(ff) + 1)]
        except Exception:
            logger.debug("No lineage for bad CATH id %r", cathId)
        return None
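
    # For example, getIdLineage("1.10.490.10") returns
    # ["1", "1.10", "1.10.490", "1.10.490.10"].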

    def getNameLineage(self, cathId):
        try:
            return [self.getCathName(cId) for cId in self.getIdLineage(cathId)]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def getTreeNodeList(self):
        return self.__exportTreeNodeList(self.__nD)

    def __getCathDomainFileName(self):
        pyVersion = sys.version_info[0]
        fn = "cath_domains-py%s.pic" % str(pyVersion)
        return fn

    def __reload(self, urlTarget, urlFallbackTarget, cathDirPath, useCache=True):
        nD = {}
        pdbD = {}
        fn = self.__getCathDomainFileName()
        cathDomainPath = os.path.join(cathDirPath, fn)
        self.__mU.mkdir(cathDirPath)
        #
        # cathDomainPath = os.path.join(cathDirPath, "cath_domains.json")
        #
        if useCache and self.__mU.exists(cathDomainPath):
            sD = self.__mU.doImport(cathDomainPath, fmt="pickle")
            logger.debug("Cath domain length %d", len(sD))
            nD = sD["names"]
            pdbD = sD["assignments"]
        elif not useCache:
            minLen = 1000
            logger.info("Fetch CATH name and domain assignment data from primary data source %s", urlTarget)
            nmL, dmL = self.__fetchFromSource(urlTarget, urlFallbackTarget, minLen)
            #
            ok = False
            nD = self.__extractNames(nmL)
            dD = self.__extractDomainAssignments(dmL)
            pdbD = self.__buildAssignments(dD)
            sD = {"names": nD, "assignments": pdbD}
            if (len(nD) > minLen) and (len(dD) > minLen):
                ok = self.__mU.doExport(cathDomainPath, sD, fmt="pickle")
            logger.debug("Cache save status %r", ok)
            #
        return nD, pdbD
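
    # Behavior note for the reload logic above: when useCache is True and the pickled
    # cache exists it is loaded; when useCache is False the data are refetched from the
    # primary source and re-cached; when useCache is True but the cache file is absent,
    # empty dictionaries are returned and recovery is left to the caller (presumably via
    # __fetchFromBackup()).  An illustrative forced rebuild might look like:
    #
    #   nD, pdbD = self.__reload(urlTarget, urlFallbackTarget, cathDirPath, useCache=False)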

    def __fetchFromBackup(self, urlBackupPath, cathDirPath):
        fn = self.__getCathDomainFileName()
        cathDomainPath = os.path.join(cathDirPath, fn)
        self.__mU.mkdir(cathDirPath)
        #
        backupUrl = urlBackupPath + "/" + fn
        logger.info("Using backup URL %r", backupUrl)
        fU = FileUtil()
        ok = fU.get(backupUrl, cathDomainPath)
        return ok

    def __fetchFromSource(self, urlTarget, urlFallbackTarget, minLen):
        """Fetch the classification names and domain assignments from CATH repo.

        http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-all.gz
        http://download.cathdb.info/cath/releases/daily-release/newest/cath-b-newest-names.gz
        #
        http://download.cathdb.info/cath/releases/daily-release/archive/cath-b-yyyymmdd-all.gz
        http://download.cathdb.info/cath/releases/daily-release/archive/cath-b-yyyymmdd-names-all.gz
        """
        fn = "cath-b-newest-names.gz"
        url = os.path.join(urlTarget, fn)
        nmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        if not nmL or len(nmL) < minLen:
            # retry with yesterday's dated archive file
            dS = datetime.strftime(datetime.now() - timedelta(1), "%Y%m%d")
            fn = "cath-b-%s-names-all.gz" % dS
            url = os.path.join(urlFallbackTarget, fn)
            logger.info("Using fallback resource for %s", fn)
            nmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        fn = "cath-b-newest-all.gz"
        url = os.path.join(urlTarget, fn)
        dmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        if not dmL or len(dmL) < minLen:
            # retry with yesterday's dated archive file
            dS = datetime.strftime(datetime.now() - timedelta(1), "%Y%m%d")
            fn = "cath-b-%s-all.gz" % dS
            url = os.path.join(urlFallbackTarget, fn)
            logger.info("Using fallback resource for %s", fn)
            dmL = self.__mU.doImport(url, fmt="list", uncomment=True)
        #
        return nmL, dmL
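
    # Illustrative sketch of the fallback file naming used above: when the "newest"
    # listing is missing or shorter than minLen, the loader retries with yesterday's
    # dated archive file (the date value below is illustrative only):
    #
    #   from datetime import datetime, timedelta
    #   dS = datetime.strftime(datetime.now() - timedelta(1), "%Y%m%d")  # e.g. "20240101"
    #   "cath-b-%s-names-all.gz" % dS  # -> "cath-b-20240101-names-all.gz"
    #   "cath-b-%s-all.gz" % dS        # -> "cath-b-20240101-all.gz"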

    def __extractNames(self, nmL):
        """
        From cath-b-newest-names:

            1 Mainly Alpha
            2 Mainly Beta
            3 Alpha Beta
            4 Few Secondary Structures
            1.10 Orthogonal Bundle
            1.20 Up-down Bundle
            1.25 Alpha Horseshoe
            1.40 Alpha solenoid
            1.50 Alpha/alpha barrel
            2.10 Ribbon
            2.20 Single Sheet
            2.30 Roll
            2.40 Beta Barrel
            2.50 Clam
            2.60 Sandwich
            2.70 Distorted Sandwich
            2.80 Trefoil
            2.90 Orthogonal Prism
            2.100 Aligned Prism
            2.102 3-layer Sandwich
        """
        rD = {}
        logger.info("length of input name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split(" ")
            rD[ff[0]] = " ".join(ff[1:])
        return rD
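
    # Worked example of the name parse above, using records from the docstring: each
    # line splits on the first whitespace token, so the resulting dictionary contains
    #
    #   rD["1"] == "Mainly Alpha"
    #   rD["2.102"] == "3-layer Sandwich"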

    def __extractDomainAssignments(self, dmL):
        """
        From cath-b-newest-all:

            101mA00 v4_2_0 1.10.490.10 0-153:A
            102lA00 v4_2_0 1.10.530.40 1-162:A
            102mA00 v4_2_0 1.10.490.10 0-153:A
            103lA00 v4_2_0 1.10.530.40 1-162:A
            103mA00 v4_2_0 1.10.490.10 0-153:A
            104lA00 v4_2_0 1.10.530.40 1-162:A
            104lB00 v4_2_0 1.10.530.40 1-162:B
            104mA00 v4_2_0 1.10.490.10 1-153:A
            105mA00 v4_2_0 1.10.490.10 1-153:A
            106mA00 v4_2_0 1.10.490.10 0-153:A
            107lA00 v4_2_0 1.10.530.40 1-162:A
            107mA00 v4_2_0 1.10.490.10 0-153:A
            108lA00 v4_2_0 1.10.530.40 1-162:A
            108mA00 v4_2_0 1.10.490.10 0-153:A
            109lA00 v4_2_0 1.10.530.40 1-162:A
            109mA00 v4_2_0 1.10.490.10 0-153:A
            10gsA01 v4_2_0 3.40.30.10 2-78:A,187-208:A

        Returns:

            dD[domainId] = (cathId, [(authAsymId, resBeg, resEnd), ...], version)

        """
        dD = {}
        logger.info("length of input domain assignment list %d", len(dmL))
        for dm in dmL:
            #
            try:
                ff = dm.split(" ")
                #
                rngL = ff[3].split(",")
                dmTupL = []
                for rng in rngL:
                    tL = rng.split(":")
                    rL = tL[0].split("-")
                    dmTupL.append((tL[1], rL[0], rL[1]))
                #
                dD[ff[0]] = (ff[2], dmTupL, ff[1])
            except Exception:
                logger.info("Failing for case %r: %r", ff, dm)
        return dD
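
    # Worked example of the assignment parse above for the multi-range docstring record
    # "10gsA01 v4_2_0 3.40.30.10 2-78:A,187-208:A":
    #
    #   dD["10gsA01"] == ("3.40.30.10", [("A", "2", "78"), ("A", "187", "208")], "v4_2_0")
    #
    # Residue boundaries are kept as strings; ranges containing negative author residue
    # numbers would not split cleanly on "-" with this approach.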

    def __buildAssignments(self, dD):
        """
          Input internal data structure with domain assignments -

          dD[domainId] = (cathId, rangelist, version)

          Returns:

        =
           aD[(pdbId, authAsymId)] = [(cathId, domainId, (authAsymId, resBeg, resEnd), version)]
        """
        pdbD = {}
        for domId, dTup in dD.items():
            pdbId = domId[:4]
            for rTup in dTup[1]:
                pdbD.setdefault((pdbId, rTup[0]), []).append((dTup[0], domId, rTup, dTup[2]))
        return pdbD
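
    # Continuing the worked example: the "10gsA01" domain parsed above contributes one
    # list element per residue range under the (pdbId, authAsymId) key:
    #
    #   pdbD[("10gs", "A")] == [("3.40.30.10", "10gsA01", ("A", "2", "78"), "v4_2_0"),
    #                           ("3.40.30.10", "10gsA01", ("A", "187", "208"), "v4_2_0")]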

    def __exportTreeNodeList(self, nD):
        """Create node list from name dictionary and lineage dictionaries."""
        # create parent dictionary
        #
        pL = []
        pD = {}
        for tId in nD:
            ff = tId.split(".")
            if len(ff) == 1:
                ptId = None
                pL.append(tId)
            else:
                ptId = ".".join(ff[:-1])
            logger.debug("tId %s parent %s", tId, ptId)
            pD[tId] = ptId
        #
        logger.info("nD %d pD %d", len(nD), len(pD))
        # create child dictionary
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.info("cD %d", len(cD))
        #
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.debug("No children for CATH tId %s", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = nD[tId]
            ptId = pD[tId]
            ff = tId.split(".")
            lL = [".".join(ff[0:jj]) for jj in range(1, len(ff) + 1)]
            #
            # d = {'id': tId, 'name': displayName, 'lineage': lL, 'parents': [ptId], 'depth': len(lL)}
            if len(lL) == 1:
                dD = {"id": tId, "name": displayName, "depth": 0}
            else:
                dD = {"id": tId, "name": displayName, "parents": [ptId], "depth": len(lL) - 1}
            dL.append(dD)

        return dL
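
    # Illustrative node records produced by the export above, assuming only the two name
    # records "1 Mainly Alpha" and "1.10 Orthogonal Bundle" from the docstring examples:
    # the breadth-first walk from each root yields
    #
    #   {"id": "1", "name": "Mainly Alpha", "depth": 0}
    #   {"id": "1.10", "name": "Orthogonal Bundle", "parents": ["1"], "depth": 1}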