Example 1
    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.and

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

        Returns:
            dict: Schema object

        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj
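# --- Hypothetical standalone sketch (not part of the method above) ---
# Illustrates the cache-or-rebuild pattern used by getJsonSchema(): rebuild and
# serialize the schema when asked, otherwise reuse a cached JSON file, and
# return None when nothing readable is found. The file name and the rebuild
# callable are illustrative assumptions, not part of the original API.
import json
import os


def load_or_rebuild_schema(cachePath, fileName, rebuildFlag=False, rebuild=None):
    filePath = os.path.join(cachePath, fileName)
    if rebuildFlag and rebuild is not None:
        # Rebuild the schema object and refresh the cache file
        os.makedirs(cachePath, exist_ok=True)
        with open(filePath, "w", encoding="utf-8") as ofh:
            json.dump(rebuild(), ofh)
    if os.path.exists(filePath):
        with open(filePath, "r", encoding="utf-8") as ifh:
            return json.load(ifh)
    return None


# Example: load_or_rebuild_schema(".", "pdbx-core-schema.json", rebuildFlag=True,
#                                 rebuild=lambda: {"type": "object"})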
Example 2
    def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
        """Rebuild the cache of ISO abbreviation term data

        Args:
            urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
            dirPath (str):  cache path
            useCache (bool):  flag to use cached files

        Returns:
            tuple: (dict) title word abbreviations
                   (dict) language conflict dictionary
                   (list) multi-word abbreviation targets

        Notes:
            ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
            https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
        """
        aD = {}
        mU = MarshalUtil(workPath=dirPath)
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
        logger.debug("Using cache data path %s", dirPath)
        mU.mkdir(dirPath)
        if not useCache:
            for fp in [isoLtwaNamePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and mU.exists(isoLtwaNamePath):
            aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
            logger.debug("Abbreviation name length %d", len(aD["abbrev"]))
        elif not useCache:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
            ok = fU.get(urlTargetIsoLtwa, fp)
            aD = self.__getLtwaTerms(dirPath, fp)
            ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
            logger.debug("abbrevD keys %r", list(aD.keys()))
            logger.debug("Caching %d ISO LTWA in %s status %r",
                         len(aD["abbrev"]), isoLtwaNamePath, ok)
        #
        abbrevD = aD["abbrev"] if "abbrev" in aD else {}
        conflictD = aD["conflicts"] if "conflicts" in aD else {}
        multiWordTermL = aD[
            "multi_word_abbrev"] if "multi_word_abbrev" in aD else []
        #
        return abbrevD, conflictD, multiWordTermL
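# --- Hypothetical standalone sketch (not part of the method above) ---
# Mirrors the flow of __rebuildCache() above: drop any stale cache file when
# useCache is False, reuse a cached JSON payload when present, otherwise build
# the payload and cache it, then return the pieces with safe defaults. The
# builder callable and file name are illustrative assumptions.
import json
import os


def rebuild_cache(dirPath, fileName, build, useCache=True):
    os.makedirs(dirPath, exist_ok=True)
    cacheFilePath = os.path.join(dirPath, fileName)
    if not useCache and os.path.exists(cacheFilePath):
        os.remove(cacheFilePath)
    if useCache and os.path.exists(cacheFilePath):
        with open(cacheFilePath, "r", encoding="utf-8") as ifh:
            aD = json.load(ifh)
    else:
        aD = build()
        with open(cacheFilePath, "w", encoding="utf-8") as ofh:
            json.dump(aD, ofh)
    return aD.get("abbrev", {}), aD.get("conflicts", {}), aD.get("multi_word_abbrev", [])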
Example 3
    def readIndex(self):
        try:
            mU = MarshalUtil()
            if not mU.exists(self._indexFilePath):
                return False
            indexObj = mU.doImport(self._indexFilePath, fmt=self.__fmt)
            if indexObj is not None and len(indexObj) > 0:
                self._rL.extend(indexObj)
            return True
        except Exception as e:
            logger.error("Failing with %s", str(e))

        return False
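# --- Hypothetical standalone sketch (not part of the method above) ---
# Mirrors readIndex(): load a serialized index if it exists, extend an
# in-memory record list, and report success with a boolean rather than
# raising. The JSON format and file name are illustrative assumptions.
import json
import os


def read_index(indexFilePath, recordL):
    try:
        if not os.path.exists(indexFilePath):
            return False
        with open(indexFilePath, "r", encoding="utf-8") as ifh:
            indexObj = json.load(ifh)
        if indexObj:
            recordL.extend(indexObj)
        return True
    except Exception:
        return False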
class ChemCompMoleculeProvider(object):
    """Utilities to read and serialize the dictionary of PDBx/mmCIF chemical component definitions."""

    def __init__(self, **kwargs):
        # Default source target locators
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__ccUrlTarget = self.__ccUrlTarget if self.__ccUrlTarget else "http://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif.gz"
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        self.__birdUrlTarget = self.__birdUrlTarget if self.__birdUrlTarget else "http://ftp.wwpdb.org/pub/pdb/data/bird/prd/prdcc-all.cif.gz"
        #
        ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "chem_comp")
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)
        skipObsolete = kwargs.get("skipObsolete", True)
        # Optional id dictionary filter
        filterIdD = kwargs.get("filterIdD", None)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__ccMolD = self.__reload(
            self.__ccUrlTarget, self.__birdUrlTarget, ccFileNamePrefix, dirPath, useCache=useCache, molLimit=molLimit, filterIdD=filterIdD, skipObsolete=skipObsolete
        )

    def testCache(self, minCount=None, logSizes=False):
        if logSizes and self.__ccMolD:
            logger.info("ccMolD object size %.2f MB", getObjSize(self.__ccMolD) / 1000000.0)
        ok = self.__ccMolD and len(self.__ccMolD) >= minCount if minCount else self.__ccMolD is not None
        return ok

    def getMolD(self):
        return self.__ccMolD

    def getMol(self, ccId):
        try:
            return self.__ccMolD[ccId]
        except Exception as e:
            logger.debug("Get molecule %r failing with %s", ccId, str(e))
        return None

    def getReleaseStatus(self, ccId):
        try:
            ccIt = iter(PdbxChemCompIt(self.__ccMolD[ccId]))
            ccIt = next(ccIt, None)
            return ccIt.getReleaseStatus() if ccIt else None
        except Exception as e:
            logger.exception("Failing for ccId %r with %s", ccId, str(e))
        return None

    def __reload(self, ccUrlTarget, birdUrlTarget, ccFileNamePrefix, dirPath, useCache=False, molLimit=None, filterIdD=None, skipObsolete=True):
        """Reload or create serialized data dictionary of chemical components.

        Args:
            ccUrlTarget (str): target url for chemical component dictionary resource file
            birdUrlTarget (str): target url for bird dictionary resource file (cc format)
            dirPath (str): path to the directory containing cache files
            useCache (bool): use cached serialized component data if available
            molLimit (int): maximum number of definitions to process
            filterIdD (dict): dictionary of selected chemical component identifier codes
            skipObsolete (bool): skip obsolete definitions

        Returns:
            (dict): dictionary of chemical component data containers
        """
        #
        startTime = time.time()
        # This is the naming standard for serialized PDBx/mmCIF component data
        ccDataFilePath = os.path.join(dirPath, "%s-chemical-component-data.pic" % ccFileNamePrefix)
        _, fExt = os.path.splitext(ccDataFilePath)
        ccDataFormat = "json" if fExt == ".json" else "pickle"
        #
        if useCache and self.__mU.exists(ccDataFilePath):
            rdCcObjD = self.__mU.doImport(ccDataFilePath, fmt=ccDataFormat)
            ccObjD = {k: rdCcObjD[k] for k in sorted(rdCcObjD.keys())[:molLimit]} if molLimit else rdCcObjD
            if skipObsolete:
                tD = {}
                for ccId in ccObjD:
                    ccIt = iter(PdbxChemCompIt(ccObjD[ccId]))
                    ccIt = next(ccIt, None)
                    if ccIt.getReleaseStatus() not in ["REL", "REF_ONLY"]:
                        continue
                    tD[ccId] = ccObjD[ccId]
                ccObjD = tD

        else:
            # Source component data files ...
            ccdFilePath = self.__fetchUrl(ccUrlTarget, dirPath, useCache=useCache)
            birdFilePath = self.__fetchUrl(birdUrlTarget, dirPath, useCache=useCache)
            rdCcObjD = self.__readComponentDefinitions(ccdFilePath, birdFilePath, molLimit=molLimit, skipObsolete=skipObsolete)
            ccObjD = {ccId: ccObj for ccId, ccObj in rdCcObjD.items() if ccId in filterIdD} if filterIdD else rdCcObjD
            ok = self.__mU.doExport(ccDataFilePath, ccObjD, fmt=ccDataFormat)
            logger.info("Storing %d definitions (status=%r) path: %s ", len(ccObjD), ok, ccDataFilePath)
        #
        endTime = time.time()
        logger.info("Loaded/reloaded %d definitions (%.4f seconds)", len(ccObjD), endTime - startTime)
        return ccObjD

    def __fetchUrl(self, urlTarget, dirPath, useCache=False):
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        if not (useCache and fU.exists(filePath)):
            startTime = time.time()
            ok2 = fU.get(urlTarget, filePath)
            endTime = time.time()
            if ok2:
                logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
            else:
                logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
        #
        return filePath

    def __readComponentDefinitions(self, ccdFilePath, birdFilePath=None, molLimit=None, skipObsolete=True):
        ccObjD = {}
        try:
            startTime = time.time()
            logger.info("Reading definitions from %s", ccdFilePath)
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            endTime = time.time()
            logger.info("Read %s with %d CCD definitions (%.4f seconds)", ccdFilePath, len(rdCcObjL), endTime - startTime)
            # -------
            if birdFilePath:
                startTime = time.time()
                logger.info("Reading definitions from %s", birdFilePath)
                birdCcObjL = self.__mU.doImport(birdFilePath, fmt="mmcif")
                endTime = time.time()
                logger.info("Read %s with %d BIRD definitions (%.4f seconds)", birdFilePath, len(birdCcObjL), endTime - startTime)
                rdCcObjL.extend(birdCcObjL)
            #
            startTime = time.time()
            ccObjL = rdCcObjL[:molLimit] if molLimit else rdCcObjL
            for ccObj in ccObjL:
                ccIt = iter(PdbxChemCompIt(ccObj))
                ccIt = next(ccIt, None)
                ccId = ccIt.getId() if ccIt else ccObj.getName()
                if skipObsolete and ccIt.getReleaseStatus() not in ["REL", "REF_ONLY"]:
                    continue
                ccObjD[ccId] = ccObj
            endTime = time.time()
            logger.info("Processed %d definitions (%.4f seconds)", len(ccObjD), endTime - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return ccObjD
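# --- Hypothetical standalone sketch (not part of the class above) ---
# Shows the dictionary filtering applied in __reload(): truncate to the first
# molLimit sorted keys, keep only identifiers present in filterIdD, and drop
# entries whose release status is not REL or REF_ONLY. Plain dicts stand in
# for the mmCIF data containers used by the provider.
def filter_components(ccObjD, molLimit=None, filterIdD=None, skipObsolete=True):
    if molLimit:
        ccObjD = {k: ccObjD[k] for k in sorted(ccObjD)[:molLimit]}
    if filterIdD:
        ccObjD = {ccId: ccObj for ccId, ccObj in ccObjD.items() if ccId in filterIdD}
    if skipObsolete:
        ccObjD = {ccId: ccObj for ccId, ccObj in ccObjD.items() if ccObj.get("status") in ["REL", "REF_ONLY"]}
    return ccObjD


# Example: filter_components({"ALA": {"status": "REL"}, "XXX": {"status": "OBS"}}, molLimit=10)
# keeps only the "ALA" entry.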
class ReferenceSequenceAssignmentUpdater(object):
    """Selected utilities to update reference sequence assignments information
    in the core_entity collection.

    """
    def __init__(self,
                 cfgOb,
                 databaseName="pdbx_core",
                 collectionName="pdbx_core_polymer_entity",
                 polymerType="Protein",
                 referenceDatabaseName="UniProt",
                 provSource="PDB",
                 **kwargs):
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__databaseName = databaseName
        self.__collectionName = collectionName
        self.__statusList = []
        #
        self.__ssP = self.__fetchSiftsSummaryProvider(
            self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__assignRefD, self.__refD, self.__matchD = self.__reload(
            databaseName, collectionName, polymerType, referenceDatabaseName,
            provSource, **kwargs)

    def __reload(self, databaseName, collectionName, polymerType,
                 referenceDatabaseName, provSource, **kwargs):
        assignRefD = self.__getPolymerReferenceSequenceAssignments(
            databaseName, collectionName, polymerType, **kwargs)
        # get refIdD = {refId: [entity_id, ....], }
        refIdD, _ = self.__getUniqueAssignments(
            assignRefD,
            referenceDatabaseName=referenceDatabaseName,
            provSource=provSource)
        #
        refD, matchD = self.__rebuildReferenceCache(referenceDatabaseName,
                                                    list(refIdD.keys()),
                                                    **kwargs)
        return assignRefD, refD, matchD

    def doUpdate(self, updateId, updateLimit=None):
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        numUpd = 0
        updateDL = self.__buildUpdate(self.__assignRefD)
        if updateDL:
            if updateLimit:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL[:updateLimit],
                                         self.__databaseName,
                                         self.__collectionName)
            else:
                numUpd = self.__doUpdate(self.__cfgOb, updateDL,
                                         self.__databaseName,
                                         self.__collectionName)
        self.__updateStatus(updateId, self.__databaseName,
                            self.__collectionName, True, statusStartTimestamp)
        return len(updateDL), numUpd

    def __doUpdate(self, cfgOb, updateDL, databaseName, collectionName):
        obUpd = ObjectUpdater(cfgOb)
        numUpd = obUpd.update(databaseName, collectionName, updateDL)
        logger.info("Update count is %d", numUpd)

        return numUpd

    def __getPolymerReferenceSequenceAssignments(self, databaseName,
                                                 collectionName, polymerType,
                                                 **kwargs):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
         (dict): {"1abc_1": "rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
                            "rcsb_polymer_entity_align": [],
                            "rcsb_entity_source_organism"" {"ncbi_taxonomy_id": []}
        """
        cachePath = kwargs.get("cachePath", ".")
        exDbDir = "exdb"
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        cacheFilePath = os.path.join(cachePath, exDbDir,
                                     "entity-poly-ref-seq-assign-cache.json")
        #
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=cacheFilePath,
                useCache=useCache,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=cacheKwargs,
                objectLimit=fetchLimit,
                selectionQuery={
                    "entity_poly.rcsb_entity_polymer_type": polymerType
                },
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_entity_container_identifiers.auth_asym_ids",
                    "rcsb_polymer_entity_align",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Entity count is %d", eCount)
            objD = obEx.getObjects()
            logger.info(
                "Reading polymer entity count %d ref accession length %d ",
                eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName,
                             collectionName, str(e))
        return objD

    def __getUniqueAssignments(self,
                               objD,
                               referenceDatabaseName="UniProt",
                               provSource="PDB"):
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_entity_container_identifiers"]
                                        ["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD[
                            "provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(
                                eD["rcsb_entity_source_organism"][ii]
                                ["ncbi_taxonomy_id"])
                        except Exception:
                            logger.warning("Failing taxonomy lookup for %s %r",
                                           entityKey, tD["database_accession"])

                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s",
                             entityKey, str(e))
        #
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.info(
                    "Multitple taxIds assigned to reference sequence id %s: %r",
                    refId, taxIdL)

        logger.info("Unique %s accession assignments by %s %d (missing %d) ",
                    referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD

    def __reMapAccessions(self,
                          rsiDL,
                          referenceDatabaseName="UniProt",
                          provSourceL=None,
                          excludeReferenceDatabases=None):
        """Internal method to re-map accessions for the input databae and assignment source

        Args:
            rsiDL (list): list of accession
            databaseName (str, optional): resource database name. Defaults to 'UniProt'.
            provSource (str, optional): assignment provenance. Defaults to 'PDB'.

        Returns:
            bool, list: flag for mapping success, and remapped (and unmapped) accessions in the input object list
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else [
            "PDB"
        ]
        provSourceL = provSourceL if provSourceL else []
        retDL = []
        for rsiD in rsiDL:
            if rsiD["database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if rsiD["database_name"] == referenceDatabaseName and rsiD[
                    "provenance_source"] in provSourceL:
                try:
                    if len(self.__matchD[rsiD["database_accession"]]
                           ["matchedIds"]) == 1:
                        rsiD["database_accession"] = self.__matchD[
                            rsiD["database_accession"]]["matchedIds"][0]
                        matched += 1
                    else:
                        logger.info(
                            "Skipping mapping to multiple superseding accessions %s",
                            rsiD["database_accession"])
                    #
                except Exception:
                    unMapped += 1
            retDL.append(rsiD)
        if matched == len(retDL):
            isMatched = True
        return not unMapped, isMatched, retDL

    def __reMapAlignments(self,
                          alignDL,
                          referenceDatabaseName="UniProt",
                          provSourceL=None,
                          excludeReferenceDatabases=None):
        """Internal method to re-map alignments for the input databae and assignment source

        Args:
            alignDL (list): list of aligned regions
            databaseName (str, optional): resource database name. Defaults to 'UniProt'.
            provSourceL (list, optional): assignment provenance. Defaults to 'PDB'.

        Returns:
            bool, list: flag for mapping success, and remapped (and unmapped) accessions in the input align list
        """
        isMatched = False
        unMapped = 0
        matched = 0
        excludeReferenceDatabases = excludeReferenceDatabases if excludeReferenceDatabases else [
            "PDB"
        ]
        retDL = []
        provSourceL = provSourceL if provSourceL else []
        for alignD in alignDL:
            if alignD["reference_database_name"] in excludeReferenceDatabases:
                unMapped += 1
                continue
            if alignD[
                    "reference_database_name"] == referenceDatabaseName and alignD[
                        "provenance_code"] in provSourceL:
                try:
                    if len(self.__matchD[
                            alignD["reference_database_accession"]]
                           ["matchedIds"]) == 1:
                        alignD["reference_database_accession"] = self.__matchD[
                            alignD["reference_database_accession"]][
                                "matchedIds"][0]
                        matched += 1
                    else:
                        logger.info(
                            "Skipping alignment mapping to multiple superseding accessions %s",
                            alignD["reference_database_accession"])
                except Exception:
                    unMapped += 1
            retDL.append(alignD)
        if matched == len(retDL):
            isMatched = True
        #
        return not unMapped, isMatched, retDL

    def __getSiftsAccessions(self, entityKey, authAsymIdL):
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), _ in saoLD.items():
            retL.append({
                "database_name": "UniProt",
                "database_accession": dbAccession,
                "provenance_source": "SIFTS"
            })
        return retL

    def __getSiftsAlignments(self, entityKey, authAsymIdL):
        retL = []
        saoLD = self.__ssP.getLongestAlignments(entityKey[:4], authAsymIdL)
        for (_, dbAccession), saoL in saoLD.items():
            dD = {
                "reference_database_name": "UniProt",
                "reference_database_accession": dbAccession,
                "provenance_code": "SIFTS",
                "aligned_regions": []
            }
            for sao in saoL:
                dD["aligned_regions"].append({
                    "ref_beg_seq_id":
                    sao.getDbSeqIdBeg(),
                    "entity_beg_seq_id":
                    sao.getEntitySeqIdBeg(),
                    "length":
                    sao.getEntityAlignLength()
                })
            retL.append(dD)
        return retL

    def __buildUpdate(self, assignRefD):
        #
        updateDL = []
        for entityKey, eD in assignRefD.items():
            selectD = {"rcsb_id": entityKey}
            try:
                updateD = {}
                authAsymIdL = []
                ersDL = (eD["rcsb_entity_container_identifiers"]
                         ["reference_sequence_identifiers"]
                         if "reference_sequence_identifiers"
                         in eD["rcsb_entity_container_identifiers"] else None)
                #
                #
                if ersDL:
                    authAsymIdL = eD["rcsb_entity_container_identifiers"][
                        "auth_asym_ids"]
                    isMapped, isMatched, updErsDL = self.__reMapAccessions(
                        ersDL,
                        referenceDatabaseName="UniProt",
                        provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAccessions(entityKey, authAsymIdL)
                        if tL:
                            logger.debug(
                                "Using SIFTS accession mapping for %s",
                                entityKey)
                        else:
                            logger.info(
                                "No alternative SIFTS accession mapping for %s",
                                entityKey)
                        updErsDL = tL if tL else []
                    #
                    if len(updErsDL) < len(ersDL):
                        logger.info(
                            "Incomplete reference sequence mapping update for %s",
                            entityKey)
                    updateD[
                        "rcsb_entity_container_identifiers.reference_sequence_identifiers"] = updErsDL
                #
                alignDL = eD[
                    "rcsb_polymer_entity_align"] if "rcsb_polymer_entity_align" in eD else None
                if alignDL and authAsymIdL:
                    isMapped, isMatched, updAlignDL = self.__reMapAlignments(
                        alignDL,
                        referenceDatabaseName="UniProt",
                        provSourceL=["PDB"])
                    #
                    if not isMapped or not isMatched:
                        tL = self.__getSiftsAlignments(entityKey, authAsymIdL)
                        if tL:
                            logger.debug(
                                "Using SIFTS alignment mapping for %s",
                                entityKey)
                        else:
                            logger.info(
                                "No alternative SIFTS alignment mapping for %s",
                                entityKey)
                        updAlignDL = tL if tL else updAlignDL
                    #
                    if len(updAlignDL) < len(alignDL):
                        logger.info(
                            "Incomplete alignment mapping update for %s",
                            entityKey)
                    updateD["rcsb_polymer_entity_align"] = updAlignDL
                #
                if updateD:
                    updateDL.append({"selectD": selectD, "updateD": updateD})
            except Exception as e:
                logger.exception("Mapping error for %s with %s", entityKey,
                                 str(e))
        #
        return updateDL

    def __rebuildReferenceCache(self, refDbName, idList, **kwargs):
        """ """
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
            # Check for completeness -
            missingS = set(dD["refDbCache"].keys()) - set(idList)
            if missingS:
                logger.info("Reference sequence cache missing %d accessions",
                            len(missingS))
                extraD = self.__fetchReferenceEntries(refDbName,
                                                      list(missingS),
                                                      saveText=saveText,
                                                      fetchLimit=fetchLimit)
                dD["refDbCache"].update(extraD["refDbCache"])
                dD["matchInfo"].update(extraD["matchInfo"])
                if cacheFilePath and cacheKwargs:
                    self.__mU.mkdir(dirPath)
                    ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                    logger.info("Cache updated with status %r", ok)
            #
        else:
            dD = self.__fetchReferenceEntries(refDbName,
                                              idList,
                                              saveText=saveText,
                                              fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)

        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self,
                                refDbName,
                                idList,
                                saveText=False,
                                fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList),
                        refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {
                    "refDbName": refDbName,
                    "refDbCache": refD,
                    "matchInfo": matchD
                }

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        abbreviated = kwargs.get("siftsAbbreviated", "PROD")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH",
                                             sectionName=configName)
        # logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(
            cachePath,
            cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath,
                                   cacheDirPath=cacheDirPath,
                                   useCache=useCache,
                                   abbreviated=abbreviated,
                                   cacheKwargs=cacheKwargs)
        logger.info("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __dumpEntries(self, refD):
        for (eId, eDict) in refD.items():
            logger.info("------ Reference id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def __getUpdateAssignmentCandidates(self, objD):
        totCount = 0
        difCount = 0
        pdbUnpIdD = defaultdict(list)
        siftsUnpIdD = defaultdict(list)
        assignIdDifD = defaultdict(list)
        #
        for entityKey, eD in objD.items():
            try:
                siftsS = set()
                pdbS = set()
                for tD in eD["rcsb_entity_container_identifiers"][
                        "reference_sequence_identifiers"]:
                    if tD["database_name"] == "UniProt":
                        if tD["provenance_source"] == "SIFTS":
                            siftsS.add(tD["database_accession"])
                            siftsUnpIdD[tD["database_accession"]].append(
                                entityKey)
                        elif tD["provenance_source"] == "PDB":
                            pdbS.add(tD["database_accession"])
                            pdbUnpIdD[tD["database_accession"]].append(
                                entityKey)
                    else:
                        logger.debug("No UniProt for %r",
                                     eD["rcsb_entity_container_identifiers"])
                logger.debug("PDB assigned sequence length %d", len(pdbS))
                logger.debug("SIFTS assigned sequence length %d", len(siftsS))

                if pdbS and siftsS:
                    totCount += 1
                    if pdbS != siftsS:
                        difCount += 1
                        for idV in pdbS:
                            assignIdDifD[idV].append(entityKey)

            except Exception as e:
                logger.warning("No identifiers for %s with %s", entityKey,
                               str(e))
        #
        logger.info("Total %d differences %d", totCount, difCount)
        logger.info("Unique UniProt accession assignments PDB %d  SIFTS %d",
                    len(pdbUnpIdD), len(siftsUnpIdD))
        logger.info("Current unique overalapping assignment differences %d ",
                    len(assignIdDifD))
        logger.info("Current unique overalapping assignment differences %r ",
                    assignIdDifD)
        return assignIdDifD, pdbUnpIdD, siftsUnpIdD

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary:  %d secondary: %d none %d", numPrimary,
                     numSecondary, numNone)
        return numPrimary, numSecondary, numNone

    def getLoadStatus(self):
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status,
                       startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
    def __rebuildCache(self,
                       targetUrl,
                       mapNameL,
                       outDirPath,
                       rawDirPath,
                       fmt="pickle",
                       useCache=True):
        """Fetch the UniProt selected id mapping resource file and extract
        UniProt Acc to  'mapIndex' mapping. Serialize the mapping as required.

        Args:
            targetUrl (str): source URL of the remote index file
            mapNameL (list): list of key mapping names to extract from the index
            outDirPath (str): directory path for raw and processed mapping files
            fmt (str, optional): output format (pickle|json) . Defaults to "pickle".
            useCache (bool, optional): use cached files. Defaults to True.

        Returns:
            dict: od[uniprotId] = mapped value

                idmapping_selected.tab

                1. UniProtKB-AC
                2. UniProtKB-ID
                3. GeneID (EntrezGene)
                4. RefSeq
                5. GI
                6. PDB
                7. GO
                8. UniRef100
                9. UniRef90
                10. UniRef50
                11. UniParc
                12. PIR
                13. NCBI-taxon
                14. MIM
                15. UniGene
                16. PubMed
                17. EMBL
                18. EMBL-CDS
                19. Ensembl
                20. Ensembl_TRS
                21. Ensembl_PRO
                22. Additional PubMed

        """
        startTime = time.time()
        nL = mapNameL
        oD = {}
        ok = False
        try:
            fileU = FileUtil()
            fExt = "pic" if fmt == "pickle" else "json"
            fExt = "tdd" if fmt == "tdd" else fExt
            fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
            mapFileName = fN + "-map." + fExt
            idMapPath = os.path.join(outDirPath, mapFileName)
            mU = MarshalUtil()
            if useCache and mU.exists(idMapPath):
                logger.info("Reading cached serialized file %r", idMapPath)
                if fmt in ["pickle", "json"]:
                    tD = mU.doImport(idMapPath, fmt=fmt)
                    nL = list(set(tD["idNameList"]))
                    oD = tD["uniprotMapD"]
                    logger.info("keys %r", list(oD.keys())[:10])
                    logger.info("nL %r", nL)
                    ok = True
                elif fmt == "tdd":
                    ioU = IoUtil()
                    it = ioU.deserializeCsvIter(idMapPath,
                                                delimiter="\t",
                                                rowFormat="list",
                                                encodingErrors="ignore")
                    tL = next(it, [])
                    nL = tL[1:]
                    if len(nL) == 1:
                        for row in it:
                            oD[row[0]] = row[1]
                    else:
                        for row in it:
                            oD[row[0]] = row[1:]
                    ok = True
            else:
                idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
                if not fileU.exists(idPath):
                    logger.info(
                        "Fetching selected UniProt idmapping data from %r in %r",
                        targetUrl, outDirPath)
                    ok = fileU.get(targetUrl, idPath)
                    if not ok:
                        logger.error("Failed to downlowd %r", targetUrl)
                        return oD
                else:
                    logger.info("Using cached mapping file %r", idPath)
                # ---
                ioU = IoUtil()
                if fmt in ["pickle", "json"]:
                    if len(mapNameL) == 1:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            oD[row[0]] = str(
                                row[self.__mapRecordD[mapNameL[0]] - 1])
                    else:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            for mapName in mapNameL:
                                oD.setdefault(row[0], []).append(
                                    str(row[self.__mapRecordD[mapName] - 1]))
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    ok = mU.doExport(idMapPath, {
                        "idNameList": mapNameL,
                        "uniprotMapD": oD
                    },
                                     fmt=fmt)
                elif fmt == "tdd":
                    #
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    fU = FileUtil()
                    fU.mkdirForFile(idMapPath)
                    colNameL = []
                    colNameL.append("UniProtId")
                    colNameL.extend(mapNameL)
                    with open(idMapPath, "w", encoding="utf-8") as ofh:
                        ofh.write("%s\n" % "\t".join(colNameL))
                        if len(mapNameL) == 1:
                            idx = self.__mapRecordD[mapNameL[0]] - 1
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write("%s\t%s\n" % (row[0], row[idx]))
                        else:
                            idxL = [0]
                            idxL.extend([
                                self.__mapRecordD[mapName] - 1
                                for mapName in mapNameL
                            ])
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write(
                                    "%s\n" %
                                    "\t".join([str(row[idx]) for idx in idxL]))
                            #
                    nL, oD = self.__rebuildCache(targetUrl,
                                                 mapNameL,
                                                 outDirPath,
                                                 rawDirPath,
                                                 fmt=fmt,
                                                 useCache=True)
                    ok = True if nL and oD else False
            logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return nL, oD
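# --- Hypothetical standalone sketch (not part of the method above) ---
# Reproduces the "tdd" branch of __rebuildCache() above in miniature: write a
# header line plus UniProtId/value rows to a tab-delimited file, then read it
# back into a plain dict (a single mapped column yields scalar values,
# otherwise lists). The file layout mirrors the method; names are illustrative.
import csv


def write_tdd(filePath, colNameL, mapD):
    with open(filePath, "w", encoding="utf-8") as ofh:
        ofh.write("%s\n" % "\t".join(colNameL))
        for uniprotId, value in mapD.items():
            row = [uniprotId] + (list(value) if isinstance(value, (list, tuple)) else [value])
            ofh.write("%s\n" % "\t".join(str(v) for v in row))


def read_tdd(filePath):
    oD = {}
    with open(filePath, "r", encoding="utf-8", errors="ignore") as ifh:
        reader = csv.reader(ifh, delimiter="\t")
        header = next(reader, [])
        for row in reader:
            oD[row[0]] = row[1] if len(header) == 2 else row[1:]
    return oD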
Example 7
class InterProProvider(object):
    """Manage mappings of InterPro identifiers to description and parent/child relationships"""

    def __init__(self, **kwargs):
        urlTargetInterPro = kwargs.get("urlTargetInterPro", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/entry.list")
        urlTargetInterProFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/entry.list"
        urlTargetInterProParent = kwargs.get("urlTargetInterPro", "ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/ParentChildTreeFile.txt")
        urlTargetInterProParentFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/InterPro/ParentChildTreeFile.txt"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "interPro")
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__interProD, self.__interProParentD = self.__rebuildCache(urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache)

    def getDescription(self, interProId):
        ret = None
        try:
            ret = self.__interProD[interProId]["description"]
        except Exception:
            pass
        return ret

    def getType(self, interProId):
        ret = None
        try:
            ret = self.__interProD[interProId]["type"]
        except Exception:
            pass
        return ret

    def testCache(self):
        # Check length ...
        logger.info("Length InterPro %d", len(self.__interProD))
        return len(self.__interProD) > 1000

    #
    def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(interProDataPath):
            rD = self.__mU.doImport(interProDataPath, fmt=fmt)
            interProD = rD["index"]
            interProParentD = rD["parents"]
            logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
            ok = fU.get(urlTargetInterPro, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
                ok = fU.get(urlTargetInterProFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProD = self.__getInterProIndex(fp)

            logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
            # ------
            logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
            ok = fU.get(urlTargetInterProParent, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
                ok = fU.get(urlTargetInterProParentFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProParentD = self.__getInterProParents(fp)
            #
            ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
        #
        return interProD, interProParentD

    def getLineage(self, idCode):
        pList = []
        try:
            pList.append(idCode)
            pt = self.getParentId(idCode)
            while (pt is not None) and (pt != 1):
                pList.append(pt)
                pt = self.getParentId(pt)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        pList.reverse()
        return pList

    def getLineageWithNames(self, idCode):
        linL = []
        try:
            idCodeL = self.getLineage(idCode)
            for ii, idCode in enumerate(idCodeL, 1):
                linL.append((idCode, self.getDescription(idCode), ii))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return linL

    def getParentId(self, idCode):
        try:
            return self.__interProParentD[idCode]
        except Exception:
            pass
        return None

    def getTreeNodeList(self, filterD=None):
        dL = []
        try:
            for idCode, _ in self.__interProD.items():
                if filterD and idCode not in filterD:
                    continue
                displayName = self.getDescription(idCode)
                pId = self.getParentId(idCode)
                linL = self.getLineage(idCode)
                #
                if pId is None:
                    dD = {"id": idCode, "name": displayName, "depth": 0}
                else:
                    dD = {"id": idCode, "name": displayName, "parents": [pId], "depth": len(linL) - 1}
                dL.append(dD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL

    def __getInterProParents(self, filePath):
        """Read the InterPro parent hierarchy and return a dictionary parent ids.

        Args:
            filePath (str): path to InterPro parent/child hierachy

        Returns:
            dict: {idCode: parentIdCode or None}
        """
        interProParentD = {}
        lineL = self.__mU.doImport(filePath, fmt="list")
        stack = []
        for line in lineL:
            content = line.rstrip()  # drop \n
            row = content.split("--")
            ff = row[-1].split("::")
            tS = ff[0].strip()
            # stack[:] = stack[: len(row) - 1] + [row[-1]]
            stack[:] = stack[: len(row) - 1] + [tS]
            for ii, idCode in enumerate(stack):
                if idCode not in interProParentD:  # prevents overwriting the parent of idCode, in case idCode has already been iterated over in ParentChildTreeFile.txt
                    interProParentD[idCode] = None if ii == 0 else stack[ii - 1]
                else:
                    # This will correct the parent of idCode from being None if it's later identified as having a parent at another point in ParentChildTreeFile.txt
                    if interProParentD[idCode] is None and ii != 0:
                        interProParentD[idCode] = stack[ii - 1]
            logger.debug("Lineage %r", "\t".join(stack))
        #
        return interProParentD

    def __getInterProIndex(self, filePath):
        """Read CSV file of InterPro accessions and descriptions

        Args:
            filePath (str): path to InterPro accession/description csv file

        Returns:
            dict: {idCode: description}
        """

        interProD = {}
        encodingD = {"encoding": "ascii"} if sys.version_info[0] < 3 else {}
        rowL = self.__mU.doImport(filePath, fmt="tdd", rowFormat="list", **encodingD)
        for row in rowL:
            try:
                interProId = row[0].strip().upper()
                interProType = row[1].strip()
                descr = row[2].strip()
                interProD[interProId] = {"description": descr, "type": interProType}
            except Exception:
                pass
        #
        return interProD
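# --- Hypothetical standalone sketch (not part of the class above) ---
# Illustrates the parent-dictionary lineage walk used by getLineage(): follow
# parent links until a root (None) is reached, then reverse so the root comes
# first. The toy identifiers below are illustrative, not real InterPro ids.
def get_lineage(idCode, parentD):
    pList = [idCode]
    pt = parentD.get(idCode)
    while pt is not None:
        pList.append(pt)
        pt = parentD.get(pt)
    pList.reverse()
    return pList


# parentD = {"IPR-C": "IPR-B", "IPR-B": "IPR-A", "IPR-A": None}
# get_lineage("IPR-C", parentD) -> ["IPR-A", "IPR-B", "IPR-C"]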
class RemovedHoldingsProvider(object):
    """Provide an inventory of removed repository content."""

    def __init__(self, **kwargs):
        self.__dirPath = kwargs.get("holdingsDirPath", ".")
        useCache = kwargs.get("useCache", True)
        baseUrl = kwargs.get("baseUrl", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/")
        urlTarget = kwargs.get("removedTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        urlFallbackTarget = kwargs.get("removedTargetUrl", os.path.join(baseUrl, "removed_holdings.json.gz"))
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__invD = self.__reload(urlTarget, urlFallbackTarget, self.__dirPath, useCache=useCache)

    def testCache(self, minCount=1000):
        logger.info("Inventory length cD (%d)", len(self.__invD))
        if len(self.__invD) > minCount:
            return True
        return False

    def getStatusCode(self, entryId):
        """Return the status code for the removed entry"""
        try:
            return self.__invD[entryId.upper()]["status_code"]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return None

    def getRemovedInfo(self, entryId):
        """Return the dictionary describing the details for this removed entry"""
        try:
            return self.__invD[entryId.upper()]
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return {}

    def getContentTypes(self, entryId):
        """Return the removed content types for the input entry identifier"""
        try:
            return sorted(self.__invD[entryId.upper()]["content_type"].keys())
        except Exception as e:
            logger.debug("Failing for %r with %s", entryId, str(e))
        return []

    def getContentTypePathList(self, entryId, contentType):
        """Return the removed content types for the input entry identifier"""
        try:
            return (
                self.__invD[entryId.upper()]["content_type"][contentType]
                if isinstance(self.__invD[entryId.upper()]["content_type"][contentType], list)
                else [self.__invD[entryId.upper()]["content_type"][contentType]]
            )
        except Exception as e:
            logger.debug("Failing for %r %r with %s", entryId, contentType, str(e))
        return []

    def getInventory(self):
        """Return the removed inventory dictionary"""
        try:
            return self.__invD
        except Exception as e:
            logger.debug("Failing with %s", str(e))
        return {}

    def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
        invD = {}
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(fp):
            invD = self.__mU.doImport(fp, fmt="json")
            logger.debug("Reading cached inventory (%d)", len(invD))
        else:
            logger.info("Fetch inventory from %s", urlTarget)
            ok = fU.get(urlTarget, fp)
            if not ok:
                ok = fU.get(urlFallbackTarget, fp)
            #
            if ok:
                invD = self.__mU.doImport(fp, fmt="json")
        #
        return invD
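# --- Hypothetical standalone sketch (not part of the class above) ---
# Shows the normalization done by getContentTypePathList(): a content-type
# entry may hold either a single path or a list of paths, and callers always
# get a list back. The inventory layout below is an illustrative assumption.
def content_type_paths(invD, entryId, contentType):
    try:
        value = invD[entryId.upper()]["content_type"][contentType]
        return value if isinstance(value, list) else [value]
    except KeyError:
        return []


# invD = {"1ABC": {"content_type": {"model": "pdb1abc.ent.gz"}}}
# content_type_paths(invD, "1abc", "model") -> ["pdb1abc.ent.gz"]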
class ReferenceSequenceUtils(object):
    """Selected utilities to integrate reference sequence information with PDB polymer entity data."""
    def __init__(self, cfgOb, refDbName, **kwargs):
        self.__cfgOb = cfgOb
        self.__refDbName = refDbName
        self.__mU = MarshalUtil()
        #
        self.__refIdList = self.__getReferenceAssignments(refDbName, **kwargs)
        self.__refD, self.__matchD = self.__rebuildCache(
            refDbName, self.__refIdList, **kwargs)

    def __getReferenceAssignments(self, refDbName, **kwargs):
        """Get all accessions assigned to input reference sequence database"""
        rL = []
        exdbDirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", None)
        useCache = kwargs.get("useCache", True)
        entryLimit = kwargs.get("entryLimit", None)

        try:
            epe = EntityPolymerExtractor(self.__cfgOb,
                                         exdbDirPath=exdbDirPath,
                                         useCache=useCache,
                                         cacheKwargs=cacheKwargs,
                                         entryLimit=entryLimit)
            eCount = epe.getEntryCount()
            rL = epe.getRefSeqAccessions(refDbName)
            logger.info(
                "Reading polymer entity cache with repository entry count %d ref accession length %d ",
                eCount, len(rL))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rL

    def __rebuildCache(self, refDbName, idList, **kwargs):
        """ """
        dD = {}
        dirPath = kwargs.get("exdbDirPath", None)
        cacheKwargs = kwargs.get("cacheKwargs", None)
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        saveText = kwargs.get("saveText", False)

        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = "ref-sequence-data-cache" + "." + ext
        cacheFilePath = os.path.join(dirPath, fn)
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [cacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and cacheFilePath and self.__mU.exists(cacheFilePath):
            dD = self.__mU.doImport(cacheFilePath, **cacheKwargs)
        else:
            dD = self.__fetchReferenceEntries(refDbName,
                                              idList,
                                              saveText=saveText,
                                              fetchLimit=fetchLimit)
            if cacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok = self.__mU.doExport(cacheFilePath, dD, **cacheKwargs)
                logger.info("Cache save status %r", ok)

        return dD["refDbCache"], dD["matchInfo"]

    def __fetchReferenceEntries(self,
                                refDbName,
                                idList,
                                saveText=False,
                                fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}, "matchInfo": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList),
                        refDbName)
            if refDbName == "UNP":
                fobj = UniProtUtils(saveText=saveText)
                refD, matchD = fobj.fetchList(idList)
                dD = {
                    "refDbName": refDbName,
                    "refDbCache": refD,
                    "matchInfo": matchD
                }

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD

    def __dumpEntries(self, refD):
        for (eId, eDict) in refD.items():
            logger.info("------ Entry id %s", eId)
            for k, v in eDict.items():
                logger.info("%-15s = %r", k, v)

    def getReferenceAccessionAlignSummary(self):
        """Summarize the alignment of PDB accession assignments with the current reference sequence database."""
        numPrimary = 0
        numSecondary = 0
        numNone = 0
        for _, mD in self.__matchD.items():
            if mD["matched"] == "primary":
                numPrimary += 1
            elif mD["matched"] == "secondary":
                numSecondary += 1
            else:
                numNone += 1
        logger.debug("Matched primary:  %d secondary: %d none %d", numPrimary,
                     numSecondary, numNone)
        return numPrimary, numSecondary, numNone
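
A brief usage sketch for ReferenceSequenceUtils; the configuration object (cfgOb), cache directory, and fetch limit below are caller-supplied assumptions, not defaults documented by the class:

def summarizeUnpAlignments(cfgOb):
    # Hedged sketch: build the reference sequence cache for UniProt ("UNP") assignments and
    # report how many PDB accessions match primary, secondary, or no reference identifiers.
    rsu = ReferenceSequenceUtils(cfgOb, "UNP", exdbDirPath="./CACHE/exdb", useCache=True,
                                 cacheKwargs={"fmt": "json"}, fetchLimit=50)
    return rsu.getReferenceAccessionAlignSummary()
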
class ReferenceSequenceAssignmentProvider(object):
    """Utilities to cache content required to update referencence sequence assignments."""

    def __init__(
        self,
        cfgOb,
        databaseName="pdbx_core",
        collectionName="pdbx_core_polymer_entity",
        polymerType="Protein",
        referenceDatabaseName="UniProt",
        provSource="PDB",
        maxChunkSize=100,
        fetchLimit=None,
        **kwargs
    ):
        self.__cfgOb = cfgOb
        self.__polymerType = polymerType
        self.__mU = MarshalUtil()
        #
        self.__maxChunkSize = maxChunkSize
        self.__statusList = []
        #
        self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
        self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)

    def goIdExists(self, goId):
        try:
            return self.__goP.exists(goId)
        except Exception as e:
            logger.exception("Failing for %r with %s", goId, str(e))
        return False

    def getGeneOntologyLineage(self, goIdL):
        # "id"     "name"
        gL = []
        try:
            gTupL = self.__goP.getUniqueDescendants(goIdL)
            for gTup in gTupL:
                gL.append({"id": gTup[0], "name": gTup[1]})
        except Exception as e:
            logger.exception("Failing for %r with %s", goIdL, str(e))
        return gL

    def getPfamProvider(self):
        return self.__pfP

    def getInterProProvider(self):
        return self.__ipP

    def getEcProvider(self):
        return self.__ecP

    def getSiftsSummaryProvider(self):
        return self.__ssP

    def getMatchInfo(self):
        return self.__matchD

    def getRefData(self):
        return self.__refD

    def getDocuments(self, formatType="exchange"):
        fobj = UniProtUtils(saveText=False)
        exObjD = fobj.reformat(self.__refD, formatType=formatType)
        return list(exObjD.values())

    def getRefIdMap(self):
        return self.__refIdMapD

    def getRefDataCount(self):
        return len(self.__refD)

    def testCache(self, minMatchPrimaryPercent=None, logSizes=False):
        okC = True
        logger.info("Reference cache lengths: refIdMap %d matchD %d refD %d", len(self.__refIdMapD), len(self.__matchD), len(self.__refD))
        ok = bool(self.__refIdMapD and self.__matchD and self.__refD)
        #
        numRef = len(self.__refIdMapD)
        countD = defaultdict(int)
        logger.info("Match dictionary length %d", len(self.__matchD))
        for _, mD in self.__matchD.items():
            if "matched" in mD:
                countD[mD["matched"]] += 1
        logger.info("Reference length %d match length %d coverage %r", len(self.__refD), len(self.__matchD), countD.items())
        if minMatchPrimaryPercent:
            try:
                okC = 100.0 * float(countD["primary"]) / float(numRef) > minMatchPrimaryPercent
            except Exception:
                okC = False
            logger.info("Primary reference match percent test status %r", okC)
        #
        if logSizes:
            logger.info(
                "Pfam %.2f InterPro %.2f SIFTS %.2f GO %.2f EC %.2f RefIdMap %.2f RefMatchD %.2f RefD %.2f",
                getObjSize(self.__pfP) / 1000000.0,
                getObjSize(self.__ipP) / 1000000.0,
                getObjSize(self.__ssP) / 1000000.0,
                getObjSize(self.__goP) / 1000000.0,
                getObjSize(self.__ecP) / 1000000.0,
                getObjSize(self.__refIdMapD) / 1000000.0,
                getObjSize(self.__matchD) / 1000000.0,
                getObjSize(self.__refD) / 1000000.0,
            )
        return ok and okC

    def __reload(self, databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs):
        assignRefD = self.__getPolymerReferenceSequenceAssignments(databaseName, collectionName, polymerType, fetchLimit)
        refIdMapD, _ = self.__getAssignmentMap(assignRefD, referenceDatabaseName=referenceDatabaseName, provSource=provSource)
        #
        entryIdL = [rcsbId[:4] for rcsbId in assignRefD]
        siftsUniProtL = self.__ssP.getEntryUniqueIdentifiers(entryIdL, idType="UNPID")
        logger.info("Incorporating %d SIFTS accessions for %d entries", len(siftsUniProtL), len(entryIdL))
        unpIdList = sorted(set(list(refIdMapD.keys()) + siftsUniProtL))
        #
        logger.info("Rebuild cache for %d UniProt accessions (consolidated)", len(unpIdList))
        #
        matchD, refD = self.__rebuildReferenceCache(unpIdList, referenceDatabaseName, **kwargs)
        return refIdMapD, matchD, refD

    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Returns:
            (dict): {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                                "rcsb_polymer_entity_align": [],
                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
        """
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                    # "rcsb_polymer_entity_align",
                    # "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    # "rcsb_polymer_entity_container_identifiers.related_annotation_identifiers",
                    # "rcsb_polymer_entity_annotation",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count type %s is %d", polymerType, eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity count %d ref accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD

    def __getAssignmentMap(self, objD, referenceDatabaseName="UniProt", provSource="PDB"):
        refIdD = defaultdict(list)
        taxIdD = defaultdict(list)
        numMissing = 0
        numMissingTaxons = 0
        for entityKey, eD in objD.items():
            try:
                accS = set()
                for ii, tD in enumerate(eD["rcsb_polymer_entity_container_identifiers"]["reference_sequence_identifiers"]):
                    if tD["database_name"] == referenceDatabaseName and tD["provenance_source"] == provSource:
                        accS.add(tD["database_accession"])
                        refIdD[tD["database_accession"]].append(entityKey)
                        #
                        # pick up the corresponding taxonomy -
                        try:
                            taxIdD[tD["database_accession"]].append(eD["rcsb_entity_source_organism"][ii]["ncbi_taxonomy_id"])
                        except Exception:
                            logger.debug("Failing taxonomy lookup for %s %r", entityKey, tD["database_accession"])
                            numMissingTaxons += 1

                logger.debug("PDB assigned sequences length %d", len(accS))
            except Exception as e:
                numMissing += 1
                logger.debug("No sequence assignments for %s with %s", entityKey, str(e))
        #
        numMultipleTaxons = 0
        for refId, taxIdL in taxIdD.items():
            taxIdL = list(set(taxIdL))
            if len(taxIdL) > 1:
                logger.debug("Multitple taxIds assigned to reference sequence id %s: %r", refId, taxIdL)
                numMultipleTaxons += 1

        logger.info("Entities with missing taxonomy %d", numMissingTaxons)
        logger.info("Reference sequences with multiple taxonomies %d", numMultipleTaxons)
        logger.info("Unique %s accession assignments by %s %d (entities missing archive accession assignments %d) ", referenceDatabaseName, provSource, len(refIdD), numMissing)
        return refIdD, taxIdD

    #
    def __rebuildReferenceCache(self, idList, refDbName, **kwargs):
        """ """
        fetchLimit = None
        doMissing = True
        dD = {}
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, "exdb")
        # cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        saveText = kwargs.get("saveText", False)
        #
        ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
        fn = refDbName + "-ref-sequence-data-cache" + "." + ext
        dataCacheFilePath = os.path.join(dirPath, fn)
        #
        fn = refDbName + "-ref-sequence-id-cache" + ".json"
        accCacheFilePath = os.path.join(dirPath, fn)
        #
        self.__mU.mkdir(dirPath)
        if not useCache:
            for fp in [dataCacheFilePath, accCacheFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and accCacheFilePath and self.__mU.exists(accCacheFilePath) and dataCacheFilePath and self.__mU.exists(dataCacheFilePath):
            dD = self.__mU.doImport(dataCacheFilePath, **cacheKwargs)
            idD = self.__mU.doImport(accCacheFilePath, fmt="json")
            logger.info("Reading cached reference sequence ID and data cache files - cached match reference length %d", len(idD["matchInfo"]))
            idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
            # Check for completeness -
            if doMissing:
                missingS = set(idList) - set(idD["matchInfo"].keys())
                if missingS:
                    logger.info("Reference sequence cache missing %d accessions", len(missingS))
                    extraD, extraIdD = self.__fetchReferenceEntries(refDbName, list(missingS), saveText=saveText, fetchLimit=fetchLimit)
                    dD["refDbCache"].update(extraD["refDbCache"])
                    idD["matchInfo"].update(extraIdD["matchInfo"])
                    #
                    idD["matchInfo"] = self.__rebuildReferenceMatchIndex(idList, dD["refDbCache"])
                    #
                    if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                        self.__mU.mkdir(dirPath)
                        ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                        ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                        logger.info("Cache updated with missing references with status %r", ok1 and ok2)
            #
        else:
            logger.info("Rebuilding reference cache for %s for %d accessions with limit %r", refDbName, len(idList), fetchLimit)
            dD, idD = self.__fetchReferenceEntries(refDbName, idList, saveText=saveText, fetchLimit=fetchLimit)
            if accCacheFilePath and dataCacheFilePath and cacheKwargs:
                self.__mU.mkdir(dirPath)
                ok1 = self.__mU.doExport(dataCacheFilePath, dD, **cacheKwargs)
                ok2 = self.__mU.doExport(accCacheFilePath, idD, fmt="json", indent=3)
                logger.info("Cache save status %r", ok1 and ok2)

        return idD["matchInfo"], dD["refDbCache"]

    def __rebuildReferenceMatchIndex(self, idList, referenceD):
        fobj = UniProtUtils()
        logger.info("Rebuilding match index on idList (%d) using reference data (%d) %r", len(idList), len(referenceD), type(referenceD))
        matchD = fobj.rebuildMatchResultIndex(idList, referenceD)
        return matchD

    def __fetchReferenceEntries(self, refDbName, idList, saveText=False, fetchLimit=None):
        """Fetch database entries from the input reference sequence database name."""
        dD = {"refDbName": refDbName, "refDbCache": {}}
        idD = {"matchInfo": {}, "refIdMap": {}}

        try:
            idList = idList[:fetchLimit] if fetchLimit else idList
            logger.info("Starting fetch for %d %s entries", len(idList), refDbName)
            if refDbName == "UniProt":
                fobj = UniProtUtils(saveText=saveText)
                logger.info("Maximum reference chunk size %d", self.__maxChunkSize)
                refD, matchD = fobj.fetchList(idList, maxChunkSize=self.__maxChunkSize)
                dD = {"refDbName": refDbName, "refDbCache": refD}
                idD = {"matchInfo": matchD}
            #
            # Check the coverage -
            #
            countD = defaultdict(int)
            logger.info("Match dictionary length %d", len(matchD))
            for _, mD in matchD.items():
                if "matched" in mD:
                    countD[mD["matched"]] += 1
            logger.info("Reference length %d match length %d coverage %r", len(refD), len(matchD), countD.items())
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return dD, idD

    def __fetchSiftsSummaryProvider(self, cfgOb, configName, **kwargs):
        abbreviated = kwargs.get("siftsAbbreviated", "TEST")
        cachePath = kwargs.get("cachePath", ".")
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
        useCache = kwargs.get("useCache", True)
        #
        siftsSummaryDataPath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH", sectionName=configName)
        # logger.info("Using SIFTS_SUMMARY_DATA_PATH, %r", siftsSummaryDataPath)
        if siftsSummaryDataPath.lower().startswith("http"):
            srcDirPath = siftsSummaryDataPath
        else:
            srcDirPath = os.path.join(cachePath, siftsSummaryDataPath)
        cacheDirPath = os.path.join(cachePath, cfgOb.get("SIFTS_SUMMARY_CACHE_DIR", sectionName=configName))
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        ssP = SiftsSummaryProvider(srcDirPath=srcDirPath, cacheDirPath=cacheDirPath, useCache=useCache, abbreviated=abbreviated, cacheKwargs=cacheKwargs)
        ok = ssP.testCache()
        logger.debug("SIFTS cache status %r", ok)
        logger.debug("ssP entry count %d", ssP.getEntryCount())
        return ssP

    def __fetchGoProvider(self, cfgOb, configName, **kwargs):
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        logger.debug("goP %r %r", cacheDirPath, useCache)
        goP = GeneOntologyProvider(goDirPath=cacheDirPath, useCache=useCache)
        ok = goP.testCache()
        logger.debug("Gene Ontology (%r) root node count %r", ok, goP.getRootNodes())
        return goP

    def __fetchEcProvider(self, cfgOb, configName, **kwargs):
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        #
        cacheDirPath = os.path.join(cachePath, cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR", sectionName=configName))
        logger.debug("ecP %r %r", cacheDirPath, useCache)
        ecP = EnzymeDatabaseProvider(enzymeDirPath=cacheDirPath, useCache=useCache)
        ok = ecP.testCache()
        logger.debug("Enzyme cache status %r", ok)
        return ecP

    def __fetchPfamProvider(self, cfgOb, configName, **kwargs):
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        pfP = PfamProvider(cachePath=cachePath, useCache=useCache)
        ok = pfP.testCache()
        return pfP if ok else None

    def __fetchInterProProvider(self, cfgOb, configName, **kwargs):
        _ = cfgOb
        _ = configName
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        ipP = InterProProvider(cachePath=cachePath, useCache=useCache)
        ok = ipP.testCache()
        return ipP if ok else None
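
A usage sketch for ReferenceSequenceAssignmentProvider follows; cfgOb, the cache path, and the match threshold are caller-supplied assumptions:

def buildReferenceAssignmentCache(cfgOb, cachePath="./CACHE"):
    # Hedged sketch: assemble the consolidated UniProt reference cache and, if the cache
    # verifies, return the reference documents in exchange format.
    rsaP = ReferenceSequenceAssignmentProvider(
        cfgOb,
        databaseName="pdbx_core",
        collectionName="pdbx_core_polymer_entity",
        polymerType="Protein",
        referenceDatabaseName="UniProt",
        provSource="PDB",
        maxChunkSize=100,
        fetchLimit=None,
        cachePath=cachePath,
        useCache=True,
    )
    ok = rsaP.testCache(minMatchPrimaryPercent=70.0)
    docL = rsaP.getDocuments(formatType="exchange") if ok else []
    return ok, docL
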
Example n. 11
0
class GlycanProvider(StashableBase):
    """Accessors and generators for entity glycan mapped identifiers.

    dirPath -> CACHE/glycan/
                             branched_entity_glycan_identifier_map.json
                             accession-wurcs-mapping.json
                     stash/entity_glycan_mapped_identifiers.tar.gz

    """
    def __init__(self, **kwargs):
        #
        self.__version = "0.50"
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirName = "glycan"
        self.__dirPath = os.path.join(cachePath, self.__dirName)
        super(GlycanProvider, self).__init__(cachePath, [self.__dirName])
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__glyD = self.__reload(fmt="json", useCache=useCache)
        #

    def testCache(self, minCount=1):
        if minCount == 0:
            return True
        if self.__glyD and minCount and ("identifiers" in self.__glyD) and len(
                self.__glyD["identifiers"]) >= minCount:
            logger.info("Glycan identifiers (%d)",
                        len(self.__glyD["identifiers"]))
            return True
        return False

    def getIdentifiers(self):
        """Return a dictionary of related identifiers organized by branched entity id.

        Returns:
            (dict): {entityId: {'idType1': ids, 'idType2': ids}, ... }
        """
        try:
            return self.__glyD["identifiers"] if self.__glyD[
                "identifiers"] else {}
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return {}

    def __getMappingFilePath(self, fmt="json"):
        baseFileName = "branched_entity_glycan_identifier_map"
        fExt = ".json" if fmt == "json" else ".pic"
        fp = os.path.join(self.__dirPath, baseFileName + fExt)
        return fp

    def update(self, cfgOb, fmt="json", indent=3):
        """Update branched entity glycan accession mapping cache.

        Args:
            cfgOb (object): ConfigInfo() object instance

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            gU = GlycanUtils(cfgOb, self.__dirPath)
            eaD = gU.updateEntityAccessionMap()
            logger.info("Got branched entity glycan accession map (%d)",
                        len(eaD))
            #
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            self.__glyD = {
                "version": self.__version,
                "created": tS,
                "identifiers": eaD
            }
            #
            mappingFilePath = self.__getMappingFilePath(fmt=fmt)
            kwargs = {"indent": indent} if fmt == "json" else {}
            ok = self.__mU.doExport(mappingFilePath,
                                    self.__glyD,
                                    fmt=fmt,
                                    **kwargs)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reload(self):
        """Reload from the current cache file."""
        ok = False
        try:
            self.__glyD = self.__reload(fmt="json", useCache=True)
            ok = self.__glyD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __reload(self, fmt="json", useCache=True):
        mappingFilePath = self.__getMappingFilePath(fmt=fmt)
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        pcD = {"version": self.__version, "created": tS, "identifiers": {}}

        if useCache and self.__mU.exists(mappingFilePath):
            logger.info("reading cached path %r", mappingFilePath)
            pcD = self.__mU.doImport(mappingFilePath, fmt=fmt)
        return pcD
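
A usage sketch for GlycanProvider; cfgOb and the cache path are assumptions supplied by the caller:

def refreshGlycanMapping(cfgOb, cachePath="./CACHE"):
    # Hedged sketch: rebuild the branched entity glycan accession map and return the
    # per-entity identifier dictionary if the refreshed cache passes a minimal check.
    gP = GlycanProvider(cachePath=cachePath, useCache=True)
    ok = gP.update(cfgOb, fmt="json", indent=3)
    idD = gP.getIdentifiers() if ok and gP.testCache(minCount=1) else {}
    return ok, idD
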
class ChemCompIndexProvider(object):
    """Utilities to read and process an index of PDB chemical component definitions."""
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(self.__cachePath, "chem_comp")
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
        self.__ccIdxD = self.__reload(**kwargs)

    def getIndexFilePath(self):
        return os.path.join(
            self.__dirPath,
            "%s-idx-chemical-components.json" % self.__ccFileNamePrefix)

    def testCache(self, minCount=None, logSizes=False):
        if logSizes and self.__ccIdxD:
            logger.info("ccIdxD (%.2f MB)",
                        getObjSize(self.__ccIdxD) / 1000000.0)
        ok = (self.__ccIdxD and len(self.__ccIdxD) >= minCount) if minCount else self.__ccIdxD is not None
        return ok

    def matchMolecularFormulaRange(self, typeRangeD, matchSubset=False):
        """Find matching formula for the input atom type range query (evaluates min <= ff <= max).

        Args:
            typeRangeD (dict): dictionary of element ranges {'<element_name>': {'min': <int>, 'max': <int>}}
            matchSubset (bool, optional): test for formula subset (default: False)

        Returns:
            (list):  chemical component identifiers with matching formula (MatchResults)
        """
        rL = []
        try:
            if not typeRangeD:
                return rL
            myTypeRangeD = {k.upper(): v for k, v in typeRangeD.items()}
            queryTypeS = set(myTypeRangeD.keys())
            for ccId, idxD in self.__ccIdxD.items():
                tD = idxD["type-counts"]
                targetTypeS = set(tD.keys())
                if not matchSubset and targetTypeS != queryTypeS:
                    continue
                #
                if not queryTypeS.issubset(targetTypeS):
                    continue
                #
                match = True
                for atomType, rangeD in myTypeRangeD.items():
                    if atomType in tD:
                        # min <= ff <= max
                        if ("min" in rangeD and rangeD["min"] > tD[atomType]
                            ) or ("max" in rangeD
                                  and rangeD["max"] < tD[atomType]):
                            match = False
                            break
                    else:
                        match = False
                        break
                if match:
                    # logger.info("%s formula %r query %r", ccId, idxD["type-counts"], typeRangeD)
                    rL.append(
                        MatchResults(ccId=ccId,
                                     searchType="formula",
                                     formula=idxD["formula"]))
        except Exception as e:
            logger.exception("Failing for %r with %s", typeRangeD, str(e))
        return rL
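
    # A hypothetical formula-range query against the chemical component index (self.__ccIdxD):
    # find components with six to twelve carbons and one to four nitrogens. Here "provider"
    # stands for a ChemCompIndexProvider instance; element symbols are upper-cased internally,
    # so lower-case keys also work:
    #
    #   typeRangeD = {"C": {"min": 6, "max": 12}, "N": {"min": 1, "max": 4}}
    #   hits = provider.matchMolecularFormulaRange(typeRangeD, matchSubset=True)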

    def filterMinimumMolecularFormula(self, typeCountD):
        """Find molecules with the minimum formula composition for the input atom type range query (evaluates min <= ff).

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>': #}

        Returns:
            (list):  chemical component identifiers
        """
        rL = []
        try:
            if not typeCountD:
                return list(self.__ccIdxD.keys())

            typeQueryS = set(typeCountD.keys())
            for ccId, idxD in self.__ccIdxD.items():
                tD = idxD["type-counts"]
                #
                if not typeQueryS.issubset(tD):
                    continue
                match = True
                for atomType, minCount in typeCountD.items():
                    try:
                        if minCount > tD[atomType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                if match:
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL

    def filterMinimumFormulaAndFeatures(self, typeCountD, featureCountD):
        """Find molecules with the minimum formula and feature composition.

        Args:
            typeCountD (dict): dictionary of element minimum values {'<element_name>': #}
            featureCountD (dict): dictionary of feature minimum values {'<feature_name>': #}

        Returns:
            (list):  chemical component identifiers
        """
        rL = []
        try:
            if not typeCountD or not featureCountD:
                return list(self.__ccIdxD.keys())
            # ----
            featureQueryS = set(featureCountD.keys())
            typeQueryS = set(typeCountD.keys())
            #
            for ccId, idxD in self.__ccIdxD.items():
                tD = idxD["type-counts"]
                fD = idxD["feature-counts"]
                #
                if not typeQueryS.issubset(tD) or not featureQueryS.issubset(
                        fD):
                    continue

                match = True
                for atomType, minCount in typeCountD.items():
                    try:
                        if minCount > tD[atomType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break

                if not match:
                    continue
                #
                for featureType, minCount in featureCountD.items():
                    try:
                        if minCount > fD[featureType]:
                            match = False
                            break
                    except Exception:
                        match = False
                        break
                #
                if match:
                    rL.append(ccId)
        except Exception as e:
            logger.exception("Failing for %r with %s", typeCountD, str(e))
        return rL

    def getIndex(self):
        return self.__ccIdxD

    def getIdList(self):
        return list(self.__ccIdxD.keys()) if self.__ccIdxD else []

    def getMol(self, ccId):
        try:
            return self.__ccIdxD[ccId]
        except Exception as e:
            logger.debug("Get molecule %r failing with %s", ccId, str(e))
        return None

    def getSMILES(self, ccId, smiTypeList=None):

        smiTypeList = smiTypeList if smiTypeList else [
            "oe-iso-smiles", "oe-smiles", "acdlabs-smiles",
            "cactvs-iso-smiles", "cactvs-smiles"
        ]
        try:
            sL = []
            for smilesType in smiTypeList:
                if smilesType in self.__ccIdxD[ccId]:
                    sL.append(self.__ccIdxD[ccId][smilesType])
            return sL
        except Exception as e:
            logger.debug("Get SMILES for %r failing with %s", ccId, str(e))
        return []

    def __reload(self, **kwargs):
        """Reload or created index of PDB chemical components.

        Args:
            cachePath (str): path to the directory containing cache files
            ccIdxFileName (str): serialized chemical component data index file name


         Returns:
            (list): chemical component data containers
        """
        #
        logger.debug("kwargs %r", kwargs.items())
        ccIdxD = {}
        useCache = kwargs.get("useCache", True)
        molLimit = kwargs.get("molLimit", 0)

        ccIdxFilePath = self.getIndexFilePath()
        #
        if useCache and self.__mU.exists(ccIdxFilePath):
            _, fExt = os.path.splitext(ccIdxFilePath)
            ccIdxFormat = "json" if fExt == ".json" else "pickle"
            rdCcIdxD = self.__mU.doImport(ccIdxFilePath, fmt=ccIdxFormat)
            ccIdxD = {
                k: rdCcIdxD[k]
                for k in sorted(rdCcIdxD.keys())[:molLimit]
            } if molLimit else rdCcIdxD
        else:
            cmpKwargs = {
                k: v
                for k, v in kwargs.items()
                if k not in ["cachePath", "useCache", "molLimit"]
            }
            ccmP = ChemCompMoleculeProvider(cachePath=self.__cachePath,
                                            useCache=useCache,
                                            molLimit=molLimit,
                                            **cmpKwargs)
            ok = ccmP.testCache(minCount=molLimit, logSizes=True)
            if ok:
                molBuildType = cmpKwargs.get("molBuildType", "model-xyz")
                ccIdxD = self.__updateChemCompIndex(ccmP.getMolD(),
                                                    ccIdxFilePath,
                                                    molBuildType=molBuildType)
        #
        for idxD in ccIdxD.values():
            idxD["atom-types"] = set(idxD["type-counts"].keys()
                                     ) if "type-counts" in idxD else set()
            idxD["feature-types"] = set(idxD["feature-counts"].keys(
            )) if "feature-counts" in idxD else set()
        #
        return ccIdxD

    def __updateChemCompIndex(self,
                              ccObjD,
                              filePath,
                              molBuildType="model-xyz"):
        idxD = {}
        try:
            # Serialized chemical component data index file
            startTime = time.time()
            _, fExt = os.path.splitext(filePath)
            fileFormat = "json" if fExt == ".json" else "pickle"
            idxD = self.__buildChemCompIndex(ccObjD, molBuildType=molBuildType)
            ok = self.__mU.doExport(filePath, idxD, fmt=fileFormat)
            endTime = time.time()
            logger.info(
                "Storing %s with %d raw indexed definitions (status=%r) (%.4f seconds)",
                filePath, len(idxD), ok, endTime - startTime)
        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return idxD

    def __buildChemCompIndex(self,
                             cD,
                             molBuildType="model-xyz",
                             doFeatures=True):
        """Internal method return a dictionary of extracted chemical component descriptors and formula."""
        rD = {}
        try:
            quietFlag = True
            for _, dataContainer in cD.items():
                ccIt = iter(PdbxChemCompIt(dataContainer))
                cc = next(ccIt, None)
                ccId = cc.getId()
                formula = str(cc.getFormula()).replace(" ", "")
                ambiguousFlag = cc.getAmbiguousFlag().upper() in ["Y", "YES"]
                tch = cc.getFormalCharge()
                fcharge = int(tch) if tch and tch not in [".", "?"] else 0
                #
                logger.debug("ccId %r formula %r ambiguous %r fcharge %r",
                             ccId, formula, ambiguousFlag, fcharge)
                if fcharge:
                    sign = "+" if fcharge > 0 else "-"
                    mag = str(abs(fcharge)) if abs(fcharge) > 1 else ""
                    formula = formula + sign + mag
                #
                atIt = PdbxChemCompAtomIt(dataContainer)
                typeCounts = defaultdict(int)
                for at in atIt:
                    aType = at.getType().upper()
                    typeCounts[aType] += 1
                #
                rD[ccId] = {
                    "formula": formula,
                    "type-counts": typeCounts,
                    "ambiguous": ambiguousFlag,
                    "feature-counts": {}
                }
                desIt = PdbxChemCompDescriptorIt(dataContainer)
                for des in desIt:
                    desBuildType = des.getMolBuildType()
                    tS = des.getDescriptor()
                    descr = tS.strip() if tS else None
                    if not descr:
                        continue
                    if desBuildType in [
                            "oe-iso-smiles", "oe-smiles", "acdlabs-smiles",
                            "cactvs-iso-smiles", "cactvs-smiles", "inchi",
                            "inchikey"
                    ]:
                        rD[ccId][desBuildType] = descr
                    else:
                        logger.error("%s unexpected descriptor build type %r",
                                     ccId, desBuildType)
                if doFeatures:
                    oemf = OeMoleculeFactory()
                    if quietFlag:
                        oemf.setQuiet()
                    tId = oemf.setChemCompDef(dataContainer)
                    if tId != ccId:
                        logger.error(
                            "%s chemical component definition import error",
                            ccId)
                        continue
                    ok = oemf.build(molBuildType=molBuildType)
                    if ok:
                        rD[ccId]["feature-counts"] = oemf.getFeatureCounts()

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rD
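
A usage sketch for ChemCompIndexProvider; the cache path and the minimum-formula query values are illustrative assumptions:

def findSmallNitrogenousComponents(cachePath="./CACHE"):
    # Hedged sketch: load (or rebuild) the chemical component index and list the first few
    # component identifiers containing at least four carbons and two nitrogens, with SMILES.
    ccIdxP = ChemCompIndexProvider(cachePath=cachePath, useCache=True)
    if not ccIdxP.testCache(minCount=1):
        return []
    ccIdL = ccIdxP.filterMinimumMolecularFormula({"C": 4, "N": 2})
    return [(ccId, ccIdxP.getSMILES(ccId)) for ccId in ccIdL[:10]]
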
Example n. 13
0
class GlycanUtils:
    """Utilities for fetching and mapping glycan annotations."""

    def __init__(self, cfgOb, dirPath):
        self.__cfgOb = cfgOb
        self.__dirPath = dirPath
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        #

    def __getRawGlycanDetailsPath(self):
        return os.path.join(self.__dirPath, "pdb-raw-branched-entity-details.json")

    def getBranchedEntityDetails(self):
        """For branched entities, get BIRD mapping and WURCS details"""
        ok = False
        branchedEntityD = {}
        try:
            bEx = BranchedEntityExtractor(self.__cfgOb)
            branchedEntityD = bEx.getBranchedDetails()
            logger.info("Branched entity descriptor details count %d", len(branchedEntityD))
            detailsPath = self.__getRawGlycanDetailsPath()
            ok = bEx.exportBranchedEntityDetails(detailsPath, fmt="json")
            logger.info("Store raw branched entity data (%r) %s", ok, detailsPath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return branchedEntityD

    def __getGlycanAccessionMapPath(self):
        return os.path.join(self.__dirPath, "accession-wurcs-mapping.json")

    def fetchGlycanAccessionMap(self):
        mapD = {}
        accessionMapPath = self.__getGlycanAccessionMapPath()
        if self.__mU.exists(accessionMapPath):
            mapD = self.__mU.doImport(accessionMapPath, fmt="json")
        return mapD

    def storeGlycanAccessionMap(self, mapD):
        accessionMapPath = self.__getGlycanAccessionMapPath()
        ok = self.__mU.doExport(accessionMapPath, mapD, fmt="json", indent=3)
        return ok

    def updateEntityAccessionMap(self):
        """Update entity to glycan accession mapping

        Returns:
            dict: {entityId: {'glyTouCanId':... , 'prdId': ..., }, ... }
        """
        entityAccessionMapD = {}
        wurcsTupL = []
        uniqueWurcsD = {}
        accessionMapD = self.fetchGlycanAccessionMap()
        branchedEntityD = self.getBranchedEntityDetails()
        for entityId, iD in branchedEntityD.items():
            if iD["wurcs"] and iD["wurcs"] not in accessionMapD and iD["wurcs"] not in uniqueWurcsD:
                wurcsTupL.append((entityId, iD["wurcs"]))
                uniqueWurcsD.setdefault(iD["wurcs"], []).append(entityId)
        if wurcsTupL:
            tMap = self.getAccessionMapping(wurcsTupL)
            accessionMapD.update(tMap)
            self.storeGlycanAccessionMap(accessionMapD)
        #

        for entityId, iD in branchedEntityD.items():
            if iD["wurcs"] in accessionMapD:
                prdId = iD["prdId"] if iD["wurcs"] else None
                entityAccessionMapD[entityId] = {"glyTouCanId": accessionMapD[iD["wurcs"]][0], "prdId": prdId}
        return entityAccessionMapD

    def getAccessionMapping(self, wurcsTupL):
        """Fetch GlyTouCan accessions for the input WURCS desriptor list"""
        accessionMapD = {}
        logger.info("Fetching (%d) WURCS descriptors", len(wurcsTupL))
        baseUrl = "https://api.glycosmos.org"
        endPoint = "glytoucan/sparql/wurcs2gtcids"
        numDescriptors = len(wurcsTupL)
        for ii, (entityId, wurcs) in enumerate(wurcsTupL, 1):
            try:
                pD = {}
                pD["wurcs"] = wurcs
                uR = UrlRequestUtil()
                rDL, retCode = uR.post(baseUrl, endPoint, pD, returnContentType="JSON")
                logger.debug(" %r wurcs fetch result (%r) %r", entityId, retCode, rDL)
                if rDL:
                    for rD in rDL:
                        if "id" in rD:
                            accessionMapD.setdefault(wurcs, []).append(rD["id"])
                        else:
                            logger.info("%r fetch fails (%r) (%r) %r", entityId, retCode, wurcs, rDL)
                if ii % 5 == 0:
                    logger.info("Fetched %d/%d", ii, numDescriptors)
            except Exception as e:
                logger.exception("Failing for (%r) wurcs (%r) with %s", entityId, wurcs, str(e))
        return accessionMapD
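
A usage sketch for GlycanUtils; cfgOb and the working directory are assumptions supplied by the caller:

def mapBranchedEntities(cfgOb, dirPath="./CACHE/glycan"):
    # Hedged sketch: refresh the entity-to-GlyTouCan accession mapping and return the
    # {entityId: {"glyTouCanId": ..., "prdId": ...}, ...} dictionary described above.
    gU = GlycanUtils(cfgOb, dirPath)
    return gU.updateEntityAccessionMap()
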
class EcodClassificationProvider(StashableBase):
    """Extract ECOD domain assignments, term descriptions and ECOD classification hierarchy
    from ECOD flat files.

    http://prodata.swmed.edu/ecod/

    See:
    H. Cheng, R. D. Schaeffer, Y. Liao, L. N. Kinch, J. Pei, S. Shi, B. H. Kim, N. V. Grishin. (2014)
    ECOD: An evolutionary classification of protein domains. PLoS Comput Biol 10(12): e1003926.

    Linking details:  http://prodata.swmed.edu/ecod/complete/domain/<domainId>

                      http://prodata.swmed.edu/ecod/complete/domain/e6sl5G1
    """

    #
    # --
    def __init__(self, cachePath, useCache, **kwargs):
        self.__cachePath = cachePath
        self.__useCache = useCache
        dirName = "ecod"
        super(EcodClassificationProvider, self).__init__(self.__cachePath, [dirName])
        self.__dirPath = os.path.join(cachePath, "ecod")
        self.__version = None
        #
        urlTarget = kwargs.get("ecodTargetUrl", "http://prodata.swmed.edu/ecod/distributions/ecod.latest.domains.txt")
        urlBackup = kwargs.get("ecodUrlBackupPath", "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/ECOD/ecod.latest.domains.txt.gz")
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__pD, self.__nD, self.__ntD, self.__pdbD = self.__reload(urlTarget, urlBackup, self.__dirPath, useCache=useCache)

    def testCache(self):
        logger.info("ECOD Lengths nD %d pdbD %d", len(self.__nD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pdbD) > 5000):
            return True
        return False

    def getVersion(self):
        return self.__version

    # --
    def getFamilyIds(self, pdbId, authAsymId):
        try:
            return list(set([tup[1] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getDomainIds(self, pdbId, authAsymId):
        try:
            return list(set([tup[0] for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyNames(self, pdbId, authAsymId):
        try:
            return list(set([self.getName(tup[1]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]))
        except Exception as e:
            logger.exception("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getFamilyResidueRanges(self, pdbId, authAsymId):
        try:
            # pdbD.setdefault((pdbId, authAsymId), []).append((domId, fId, authAsymId, authSeqBeg, authSeqEnd))
            return [(tup[0], tup[1], tup[2], tup[3], tup[4]) for tup in self.__pdbD[(pdbId.lower(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId, str(e))
        return []

    def getName(self, domId):
        try:
            return self.__nD[domId].split("|")[0]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getNameType(self, domId):
        qD = {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"}
        try:
            return qD[self.__ntD[domId]]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getIdLineage(self, domId):
        pList = []
        try:
            pList.append(domId)
            if domId == 0:
                return pList
            pt = self.__pD[domId]
            while (pt is not None) and (pt != 0):
                pList.append(pt)
                pt = self.__pD[pt]
        except Exception as e:
            logger.exception("Failing for %r with %s", domId, str(e))
        #
        pList.reverse()
        return pList

    def getNameLineage(self, domId):
        try:
            nL = []
            for dId in self.getIdLineage(domId):
                tN = self.getName(dId)
                tN = tN if tN else "Unnamed"
                nL.append(tN)
            return nL
        except Exception as e:
            logger.exception("Failing for %r with %s", domId, str(e))
        return None

    def getTreeNodeList(self):
        return self.__exportTreeNodeList(self.__pD)

    def __getDomainFileName(self):
        pyVersion = sys.version_info[0]
        fn = "ecod_domains-py%s.pic" % str(pyVersion)
        return fn

    def __reload(self, urlTarget, urlBackup, ecodDirPath, useCache=True):
        pD, nD, ntD, pdbD = {}, {}, {}, {}
        fn = self.__getDomainFileName()
        ecodDomainPath = os.path.join(ecodDirPath, fn)
        self.__mU.mkdir(ecodDirPath)
        #
        if useCache and self.__mU.exists(ecodDomainPath):
            sD = self.__mU.doImport(ecodDomainPath, fmt="pickle")
            logger.debug("ECOD domain length %d", len(sD))
            nD = sD["names"]
            ntD = sD["nametypes"]
            pD = sD["parents"]
            pdbD = sD["assignments"]
            self.__version = sD["version"]
        elif not useCache:
            minLen = 1000
            logger.info("Fetch ECOD name and domain assignment data from primary data source %s", urlTarget)
            nmL = self.__fetchFromSource(urlTarget)
            if not nmL:
                nmL = self.__fetchFromSource(urlBackup)
            #
            logger.info("ECOD raw file length (%d)", len(nmL))
            ok = False
            pD, nD, ntD, pdbD = self.__extractDomainHierarchy(nmL)
            #
            tS = datetime.datetime.now().isoformat()
            vS = self.__version
            sD = {"version": vS, "created": tS, "names": nD, "nametypes": ntD, "parents": pD, "assignments": pdbD}
            if (len(nD) > minLen) and (len(pD) > minLen):
                ok = self.__mU.doExport(ecodDomainPath, sD, fmt="pickle")
            logger.debug("Cache save status %r", ok)
            #
        return pD, nD, ntD, pdbD

    def __fetchFromSource(self, urlTarget):
        """Fetch the classification names and domain assignments from the ECOD repo."""
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        fp = os.path.join(self.__dirPath, fn)
        if not fU.exists(fp):
            fU.get(urlTarget, fp)
        #
        with open(fp, "r", encoding="utf-8") as ifh:
            line = ifh.readline()
            line = ifh.readline()
            line = ifh.readline()
            ff = line[:-1].split()
            self.__version = ff[-1]
        #
        nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
        fU.remove(fp)
        #
        return nmL

    def __extractDomainHierarchy(self, nmL):
        """
        #/data/ecod/database_versions/v280/ecod.develop280.domains.txt
        #ECOD version develop280
        #Domain list version 1.6
        #Grishin lab (http://prodata.swmed.edu/ecod)
        #uid	ecod_domain_id	manual_rep	f_id	pdb	chain	pdb_range	seqid_range	unp_acc	arch_name	x_name	h_name	t_name	f_name	asm_status	ligand
        002728551	e7d2xA1	AUTO_NONREP	1.1.1	7d2x	A	A:-3-183	A:20-206	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        002728572	e7d5aA2	AUTO_NONREP	1.1.1	7d5a	A	A:-3-183	A:20-206	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        002726563	e7b1eA1	AUTO_NONREP	1.1.1	7b1e	A	A:46P-183	A:14-199	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        002726573	e7b1pA2	AUTO_NONREP	1.1.1	7b1p	A	A:47P-183	A:15-199	NO_UNP	beta barrels	"cradle loop barrel"	"RIFT-related"	"acid protease"	F_UNCLASSIFIED
        """
        assignD = {}
        pD = {}
        ntD = {}
        hD = {}
        pIdD = {}
        nmD = {}
        #
        logger.info("Length of input ECOD name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split("\t")
            # uId = ff[0]
            # ecodId is the linkable identifier -
            ecodId = ff[1]
            entryId = ff[4].lower()
            authAsymId = ff[5]
            resRange = ff[6]
            #
            #  There are no unique identifiers published for the internal elements of the hierarchy
            #   so these are assigned here similar to scop -   There are also many unnamed nodes
            #   that are conventionally filled in from the leaf levels of the tree...
            #  {"A": "Architecture", "X": "Possible Homology", "H": "Homology", "T": "Topology", "F": "Family"}
            aGroupOrg = "A: " + ff[9].replace('"', "")
            xGroupOrg = "X: " + ff[10].replace('"', "")
            hGroupOrg = "H: " + ff[11].replace('"', "")
            tGroupOrg = "T: " + ff[12].replace('"', "")
            fGroupOrg = "F: " + ff[13].replace('"', "")
            if hGroupOrg == "H: NO_H_NAME":
                # hGroupOrg = tGroupOrg  + "|(NO_H)"
                hGroupOrg = "H: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_H)"
            if xGroupOrg == "X: NO_X_NAME":
                if ff[11].replace('"', "") == "NO_H_NAME":
                    # xGroupOrg = hGroupOrg + "|(NO_X)"
                    xGroupOrg = "X: " + ff[12].replace('"', "") + " (From Topology)" + "|(NO_X)"
                else:
                    xGroupOrg = "X: " + ff[11].replace('"', "") + " (From Homology)" + "|(NO_X)"
                #
            fGroupOrg = fGroupOrg if ff[13].replace('"', "") != "F_UNCLASSIFIED" else "Unmapped domain of " + tGroupOrg
            #
            # Remove redundancy in names and assign unique ids
            #
            aGroup = aGroupOrg
            xGroup = xGroupOrg + "|" + aGroupOrg
            hGroup = hGroupOrg + "|" + xGroupOrg + "|" + aGroupOrg
            tGroup = tGroupOrg + "|" + hGroupOrg + "|" + xGroupOrg
            fGroup = fGroupOrg + "|" + tGroupOrg
            #
            hD.setdefault("A", set()).add(aGroup)
            hD.setdefault("X", set()).add(xGroup)
            hD.setdefault("H", set()).add(hGroup)
            hD.setdefault("T", set()).add(tGroup)
            hD.setdefault("F", set()).add(fGroup)
            aId = 100000 + len(hD["A"])
            xId = 200000 + len(hD["X"])
            hId = 300000 + len(hD["H"])
            tId = 400000 + len(hD["T"])
            fId = 500000 + len(hD["F"])
            #
            #
            if xGroup in pD and pD[xGroup] != aGroup:
                logger.error("skipping %r multiple parents for xGroup %r  %r and %r ", ecodId, xGroup, pD[xGroup], aGroup)
                continue
            #
            if hGroup in pD and pD[hGroup] != xGroup:
                logger.error("skipping %r multiple parents for hGroup %r  %r and %r ", ecodId, hGroup, pD[hGroup], xGroup)
                continue
            #
            if tGroup in pD and pD[tGroup] != hGroup:
                logger.error("skipping %r multiple parents for tGroup %r  %r and %r ", ecodId, tGroup, pD[tGroup], hGroup)
                continue
            #
            if fGroup in pD and pD[fGroup] != tGroup:
                logger.error("skipping %r multiple parents for fGroup %r  %r and %r ", ecodId, fGroup, pD[fGroup], tGroup)
                continue

            if xId in pIdD and pIdD[xId] != aId:
                logger.error("skipped %r multiple parents for xId %r  %r and %r ", ecodId, xId, pIdD[xId], aId)
            #
            if hId in pIdD and pIdD[hId] != xId:
                logger.error("skipped %r multiple parents for hId %r  %r and %r ", ecodId, hId, pIdD[hId], xId)
            #
            if tId in pIdD and pIdD[tId] != hId:
                logger.error("skipped %r multiple parents for tId %r  %r and %r ", ecodId, tId, pIdD[tId], hId)
            #
            if fId in pIdD and pIdD[fId] != tId:
                logger.error("skipped %r multiple parents for fId %r  %r and %r ", ecodId, fId, pIdD[fId], tId)

            #
            pIdD[aId] = 0
            pIdD[xId] = aId
            pIdD[hId] = xId
            pIdD[tId] = hId
            pIdD[fId] = tId
            #
            nmD[aId] = aGroupOrg
            nmD[xId] = xGroupOrg
            nmD[hId] = hGroupOrg
            nmD[tId] = tGroupOrg
            nmD[fId] = fGroupOrg
            #
            ntD[aId] = "A"
            ntD[xId] = "X"
            ntD[hId] = "H"
            ntD[tId] = "T"
            ntD[fId] = "F"
            rL = self.__parseRanges(resRange)
            if (entryId, authAsymId) not in assignD:
                assignD[(entryId, authAsymId)] = [(ecodId, fId, t[0], t[1], t[2]) for t in rL]
            else:
                for t in rL:
                    assignD[(entryId, authAsymId)].append((ecodId, fId, t[0], t[1], t[2]))
            #
        return pIdD, nmD, ntD, assignD

    def __parseRanges(self, rS):
        rL = []
        authAsymId = authSeqBeg = authSeqEnd = None
        try:
            tSL = rS.split(",")
            for tS in tSL:
                fL = tS.split(":")
                authAsymId = fL[0]
                rS = fL[1]
                if rS[0] == "-":
                    authSeqBeg = -int(rS[1:].split("-")[0])
                    authSeqEnd = int(rS[1:].split("-")[1])
                else:
                    authSeqBeg = int(rS.split("-")[0])
                    authSeqEnd = int(rS.split("-")[1])
            rL.append((authAsymId, authSeqBeg, authSeqEnd))
        except Exception:
            pass
        return rL
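
    # Worked example for __parseRanges() above, using pdb_range values from the sample rows
    # in the __extractDomainHierarchy() docstring (illustrative only):
    #   "A:-3-183"  -> [("A", -3, 183)]   (a leading "-" marks a negative author residue number)
    #   "A:46P-183" -> insertion-code qualified positions such as "46P" raise in the int()
    #                  conversions and are swallowed by the bare except clause.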

    def __exportTreeNodeList(self, pD):
        """Create node list from name dictionary and lineage dictionaries."""
        #
        rootId = 0
        pL = [rootId]
        #
        logger.info("pD %d pL %r", len(pD), pL)
        # --
        #
        # create child dictionary
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.info("cD %d", len(cD))
        #
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.debug("No children for Ecod tId %s", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = self.getName(tId)
            ptId = pD[tId] if tId in pD else None
            lL = self.getIdLineage(tId)[1:]
            #
            if tId == rootId:
                continue
            elif ptId == rootId:
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                dD = {"id": str(tId), "name": displayName, "parents": [str(ptId)], "depth": len(lL)}
            dL.append(dD)

        return dL
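
A usage sketch for EcodClassificationProvider; the cache path is an assumption and the example entry/chain come from the sample rows in the __extractDomainHierarchy() docstring:

def summarizeEcodAssignments(cachePath="./CACHE"):
    # Hedged sketch: load (or rebuild) the ECOD classification cache and report the domain and
    # family assignments for one chain.
    ecodP = EcodClassificationProvider(cachePath, useCache=True)
    if not ecodP.testCache():
        return {}
    pdbId, authAsymId = "7d2x", "A"
    return {
        "version": ecodP.getVersion(),
        "domainIds": ecodP.getDomainIds(pdbId, authAsymId),
        "familyNames": ecodP.getFamilyNames(pdbId, authAsymId),
    }
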
Example n. 15
0
    def search(self, dataList, procName, optionsD, workingDir):
        """Worker method to execute a shell to search CCDC for the input mol2 path list.

        Args:
            dataList (list): list of mol2 file paths to be searched
            procName (str): processName
            optionsD (dict): dictionary of options
            workingDir (str): path to working directory (not used)

        Returns:
            (successList, resultList, []): success and result lists of mol2 paths with CCDC matches
        """
        resultPath = optionsD["resultPath"]
        searchType = optionsD["searchType"]
        pythonRootPath = optionsD["pythonRootPath"]
        csdHome = optionsD["csdHome"]
        _ = workingDir
        resultList = []
        startTime = time.time()
        logger.info("starting %s at %s", procName,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        #
        try:
            stopPath = os.path.join(resultPath, "STOP")
            logger.info("%s search list length %d", procName, len(dataList))
            if self.__checkStop(stopPath):
                logger.info("%s stopping", procName)
                return resultList, resultList, []
            #
            queryListFilePath = os.path.join(resultPath, procName,
                                             "queryFileList.list")
            mU = MarshalUtil()
            ok = mU.doExport(queryListFilePath, dataList, fmt="list")
            if not ok:
                return resultList, resultList, []
            #
            exU = ExecUtils()
            logger.info("%s executing shell for %s", procName,
                        queryListFilePath)
            cmdPath = os.path.join(pythonRootPath, "bin", "ccdc_search_cli")
            hitListPath = os.path.join(resultPath, procName, "hitList.list")
            logPath = os.path.join(resultPath, procName, "execlog.log")

            logger.info("cmdPath %r", cmdPath)
            ok = exU.runShell(
                "%s --mol_list_path %s --result_path %s --search_type %s --csdhome %s --hit_list_path %s"
                % (cmdPath, queryListFilePath, resultPath, searchType, csdHome,
                   hitListPath),
                outPath=logPath,
                outAppend=False,
                timeOut=60,
                suppressStderr=False,
            )
            #
            if ok and mU.exists(hitListPath):
                resultList = mU.doImport(hitListPath, fmt="list")
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        endTime = time.time()
        logger.info("%s (result len %d) completed at %s (%.2f seconds)",
                    procName, len(resultList),
                    time.strftime("%Y %m %d %H:%M:%S",
                                  time.localtime()), endTime - startTime)
        return resultList, resultList, []
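
A hedged sketch of how a worker exposing the search() method above might be dispatched with MultiProcUtil; the worker class name CcdcSearchWorker and the module path are assumptions, while the MultiProcUtil calls mirror usage shown later in this document:

# Assumed import and class name; only the MultiProcUtil calls mirror usage shown elsewhere here.
from rcsb.utils.multiproc.MultiProcUtil import MultiProcUtil


def runCcdcSearch(mol2PathList, resultPath, searchType, pythonRootPath, csdHome, numProc=4, chunkSize=5):
    # CcdcSearchWorker is a hypothetical wrapper class holding the search() worker method above.
    worker = CcdcSearchWorker()
    mpu = MultiProcUtil(verbose=True)
    mpu.setOptions(optionsD={"resultPath": resultPath, "searchType": searchType,
                             "pythonRootPath": pythonRootPath, "csdHome": csdHome})
    mpu.set(workerObj=worker, workerMethod="search")
    ok, failList, resultLists, _ = mpu.runMulti(dataList=mol2PathList, numProc=numProc, numResults=1, chunkSize=chunkSize)
    return ok, (resultLists[0] if resultLists else []), failList
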
class LigandNeighborMappingProvider(StashableBase):
    """Accessors for essential ligand neighbor mapping details associated with polymer and branched
    entity instances."""

    def __init__(self, cachePath, useCache=True):
        #
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__dirName = "ligand-neighbor-mapping"
        super(LigandNeighborMappingProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__mapD = self.__reload(self.__dirPath, useCache)
        #

    def testCache(self, minCount=0):
        logger.info("Cached ligand neighbor mapping count %d", len(self.__mapD["mapping"]) if "mapping" in self.__mapD else 0)
        if minCount == 0 or self.__mapD and "mapping" in self.__mapD and len(self.__mapD["mapping"]) >= minCount:
            return True
        else:
            return False

    def getLigandNeighbors(self, rcsbEntityId):
        """Get the unique list of ligand neighbors for the input polymer or branched entity instance.

        Args:
            rcsbEntityId (str): entryId '_' entityId

        Returns:
            list: [chem_comp_id, ... ]
        """
        try:
            return list(set([t[0] for t in self.__mapD["mapping"][rcsbEntityId.upper()]]))
        except Exception:
            return []

    def __reload(self, dirPath, useCache):
        startTime = time.time()
        retD = {}
        ok = False
        mappingPath = self.__getMappingDataPath()
        #
        logger.info("useCache %r mappingPath %r", useCache, mappingPath)
        if useCache and self.__mU.exists(mappingPath):
            retD = self.__mU.doImport(mappingPath, fmt="json")
            ok = True
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        num = len(retD["mapping"]) if "mapping" in retD else 0
        logger.info("Completed ligand mapping reload (%d) with status (%r) at %s (%.4f seconds)", num, ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return retD

    def __getMappingDataPath(self):
        return os.path.join(self.__dirPath, "ligand-neighbor-mapping-data.json")

    def fetchLigandNeighborMapping(self, cfgOb):
        """Fetch ligand neighbor mapping details

        Args:
            cfgOb (obj): instance configuration class ConfigUtil()

        Returns:
            bool: True for success or False otherwise
        """
        try:
            lnmEx = LigandNeighborMappingExtractor(cfgOb)
            lnD = lnmEx.getLigandNeighbors()
            fp = self.__getMappingDataPath()
            tS = datetime.datetime.now().isoformat()
            vS = datetime.datetime.now().strftime("%Y-%m-%d")
            ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "mapping": lnD}, fmt="json", indent=3)
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
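
An illustrative usage sketch for the provider above (the cache path and entity identifier are placeholders):

# Hypothetical cache location and entity identifier, for illustration only.
lnmP = LigandNeighborMappingProvider(cachePath="CACHE", useCache=True)
if lnmP.testCache(minCount=1):
    # entryId '_' entityId, per the getLigandNeighbors() docstring
    ccIdL = lnmP.getLigandNeighbors("4HHB_1")
    print("Ligand neighbors:", ccIdL)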
class SAbDabTargetFeatureProvider(StashableBase):
    """Accessors for Thera-SAbDab(Therapeutic Structural Antibody Database) target features."""

    # Link out using the INN therapeutic name -
    # http://opig.stats.ox.ac.uk/webapps/newsabdab/therasabdab/search/?therapeutic=Coltuximab
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirName = "SAbDab-features"
        super(SAbDabTargetFeatureProvider, self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, useCache)
        #

    def testCache(self, minCount=500):
        logger.info(
            "Therapeutic SAbDab feature count %d Assignment count %d",
            len(self.__fD["features"]) if "features" in self.__fD else 0,
            len(self.__fD["assignments"]) if "assignments" in self.__fD else 0,
        )
        if self.__fD and "features" in self.__fD and len(self.__fD["features"]) > minCount and "assignments" in self.__fD and len(self.__fD["assignments"]):
            return True
        else:
            return False

    def getVersion(self):
        try:
            return self.__fD["version"]
        except Exception:
            pass
        return None

    def hasFeatures(self, rcsbEntityId):
        return rcsbEntityId.upper() in self.__fD["features"]

    def getFeatures(self, rcsbEntityId):
        try:
            return self.__fD["features"][rcsbEntityId.upper()]
        except Exception:
            return []

    def getAssignment(self, instanceId, featureKey):
        """Return the value of the key feature for the input instance identifier.

        Args:
            instanceId (str): instance identifier '<pdbId>.<authAsymId>'
            featureKey (str): assignment feature key, one of: pdb|Hchain|Lchain|model|antigen_chain|antigen_type|
                              antigen_het_name|antigen_name|heavy_subclass|light_subclass|light_ctype

        Returns:
            str:  feature value or None
        """
        fVal = None
        try:
            fVal = self.__fD["assignments"][instanceId][featureKey]
        except Exception:
            fVal = None
        return fVal

    def hasAssignment(self, instanceId):
        """Return if assignment data is available for the input instance.

        Args:
            instanceId (str): instance identifier '<pdbId>.<authAsymId>'

        Returns:
            bool: True for success or False otherwise
        """
        return instanceId in self.__fD["assignments"]

    def __getFeatureDataPath(self):
        return os.path.join(self.__dirPath, "sabdab-feature-data.json")

    def reload(self):
        self.__fD = self.__reload(self.__dirPath, True)
        return True

    def __reload(self, dirPath, useCache):
        startTime = time.time()
        fD = {}

        ok = False
        featurePath = self.__getFeatureDataPath()
        #
        logger.info("useCache %r featurePath %r", useCache, featurePath)
        if useCache and self.__mU.exists(featurePath):
            fD = self.__mU.doImport(featurePath, fmt="json")
            ok = True if fD else False
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), time.time() - startTime)
        return fD

    def buildFeatureList(self, sequenceMatchFilePath):
        """Build polymer entity feature list for the matching entities in the input sequence match file.

        Args:
            sequenceMatchFilePath (str): sequence match output file path

        Returns:
            bool: True for success or False otherwise

        """
        rDL = []
        stP = SAbDabTargetProvider(cachePath=self.__cachePath, useCache=False)
        mD = self.__mU.doImport(sequenceMatchFilePath, fmt="json")
        #
        provenanceSource = "SAbDab"
        refScheme = "PDB entity"
        assignVersion = stP.getAssignmentVersion()
        #
        # - sort out if we match light and heavy chains
        #
        iD = {}
        fullMatchD = {}
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            # Tanezumab|therapeutic|light|chain
            thName = qCmtD["therapeutic"]
            chainType = qCmtD["chain"]
            for matchD in matchDL:
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                iD[(thName, chainType, entryId)] = entityId
        logger.info("Match index length (%d)", len(iD))
        for (thName, chainType, entryId), entityId in iD.items():
            if chainType == "light":
                continue
            if (thName, "light", entryId) in iD:
                fullMatchD[(thName, "heavy", entryId, entityId)] = True
                lEntityId = iD[(thName, "light", entryId)]
                fullMatchD[(thName, "light", entryId, lEntityId)] = True
        logger.info("Antibody entity match length (%d)", len(fullMatchD))
        #
        # - Add features for full matches -
        for queryId, matchDL in mD.items():
            qCmtD = self.__decodeComment(queryId)
            # Tanezumab|therapeutic|light|chain
            thName = qCmtD["therapeutic"]
            chainType = qCmtD["chain"]
            #
            for matchD in matchDL:
                fpL = []
                if "alignedRegions" in matchD:
                    fpL = [{"beg_seq_id": arD["targetBegin"], "end_seq_id": arD["targetEnd"]} for arD in matchD["alignedRegions"]]
                else:
                    fpL = [{"beg_seq_id": matchD["targetBegin"], "end_seq_id": matchD["targetEnd"]}]
                #
                tCmtD = self.__decodeComment(matchD["target"])
                entryId = tCmtD["entityId"].split("_")[0]
                entityId = tCmtD["entityId"].split("_")[1]
                if (thName, chainType, entryId, entityId) not in fullMatchD:
                    continue
                ii = 1
                for fType, fKy in [
                    ("SABDAB_ANTIBODY_NAME", "antibodyName"),
                    ("SABDAB_ANTIBODY_FORMAT", "antibodyFormat"),
                    ("SABDAB_ANTIBODY_CH1_ISOTYPE", "ch1IsoType"),
                    ("SABDAB_ANTIBODY_LIGHT_CHAIN_TYPE", "VD_LC"),
                    ("SABDAB_ANTIBODY_TARGET", "target"),
                ]:
                    if fType == "Antibody_Light_Chain_Type" and chainType == "heavy":
                        continue
                    fVL = stP.getFeatures(thName, fKy)
                    if not fVL:
                        continue
                    for fV in fVL:
                        rD = {
                            "entry_id": entryId,
                            "entity_id": entityId,
                            "type": fType,
                            "feature_id": thName + "_" + chainType + "_" + str(ii),
                            "name": fV,
                            "provenance_source": provenanceSource,
                            "reference_scheme": refScheme,
                            "assignment_version": assignVersion,
                            "feature_positions": fpL,
                        }
                        rDL.append(rD)
                        ii += 1
        #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "_" + rD["entity_id"]
            qD.setdefault(eId, []).append(rD)
        #
        logger.info("Antibody matches (%d)", len(qD))
        #
        fp = self.__getFeatureDataPath()
        tS = datetime.datetime.now().isoformat()
        vS = datetime.datetime.now().strftime("%Y-%m-%d")
        ok = self.__mU.doExport(fp, {"version": vS, "created": tS, "features": qD, "assignments": stP.getAssignments()}, fmt="json", indent=3)
        return ok

    def __decodeComment(self, comment, separator="|"):
        dD = {}
        try:
            ti = iter(comment.split(separator))
            dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
        except Exception:
            pass
        return dD
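
A short standalone sketch (not from the source) of the pairwise iterator trick used by __decodeComment() above:

comment = "Tanezumab|therapeutic|light|chain"
ti = iter(comment.split("|"))
# zip(ti, ti) pairs consecutive tokens as (value, key), (value, key), ...
dD = {tup[1]: tup[0] for tup in zip(ti, ti)}
print(dD)  # {'therapeutic': 'Tanezumab', 'chain': 'light'}
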
Example n. 18
class GlyGenProvider(StashableBase):
    """Fetch glycans and glycoproteins available in the GlyGen.org resource.

    GlyGen glycan link template -
          https://glygen.org/glycan/G28882EF

    Glycoprotein link template -
          https://www.glygen.org/protein/Q658T7
    """
    def __init__(self, **kwargs):
        #
        dirName = "glygen"
        cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(cachePath, dirName)
        super(GlyGenProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        baseUrl = kwargs.get(
            "glygenBasetUrl",
            "https://data.glygen.org/ln2data/releases/data/v-1.12.3/reviewed/")
        fallbackUrl = kwargs.get(
            "glygenFallbackUrl",
            "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/glygen/"
        )
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__glycanD = self.__reloadGlycans(baseUrl,
                                              fallbackUrl,
                                              self.__dirPath,
                                              useCache=useCache)
        self.__glycoproteinD = self.__reloadGlycoproteins(baseUrl,
                                                          fallbackUrl,
                                                          self.__dirPath,
                                                          useCache=useCache)

    def testCache(self, minGlycanCount=20000, minGlycoproteinCount=64000):
        #
        logger.info("GlyGen glycan list (%d) glycoprotein list (%d)",
                    len(self.__glycanD), len(self.__glycoproteinD))
        if self.__glycanD and len(
                self.__glycanD
        ) > minGlycanCount and self.__glycoproteinD and len(
                self.__glycoproteinD) > minGlycoproteinCount:
            return True
        return False

    def hasGlycan(self, glyTouCanId):
        try:
            return glyTouCanId in self.__glycanD
        except Exception:
            return False

    def hasGlycoprotein(self, uniProtId):
        try:
            return uniProtId in self.__glycoproteinD
        except Exception:
            return False

    def getGlycans(self):
        return self.__glycanD

    def getGlycoproteins(self):
        return self.__glycoproteinD

    def __reloadGlycans(self, baseUrl, fallbackUrl, dirPath, useCache=True):
        gD = {}
        logger.debug("Using dirPath %r", dirPath)
        self.__mU.mkdir(dirPath)
        #
        myDataPath = os.path.join(dirPath, "glygen-glycan-list.json")
        if useCache and self.__mU.exists(myDataPath):
            gD = self.__mU.doImport(myDataPath, fmt="json")
            logger.debug("GlyGen glycan data length %d", len(gD))
        elif not useCache:
            logger.debug(
                "Fetch GlyGen glycan data from primary data source %s",
                baseUrl)
            endPoint = os.path.join(baseUrl, "glycan_masterlist.csv")
            #
            logger.info("Fetch GlyGen glycan data from primary data source %s",
                        endPoint)
            rawPath = os.path.join(dirPath, "glycan_masterlist.csv")
            fU = FileUtil()
            ok = fU.get(endPoint, rawPath)
            logger.debug("Fetch GlyGen glycan data status %r", ok)
            if not ok:
                endPoint = os.path.join(fallbackUrl, "glycan_masterlist.csv")
                ok = fU.get(endPoint, rawPath)
                logger.info("Fetch fallback GlyGen glycan data status %r", ok)
            #
            if ok:
                gD = self.__parseGlycanList(rawPath)
                ok = self.__mU.doExport(myDataPath, gD, fmt="json")
                logger.info("Exported GlyGen glycan list (%d) (%r) %s",
                            len(gD), ok, myDataPath)
            #
        return gD

    def __parseGlycanList(self, filePath):
        gD = {}
        row = None
        try:
            rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list")
            logger.debug("Glycan list length (%d)", len(rowL))
            logger.debug("Row 0 %r", rowL[0])
            for row in rowL[1:]:
                gD[row[0]] = row[1]
        except Exception as e:
            logger.exception("Failing for %r (%r) with %s", filePath, row,
                             str(e))
        return gD

    def __reloadGlycoproteins(self,
                              baseUrl,
                              fallbackUrl,
                              dirPath,
                              useCache=True):
        gD = {}
        logger.debug("Using dirPath %r", dirPath)
        self.__mU.mkdir(dirPath)
        #
        myDataPath = os.path.join(dirPath, "glygen-glycoprotein-list.json")
        if useCache and self.__mU.exists(myDataPath):
            gD = self.__mU.doImport(myDataPath, fmt="json")
            logger.debug("GlyGen glycoprotein data length %d", len(gD))
        else:
            for fn in [
                    "sarscov1_protein_masterlist.csv",
                    "sarscov2_protein_masterlist.csv",
                    "hcv1b_protein_masterlist.csv",
                    "hcv1a_protein_masterlist.csv",
                    "human_protein_masterlist.csv",
                    "mouse_protein_masterlist.csv",
                    "rat_protein_masterlist.csv",
            ]:
                logger.debug(
                    "Fetch GlyGen glycoprotein data from primary data source %s",
                    baseUrl)
                endPoint = os.path.join(baseUrl, fn)
                #
                logger.debug(
                    "Fetch GlyGen glycoprotein data from primary data source %s",
                    endPoint)
                rawPath = os.path.join(dirPath, fn)
                fU = FileUtil()
                ok = fU.get(endPoint, rawPath)
                logger.debug("Fetch GlyGen glycoprotein data status %r", ok)
                if not ok:
                    endPoint = os.path.join(fallbackUrl, fn)
                    ok = fU.get(endPoint, rawPath)
                    logger.info("Fetch fallback GlyGen data status %r", ok)
                #
                if ok:
                    tD = self.__parseGlycoproteinList(rawPath)
                    gD.update(tD)
            #
            ok = self.__mU.doExport(myDataPath, gD, fmt="json")
            logger.info("Exported GlyGen glycoprotein list (%d) (%r) %s",
                        len(gD), ok, myDataPath)
        #
        return gD

    def __parseGlycoproteinList(self, filePath):
        gD = {}
        try:
            rowL = self.__mU.doImport(filePath, fmt="csv", rowFormat="list")
            for row in rowL[1:]:
                ff = row[0].split("-")
                gD[ff[0]] = ff[1]
        except Exception as e:
            logger.exception("Failing for %r with %s", filePath, str(e))
        return gD
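
An illustrative usage sketch for GlyGenProvider above; the accessions come from the class docstring link templates, and the cache path is a placeholder:

ggP = GlyGenProvider(cachePath="CACHE", useCache=True)
if ggP.testCache():
    print(ggP.hasGlycan("G28882EF"))      # GlyTouCan accession from the docstring
    print(ggP.hasGlycoprotein("Q658T7"))  # UniProt accession from the docstring
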
Example n. 19
    def search(self,
               queryTargetId,
               queryTargetPath,
               resultPath,
               normalizeFlag=True,
               maxHits=50,
               searchType="similarity",
               suppressMetals=False):
        """Search the CCDC database for similar or substructure matches for the input query molecule.

        Args:
            queryTargetId (str): query identifier
            queryTargetPath (str): path to the query molfile (mol, sdf, mol2)
            resultPath (str): output path to match results
            normalizeFlag (bool, optional): do standard perceptions on matching molecules. Defaults to True.
            maxHits (int, optional): maximum number of matches to return. Defaults to 50.
            searchType (str, optional): search mode (substructure, similarity). Defaults to "similarity".
            suppressMetals (bool, optional): filter structures containing metals. Defaults to False.

        Returns:
            (int): number of matches
        """

        mU = MarshalUtil()
        logger.info("Start search for target %s path %s result path %s",
                    queryTargetId, queryTargetPath, resultPath)
        #
        summaryList = []
        #
        targetDirPath = os.path.dirname(queryTargetPath)
        cifTargetPath = os.path.join(targetDirPath, queryTargetId + ".cif")

        #
        targetStructures = EntryReader(queryTargetPath)
        dirPath = os.path.join(resultPath, queryTargetId)
        numHits = 0
        for ii, e in enumerate(targetStructures, 1):
            numHits = 0
            startTime = time.time()
            targetMol = e.molecule
            if normalizeFlag:
                targetMol.assign_bond_types(which="unknown")
                targetMol.standardise_aromatic_bonds()
                targetMol.standardise_delocalised_bonds()
            #
            logger.info("(%d) begin %s search - query id %s", ii, searchType,
                        queryTargetId)
            if searchType == "similarity":
                hits = self.__similaritySearch(targetMol,
                                               suppressMetals=suppressMetals)
            elif searchType == "substructure":
                hits = self.__moleculeSubstructureSearch(
                    targetMol, suppressMetals=suppressMetals)
            else:
                hits = []
            logger.info("(%d) completed search query id %s in %.3f seconds",
                        ii, queryTargetId,
                        time.time() - startTime)

            if hits:
                numHits += len(hits)
                logger.info("(%d) search for %s matched %d: %r", ii,
                            queryTargetId, numHits,
                            [targetHit.identifier for targetHit in hits])

                #
                for targetHit in hits[:maxHits]:
                    #
                    hI = CcdcMatchIndexInst()
                    hI.setCsdVersion(csd_version())
                    hI.setCsdDirectory(csd_directory())
                    hI.setTargetId(queryTargetId)
                    hI.setTargetPath(queryTargetPath)
                    if mU.exists(cifTargetPath):
                        hI.setTargetCcPath(cifTargetPath)
                    hI.setIdentifier(targetHit.identifier)
                    hI.setMatchType(searchType)
                    try:
                        hI.setRFactor(targetHit.entry.r_factor)
                        hI.setChemicalName(targetHit.entry.chemical_name)
                        hI.setTemperature(targetHit.entry.temperature)
                        hI.setRadiationSource(targetHit.entry.radiation_source)
                        hI.setHasDisorder("N")
                        cit = targetHit.entry.publication
                        if cit.doi is not None:
                            hI.setCitationDOI(cit.doi)
                        if searchType == "similarity":
                            hI.setSimilarityScore(targetHit.similarity)
                        elif searchType == "substructure":
                            hI.setMatchedAtomLength(
                                len(targetHit.match_atoms()))
                    except Exception as e:
                        logger.exception("Failing with %s", str(e))
                        #
                    #
                    mU.mkdir(dirPath)
                    mol2L = []
                    if searchType == "substructure":
                        for jj, mc in enumerate(targetHit.match_components(),
                                                1):
                            fp = os.path.join(
                                dirPath, queryTargetId + "_" +
                                targetHit.identifier + "_%03d" % jj + ".mol2")
                            mol2L.append(fp)
                            with MoleculeWriter(fp) as ofh:
                                ofh.write(mc)
                            # Replace the title line
                            with open(fp) as fin:
                                lines = fin.readlines()
                            lines[1] = lines[1].replace(
                                "00", targetHit.identifier)
                            #
                            with open(fp, "w") as fout:
                                fout.write("".join(lines))
                            #
                            fp = os.path.join(
                                dirPath, queryTargetId + "_" +
                                targetHit.identifier + "_%03d" % jj + ".sdf")
                            with MoleculeWriter(fp) as ofh:
                                ofh.write(mc)

                            # Replace the title line
                            with open(fp) as fin:
                                lines = fin.readlines()
                            lines[0] = lines[0].replace(
                                "00", targetHit.identifier)
                            #
                            with open(fp, "w") as fout:
                                fout.write("".join(lines))
                        #
                        #  Check for multiple generated result files -
                        #
                        for jj, fp in enumerate(mol2L, 1):
                            logger.debug("(%d) adding component fp %s", jj, fp)
                            hI.setMatchNumber(jj)
                            hI.setMol2Path(fp)
                            tt = fp[:-4] + "sdf"
                            hI.setMolPath(tt)
                            summaryList.append(copy.deepcopy(hI.get()))
                            #
                    else:
                        hI.setMatchNumber(1)
                        summaryList.append(copy.deepcopy(hI.get()))
            else:
                logger.info("(%d) search for %s returns no matches", ii,
                            targetMol.identifier)
                hits = None
        #
        if numHits > 0:
            mU.mkdir(dirPath)
            fp = os.path.join(dirPath, queryTargetId + "-index.json")
            cmI = CcdcMatchIndex(indexFilePath=fp, verbose=self.__verbose)
            cmI.load(summaryList)
            cmI.writeIndex()

        return numHits
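
A hypothetical invocation of the search() method above; the enclosing CCDC search class is not shown in this example, so the instance name and file paths are placeholders:

numHits = ccdcSearchObj.search(
    queryTargetId="ATP",                      # placeholder chemical component id
    queryTargetPath="/path/to/ATP.mol2",      # placeholder query molfile path
    resultPath="/path/to/results",
    searchType="substructure",
    maxHits=25,
)
print("CCDC matches written to the result index:", numHits)
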
Example n. 20
    def build(self,
              alignType="relaxed-stereo",
              numProc=4,
              chunkSize=10,
              verbose=False,
              doFigures=True):
        """Run the model build step in the chemical component model workflow.

        Args:
          alignType (str, optional): alignment mode ("relaxed"|"strict"|"relaxed-stereo"). Defaults to "relaxed-stereo".
          numProc (int, optional): number of processes to invoke. Defaults to 4.
          chunkSize (int, optional): work chunksize. Defaults to 10.
          verbose (bool, optional): verbose logging. Defaults to False.
          doFigures (bool, optional): flag controlling figure generation for built models. Defaults to True.

        Returns:
            (dict): {parentId: [{"targetId": , "modelId": , "modelPath": , "matchId": , "parentId": , "rFactor": }, ...]}

        """
        retD = {}
        try:
            mU = MarshalUtil(workPath=self.__cachePath)
            ccms = CODModelSearch(self.__cachePath, prefix=self.__prefix)
            modelDirPath = self.getModelDirFilePath()
            imageDirPath = self.getModelImageDirFilePath()
            #
            tD = ccms.getResultIndex()
            # Make parent index ---
            idxIdD = {}
            for idxId, iDL in tD.items():
                pId = idxId.split("|")[0]
                idxIdD.setdefault(pId, []).extend(iDL)
            #
            idxIdL = list(idxIdD.keys())
            midxIdL = []
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                if mU.exists(fp):
                    # Skip parents that already have a non-empty model index
                    fst = os.stat(fp)
                    if fst.st_size > 10:
                        continue
                midxIdL.append(pId)
            #
            logger.info(
                "Starting COD model build using (%d) from a total of results length (%d)",
                len(midxIdL), len(idxIdD))
            #
            cmbw = CODModelBuildWorker(self.__cachePath,
                                       verbose=verbose,
                                       timeOut=self.__timeOut)
            mpu = MultiProcUtil(verbose=True)
            mpu.setWorkingDir(modelDirPath)
            mpu.setOptions(
                optionsD={
                    "modelDirPath": modelDirPath,
                    "imageDirPath": imageDirPath,
                    "alignType": alignType,
                    "ccSIdxP": self.__ccSIdxP,
                    "idxIdD": idxIdD,
                    "oesmP": self.__oesmP,
                    "ccmP": self.__ccmP,
                    "doFigures": doFigures,
                })
            #
            mpu.set(workerObj=cmbw, workerMethod="build")
            ok, failList, resultList, _ = mpu.runMulti(dataList=midxIdL,
                                                       numProc=numProc,
                                                       numResults=1,
                                                       chunkSize=chunkSize)
            logger.info(
                "Run ended with status %r success count %d failures %r", ok,
                len(resultList[0]), len(failList))
            successList = copy.copy(resultList[0])
            #
            if successList:
                logger.info("Completed build with %d models ",
                            len(successList))
            else:
                logger.info("No models built")
            #
            # Build full index -
            #
            logger.info("Building full model index")
            for pId in idxIdL:
                fp = os.path.join(modelDirPath, pId, "model-index.json")
                if mU.exists(fp):
                    tDL = mU.doImport(fp, fmt="json")
                    for tD in tDL:
                        retD.setdefault(tD["parentId"], []).append(tD)
            #
            retD = dict(sorted(retD.items()))
            logger.info("Storing models for %d parent components", len(retD))
            ok = self.storeModelIndex(retD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retD
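
A hypothetical call to build() above; the enclosing workflow class is not shown here, so the instance name is a placeholder. The returned dictionary is keyed by parent component id, as assembled in the full-index step:

modelIndexD = codModelBuildObj.build(alignType="relaxed-stereo", numProc=8, chunkSize=10, doFigures=False)
for parentId, modelDL in modelIndexD.items():
    for mD in modelDL:
        print(parentId, mD["modelId"], mD["rFactor"])
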
Example n. 21
class RepositoryProvider(object):
    def __init__(self,
                 cfgOb,
                 cachePath=None,
                 numProc=8,
                 fileLimit=None,
                 verbose=False):
        self.__fileLimit = fileLimit
        self.__numProc = numProc
        self.__verbose = verbose
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__topCachePath = cachePath if cachePath else "."
        self.__cachePath = os.path.join(
            self.__topCachePath,
            self.__cfgOb.get("REPO_UTIL_CACHE_DIR",
                             sectionName=self.__configName))
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        #
        self.__ccPathD = None
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

    def getLocatorObjList(self,
                          contentType,
                          inputPathList=None,
                          mergeContentTypes=None):
        """Convenience method to get the data path list for the input repository content type.

        Args:
            contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...)
            inputPathList (list, optional): path list that will be returned if provided.
            mergeContentTypes (list, optional): repository content types to be combined with the
                                primary content type.

        Returns:
            Obj list: data file paths or tuple of file paths

        """
        inputPathList = inputPathList if inputPathList else []
        if inputPathList:
            return self.getLocatorObjListWithInput(
                contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
        #
        if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [
                "pdbx", "pdbx_core"
        ]:
            dictPath = os.path.join(
                self.__topCachePath,
                self.__cfgOb.get(
                    "DICTIONARY_CACHE_DIR",
                    sectionName=self.__cfgOb.getDefaultSectionName()))
            os.environ["_RP_DICT_PATH_"] = dictPath
            locatorList = self.getEntryLocatorObjList(
                mergeContentTypes=mergeContentTypes)
        else:
            locatorList = self.__getLocatorList(contentType,
                                                inputPathList=inputPathList)
        return locatorList

    def getLocatorObjListWithInput(self,
                                   contentType,
                                   inputPathList=None,
                                   mergeContentTypes=None):
        """Convenience method to get the data path list for the input repository content type.

        Args:
            contentType (str): Repository content type (e.g. pdbx, chem_comp, bird, ...)
            inputPathList (list, optional): path list that will be returned if provided.
            mergeContentTypes (list, optional): repository content types to be combined with the
                                primary content type.

        Returns:
            Obj list: data file paths or tuple of file paths

        """
        inputPathList = inputPathList if inputPathList else []
        locatorList = self.__getLocatorList(contentType,
                                            inputPathList=inputPathList)
        # JDW move the following to config
        if mergeContentTypes and "vrpt" in mergeContentTypes and contentType in [
                "pdbx", "pdbx_core"
        ]:
            dictPath = os.path.join(
                self.__topCachePath,
                self.__cfgOb.get(
                    "DICTIONARY_CACHE_DIR",
                    sectionName=self.__cfgOb.getDefaultSectionName()))
            os.environ["_RP_DICT_PATH_"] = dictPath
            #
            locObjL = []
            for locator in locatorList:
                if isinstance(locator, str):
                    kwD = HashableDict({})
                    oL = [
                        HashableDict({
                            "locator": locator,
                            "fmt": "mmcif",
                            "kwargs": kwD
                        })
                    ]
                    for mergeContentType in mergeContentTypes:
                        _, fn = os.path.split(locator)
                        idCode = fn[:4] if fn and len(fn) >= 8 else None
                        mergeLocator = self.__getLocator(
                            mergeContentType, idCode,
                            checkExists=True) if idCode else None
                        if mergeLocator:
                            # kwD = HashableDict({"marshalHelper": vrd.toCif})
                            kwD = HashableDict({"marshalHelper": toCifWrapper})
                            oL.append(
                                HashableDict({
                                    "locator": mergeLocator,
                                    "fmt": "xml",
                                    "kwargs": kwD
                                }))
                    lObj = tuple(oL)
                else:
                    logger.error("Unexpected output locator type %r", locator)
                    lObj = locator
                locObjL.append(lObj)
            #
            locatorList = locObjL
        # -
        return locatorList

    def getContainerList(self, locatorObjList):
        """ Return the data container list obtained by parsing the input locator object list.
        """
        cL = []
        for locatorObj in locatorObjList:
            myContainerList = self.__mergeContainers(locatorObj,
                                                     fmt="mmcif",
                                                     mergeTarget=0)
            for cA in myContainerList:
                cL.append(cA)
        return cL

    def __mergeContainers(self, locatorObj, fmt="mmcif", mergeTarget=0):
        """ Consolidate content in auxiliary files locatorObj[1:] into
            locatorObj[0] container index 'mergeTarget'.

        """
        #
        cL = []
        try:
            if isinstance(locatorObj, str):
                cL = self.__mU.doImport(locatorObj, fmt=fmt)
                return cL if cL else []
            elif isinstance(locatorObj, (list, tuple)) and locatorObj:
                dD = locatorObj[0]
                kw = dD["kwargs"]
                cL = self.__mU.doImport(dD["locator"], fmt=dD["fmt"], **kw)
                if cL:
                    for dD in locatorObj[1:]:
                        kw = dD["kwargs"]
                        rObj = self.__mU.doImport(dD["locator"],
                                                  fmt=dD["fmt"],
                                                  **kw)
                        mergeL = rObj if rObj else []
                        for mc in mergeL:
                            cL[mergeTarget].merge(mc)
                #
                return cL
            else:
                return []
        except Exception as e:
            logger.exception("Failing for %r with %s", locatorObj, str(e))

        return cL
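
    # Illustrative note (not part of the original source): a merged locator object is a tuple of
    # HashableDict entries, for example
    #   ({"locator": "ab/1abc.cif.gz", "fmt": "mmcif", "kwargs": {}},
    #    {"locator": "ab/1abc/1abc_validation.xml.gz", "fmt": "xml",
    #     "kwargs": {"marshalHelper": toCifWrapper}})
    # The first entry is parsed into cL and each subsequent entry is parsed and merged into
    # the container at index mergeTarget.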

    def getLocatorsFromPaths(self, locatorObjList, pathList, locatorIndex=0):
        """ Return locator objects with paths (locatorObjIndex) matching the input pathList.

        """
        # index the input locatorObjList
        rL = []
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return pathList
            #
            locIdx = {}
            for ii, locatorObj in enumerate(locatorObjList):
                if "locator" in locatorObj[locatorIndex]:
                    locIdx[locatorObj[locatorIndex]["locator"]] = ii
            #
            for pth in pathList:
                jj = locIdx[pth] if pth in locIdx else None
                if jj is not None:
                    rL.append(locatorObjList[jj])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return rL

    def getLocatorPaths(self, locatorObjList, locatorIndex=0):
        try:
            if locatorObjList and isinstance(locatorObjList[0], str):
                return locatorObjList
            else:
                return [
                    locatorObj[locatorIndex]["locator"]
                    for locatorObj in locatorObjList
                ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return []

    def __getLocatorList(self, contentType, inputPathList=None):
        """ Internal convenience method to return repository path list by content type:
        """
        outputPathList = []
        inputPathList = inputPathList if inputPathList else []
        try:
            if contentType in ["bird", "bird_core"]:
                outputPathList = inputPathList if inputPathList else self.getBirdPathList(
                )
            elif contentType == "bird_family":
                outputPathList = inputPathList if inputPathList else self.getBirdFamilyPathList(
                )
            elif contentType in ["chem_comp"]:
                outputPathList = inputPathList if inputPathList else self.getChemCompPathList(
                )
            elif contentType in ["bird_chem_comp"]:
                outputPathList = inputPathList if inputPathList else self.getBirdChemCompPathList(
                )
            elif contentType in ["pdbx", "pdbx_core"]:
                outputPathList = inputPathList if inputPathList else self.getEntryPathList(
                )
            elif contentType in [
                    "chem_comp_core", "bird_consolidated",
                    "bird_chem_comp_core"
            ]:
                outputPathList = inputPathList if inputPathList else self.mergeBirdAndChemCompRefData(
                )
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                outputPathList = inputPathList if inputPathList else self.getIhmDevPathList(
                )
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                outputPathList = inputPathList if inputPathList else []
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        if self.__fileLimit:
            outputPathList = outputPathList[:self.__fileLimit]

        return sorted(outputPathList)

    def __getLocator(self,
                     contentType,
                     idCode,
                     version="v1-0",
                     checkExists=False):
        """ Convenience method to return repository path for a content type and cardinal identifier.
        """
        pth = None
        try:
            idCodel = idCode.lower()
            if contentType == "bird":
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType == "bird_family":
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType in ["chem_comp", "chem_comp_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[0], idCode, idCode + ".cif")
            elif contentType in ["bird_chem_comp"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode[-1], idCode + ".cif")
            elif contentType in ["pdbx", "pdbx_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCodel[1:3], idCodel + ".cif.gz")
            elif contentType in ["bird_consolidated", "bird_chem_comp_core"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCode + ".cif")
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                pth = os.path.join(self.__getRepoTopPath(contentType), idCode,
                                   idCode + "_model_%s.cif.gz" % version)
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                pass
            elif contentType in ["vrpt"]:
                pth = os.path.join(self.__getRepoTopPath(contentType),
                                   idCodel[1:3], idCodel,
                                   idCodel + "_validation.xml.gz")
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        if checkExists:
            pth = pth if self.__mU.exists(pth) else None
        return pth

    def __getRepoTopPath(self, contentType):
        """ Convenience method to return repository top path from configuration data.
        """
        pth = None
        try:
            if contentType == "bird":
                pth = self.__cfgOb.getPath("BIRD_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType == "bird_family":
                pth = self.__cfgOb.getPath("BIRD_FAMILY_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["chem_comp", "chem_comp_core"]:
                pth = self.__cfgOb.getPath("CHEM_COMP_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["bird_chem_comp"]:
                pth = self.__cfgOb.getPath("BIRD_CHEM_COMP_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["pdbx", "pdbx_core"]:
                pth = self.__cfgOb.getPath("PDBX_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in ["bird_consolidated", "bird_chem_comp_core"]:
                pth = self.__cachePath
            elif contentType in ["ihm_dev", "ihm_dev_core", "ihm_dev_full"]:
                pth = self.__cfgOb.getPath("IHM_DEV_REPO_PATH",
                                           sectionName=self.__configName)
            elif contentType in [
                    "pdb_distro", "da_internal", "status_history"
            ]:
                pass
            elif contentType in ["vrpt"]:
                pth = self.__cfgOb.getEnvValue("VRPT_REPO_PATH_ENV",
                                               sectionName=self.__configName,
                                               default=None)
                if pth is None:
                    pth = self.__cfgOb.getPath("VRPT_REPO_PATH",
                                               sectionName=self.__configName)
                else:
                    logger.debug(
                        "Using validation report path from environment assignment %s",
                        pth)
            else:
                logger.warning("Unsupported contentType %s", contentType)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return pth

    def _chemCompPathWorker(self, dataList, procName, optionsD, workingDir):
        """ Return the list of chemical component definition file paths in the current repository.
        """
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        pathList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.endswith(".cif") and len(name) <= 7:
                        pathList.append(os.path.join(root, name))
        return dataList, pathList, []

    def getChemCompPathList(self):
        return self.__getChemCompPathList(self.__getRepoTopPath("chem_comp"),
                                          numProc=self.__numProc)

    def __getChemCompPathList(self, topRepoPath, numProc=8):
        """Get the path list for the chemical component definition repository
        """
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
            dataList = [a for a in dataS]
            optD = {}
            optD["topRepoPath"] = topRepoPath
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="_chemCompPathWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                         endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def _entryLocatorObjWithMergeWorker(self, dataList, procName, optionsD,
                                        workingDir):
        """ Return the list of entry locator objects including merge content in the current repository.
        """
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        mergeContentTypes = optionsD["mergeContentTypes"]
        locatorObjList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for fn in files:
                    if (fn.endswith(".cif.gz")
                            and len(fn) == 11) or (fn.endswith(".cif")
                                                   and len(fn) == 8):
                        locator = os.path.join(root, fn)
                        kwD = HashableDict({})
                        oL = [
                            HashableDict({
                                "locator": locator,
                                "fmt": "mmcif",
                                "kwargs": kwD
                            })
                        ]
                        for mergeContentType in mergeContentTypes:
                            idCode = fn[:4] if fn and len(fn) >= 8 else None
                            mergeLocator = self.__getLocator(
                                mergeContentType, idCode,
                                checkExists=True) if idCode else None
                            if mergeLocator:
                                kwD = HashableDict(
                                    {"marshalHelper": toCifWrapper})
                                oL.append(
                                    HashableDict({
                                        "locator": mergeLocator,
                                        "fmt": "xml",
                                        "kwargs": kwD
                                    }))
                        lObj = tuple(oL)
                        locatorObjList.append(lObj)
        return dataList, locatorObjList, []

    def getEntryLocatorObjList(self, mergeContentTypes=None):
        return self.__getEntryLocatorObjList(
            self.__getRepoTopPath("pdbx"),
            numProc=self.__numProc,
            mergeContentTypes=mergeContentTypes)

    def __getEntryLocatorObjList(self,
                                 topRepoPath,
                                 numProc=8,
                                 mergeContentTypes=None):
        """Get the path list for structure entries in the input repository
        """
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataList = []
            anL = "abcdefghijklmnopqrstuvwxyz0123456789"
            for a1 in anL:
                for a2 in anL:
                    hc = a1 + a2
                    dataList.append(hc)
                    hc = a2 + a1
                    dataList.append(hc)
            dataList = list(set(dataList))
            #
            optD = {}
            optD["topRepoPath"] = topRepoPath
            optD["mergeContentTypes"] = mergeContentTypes
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self,
                    workerMethod="_entryLocatorObjWithMergeWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Locator object list length %d  in %.4f seconds",
                         len(pathList), endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def _entryPathWorker(self, dataList, procName, optionsD, workingDir):
        """ Return the list of entry file paths in the current repository.
        """
        _ = procName
        _ = workingDir
        topRepoPath = optionsD["topRepoPath"]
        pathList = []
        for subdir in dataList:
            dd = os.path.join(topRepoPath, subdir)
            for root, _, files in walk(dd, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if (name.endswith(".cif.gz")
                            and len(name) == 11) or (name.endswith(".cif")
                                                     and len(name) == 8):
                        pathList.append(os.path.join(root, name))
        return dataList, pathList, []

    def getEntryPathList(self):
        return self.__getEntryPathList(self.__getRepoTopPath("pdbx"),
                                       numProc=self.__numProc)

    def __getEntryPathList(self, topRepoPath, numProc=8):
        """Get the path list for structure entries in the input repository
        """
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting at %s", ts)
        startTime = time.time()
        pathList = []
        try:
            dataList = []
            anL = "abcdefghijklmnopqrstuvwxyz0123456789"
            for a1 in anL:
                for a2 in anL:
                    hc = a1 + a2
                    dataList.append(hc)
                    hc = a2 + a1
                    dataList.append(hc)
            dataList = list(set(dataList))
            #
            optD = {}
            optD["topRepoPath"] = topRepoPath
            mpu = MultiProcUtil(verbose=self.__verbose)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="_entryPathWorker")
            _, _, retLists, _ = mpu.runMulti(dataList=dataList,
                                             numProc=numProc,
                                             numResults=1)
            pathList = retLists[0]
            endTime0 = time.time()
            logger.debug("Path list length %d  in %.4f seconds", len(pathList),
                         endTime0 - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return self.__applyFileLimit(pathList)

    def getBirdPathList(self):
        return self.__getBirdPathList(self.__getRepoTopPath("bird"))

    def __getBirdPathList(self, topRepoPath):
        """ Return the list of definition file paths in the current repository.

            List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PRD_") and name.endswith(
                            ".cif") and len(name) <= 14:
                        pth = os.path.join(root, name)
                        sd[int(name[4:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def getBirdFamilyPathList(self):
        return self.__getBirdFamilyPathList(
            self.__getRepoTopPath("bird_family"))

    def __getBirdFamilyPathList(self, topRepoPath):
        """ Return the list of definition file paths in the current repository.

            List is ordered in increasing PRD ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("FAM_") and name.endswith(
                            ".cif") and len(name) <= 14:
                        pth = os.path.join(root, name)
                        sd[int(name[4:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def getBirdChemCompPathList(self):
        return self.__getBirdChemCompPathList(
            self.__getRepoTopPath("bird_chem_comp"))

    def __getBirdChemCompPathList(self, topRepoPath):
        """ Return the list of definition file paths in the current repository.

            List is ordered in increasing PRDCC ID numerical code.
        """
        pathList = []
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PRDCC_") and name.endswith(
                            ".cif") and len(name) <= 16:
                        pth = os.path.join(root, name)
                        sd[int(name[6:-4])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__applyFileLimit(pathList)

    def __applyFileLimit(self, pathList):
        logger.debug("Length of file path list %d (limit %r)", len(pathList),
                     self.__fileLimit)
        if self.__fileLimit:
            return pathList[:self.__fileLimit]
        else:
            return pathList

    def __buildFamilyIndex(self):
        """ Using information from the PRD family definition:
            #
            loop_
            _pdbx_reference_molecule_list.family_prd_id
            _pdbx_reference_molecule_list.prd_id
                FAM_000010 PRD_000041
                FAM_000010 PRD_000042
                FAM_000010 PRD_000043
                FAM_000010 PRD_000044
                FAM_000010 PRD_000048
                FAM_000010 PRD_000049
                FAM_000010 PRD_000051
            #
        """
        prdD = {}
        try:
            pthL = self.__getLocatorList("bird_family")
            for pth in pthL:
                containerL = self.__mU.doImport(pth, fmt="mmcif")
                for container in containerL:
                    catName = "pdbx_reference_molecule_list"
                    if container.exists(catName):
                        catObj = container.getObj(catName)
                        for ii in range(catObj.getRowCount()):
                            familyPrdId = catObj.getValue(
                                attributeName="family_prd_id", rowIndex=ii)
                            prdId = catObj.getValue(attributeName="prd_id",
                                                    rowIndex=ii)
                            if prdId in prdD:
                                logger.debug(
                                    "duplicate prdId in family index %s %s",
                                    prdId, familyPrdId)
                            prdD[prdId] = {
                                "familyPrdId": familyPrdId,
                                "c": container
                            }
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return prdD

    def __buildBirdCcIndex(self):
        """ Using information from the PRD pdbx_reference_molecule category to
        index the BIRDs corresponding small molecule correspondences

        """
        prdD = {}
        ccPathD = {}
        prdStatusD = {}
        try:
            ccPathL = self.__getLocatorList("chem_comp")
            ccPathD = {}
            for ccPath in ccPathL:
                _, fn = os.path.split(ccPath)
                ccId, _ = os.path.splitext(fn)
                ccPathD[ccId] = ccPath
            logger.debug("ccPathD length %d", len(ccPathD))
            pthL = self.__getLocatorList("bird")
            for pth in pthL:
                containerL = self.__mU.doImport(pth, fmt="mmcif")
                for container in containerL:
                    catName = "pdbx_reference_molecule"
                    if container.exists(catName):
                        catObj = container.getObj(catName)
                        ii = 0
                        prdId = catObj.getValue(attributeName="prd_id",
                                                rowIndex=ii)
                        relStatus = catObj.getValue(
                            attributeName="release_status", rowIndex=ii)
                        prdStatusD[prdId] = relStatus
                        if relStatus != "REL":
                            continue
                        prdRepType = catObj.getValue(
                            attributeName="represent_as", rowIndex=ii)
                        logger.debug("represent as %r", prdRepType)
                        if prdRepType in ["single molecule"]:
                            ccId = catObj.getValueOrDefault(
                                attributeName="chem_comp_id",
                                rowIndex=ii,
                                defaultValue=None)
                            # prdId = catObj.getValue(attributeName="prd_id", rowIndex=ii)
                            logger.debug("mapping prdId %r ccId %r", prdId,
                                         ccId)
                            if ccId and ccId in ccPathD:
                                prdD[prdId] = {
                                    "ccId": ccId,
                                    "ccPath": ccPathD[ccId]
                                }
                                ccPathD[ccPathD[ccId]] = {
                                    "ccId": ccId,
                                    "prdId": prdId
                                }
                            else:
                                logger.error("Bad ccId %r for BIRD %r", ccId,
                                             prdId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return prdD, ccPathD, prdStatusD

    # -
    def mergeBirdAndChemCompRefData(self):
        prdSmallMolCcD, ccPathD, prdStatusD = self.__buildBirdCcIndex()
        logger.info("PRD to CCD index length %d CCD map path length %d",
                    len(prdSmallMolCcD), len(ccPathD))
        outputPathList = self.mergeBirdRefData(prdSmallMolCcD, prdStatusD)
        ccOutputPathList = [
            pth for pth in self.getChemCompPathList() if pth not in ccPathD
        ]
        outputPathList.extend(ccOutputPathList)
        return outputPathList

    def mergeBirdRefData(self, prdSmallMolCcD, prdStatusD):
        """ Consolidate all of the bird reference data in a single container.

            If the BIRD is a 'small molecule' type then also merge with the associated CC definition.

            Store the merged data in the REPO_UTIL cache path and ...

            Return a path list for the consolidated data files -

        """
        outPathList = []
        try:
            birdPathList = self.__getLocatorList("bird")
            birdPathD = {}
            for birdPath in birdPathList:
                _, fn = os.path.split(birdPath)
                prdId, _ = os.path.splitext(fn)
                birdPathD[prdId] = birdPath
            #
            logger.debug("BIRD data length %d", len(birdPathD))
            logger.debug("BIRD keys %r", list(birdPathD.keys()))
            birdCcPathList = self.__getLocatorList("bird_chem_comp")
            birdCcPathD = {}
            for birdCcPath in birdCcPathList:
                _, fn = os.path.split(birdCcPath)
                prdCcId, _ = os.path.splitext(fn)
                prdId = "PRD_" + prdCcId[6:]
                birdCcPathD[prdId] = birdCcPath
            #
            logger.debug("BIRD CC data length %d", len(birdCcPathD))
            logger.debug("BIRD CC keys %r", list(birdCcPathD.keys()))
            fD = self.__buildFamilyIndex()
            logger.debug("Family index length %d", len(fD))
            logger.debug("Family index keys %r", list(fD.keys()))
            logger.debug("PRD to CCD small mol index length %d",
                         len(prdSmallMolCcD))
            #
            for prdId in birdPathD:
                if prdId in prdStatusD and prdStatusD[prdId] != "REL":
                    continue
                fp = os.path.join(self.__cachePath, prdId + ".cif")
                logger.debug("Export cache path is %r", fp)
                #
                pth2 = birdPathD[prdId]
                cL = self.__mU.doImport(pth2, fmt="mmcif")
                cFull = cL[0]
                logger.debug("Got Bird %r", cFull.getName())
                #
                #
                ccBird = None
                ccD = None
                if prdId in prdSmallMolCcD:
                    pthCc = prdSmallMolCcD[prdId]["ccPath"]
                    cL = self.__mU.doImport(pthCc, fmt="mmcif")
                    ccD = cL[0]
                    logger.debug("Got corresponding CCD %r", ccD.getName())
                elif prdId in birdCcPathD:
                    pth1 = birdCcPathD[prdId]
                    c1L = self.__mU.doImport(pth1, fmt="mmcif")
                    ccBird = c1L[0]
                    logger.debug("Got ccBird %r", ccBird.getName())
                    #
                cFam = None
                if prdId in fD:
                    cFam = fD[prdId]["c"]
                    logger.debug("Got cFam %r", cFam.getName())
                #
                if ccD:
                    for catName in ccD.getObjNameList():
                        cFull.append(ccD.getObj(catName))
                #
                if ccBird:
                    for catName in ccBird.getObjNameList():
                        cFull.append(ccBird.getObj(catName))
                if cFam:
                    for catName in cFam.getObjNameList():
                        cFull.append(cFam.getObj(catName))
                #
                self.__mU.doExport(fp, [cFull], fmt="mmcif")
                outPathList.append(fp)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return outPathList
        #

    def __exportConfig(self, container):
        """
                - CATEGORY_NAME: diffrn_detector
                  ATTRIBUTE_NAME_LIST:
                      - pdbx_frequency
                - CATEGORY_NAME: pdbx_serial_crystallography_measurement
                  ATTRIBUTE_NAME_LIST:
                      - diffrn_id
                      - pulse_energy
                      - pulse_duration
                      - xfel_pulse_repetition_rate
        """
        for catName in container.getObjNameList():
            cObj = container.getObj(catName)
            print("- CATEGORY_NAME: %s" % catName)
            print("  ATTRIBUTE_NAME_LIST:")
            for atName in cObj.getAttributeList():
                print("       - %s" % atName)
        return True

    def getIhmDevPathList(self):
        return self.__getIhmDevPathList(self.__getRepoTopPath("ihm_dev"))

    def __getIhmDevPathList(self, topRepoPath):
        """ Return the list of I/HM entries in the current repository.

            File name template is: PDBDEV_00000020_model_v1-0.cif.gz

            List is ordered in increasing PDBDEV numerical code.
        """
        pathList = []
        logger.debug("Searching path %r", topRepoPath)
        try:
            sd = {}
            for root, _, files in os.walk(topRepoPath, topdown=False):
                if "REMOVE" in root:
                    continue
                for name in files:
                    if name.startswith("PDBDEV_") and name.endswith(
                            ".cif.gz") and len(name) <= 50:
                        pth = os.path.join(root, name)
                        sd[int(name[7:15])] = pth
            #
            for k in sorted(sd.keys()):
                pathList.append(sd[k])
        except Exception as e:
            logger.exception("Failing search in %r with %s", topRepoPath,
                             str(e))
        #
        return self.__applyFileLimit(pathList)
class Scop2ClassificationProvider(StashableBase):
    """Extract SCOP2 domain assignments, term descriptions and SCOP classification hierarchy
    from SCOP and SCOP2B flat files.
    """
    def __init__(self, cachePath, useCache, **kwargs):
        #
        _ = kwargs
        self.__cachePath = cachePath
        dirName = "scop2"
        self.__dirPath = os.path.join(self.__cachePath, dirName)
        self.__useCache = useCache
        super(Scop2ClassificationProvider,
              self).__init__(self.__cachePath, [dirName])
        #
        self.__version = "latest"
        self.__fmt = "pickle"
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload(
            useCache=self.__useCache, fmt=self.__fmt)
        #
        if not useCache and not self.testCache():
            ok = self.__fetchFromBackup()
            if ok:
                self.__nD, self.__ntD, self.__pAD, self.__pBD, self.__pBRootD, self.__fD, self.__sfD, self.__sf2bD = self.__reload(
                    useCache=True, fmt=self.__fmt)
        #

    def testCache(self):
        logger.info(
            "SCOP2 lengths nD %d pAD %d pBD %d pBRootD %d fD %d sfD %d sf2bD %d",
            len(self.__nD), len(self.__pAD), len(self.__pBD),
            len(self.__pBRootD), len(self.__fD), len(self.__sfD),
            len(self.__sf2bD))
        if (len(self.__nD) > 9000) and (len(self.__pAD) > 70000):
            return True
        return False

    def getVersion(self):
        """Returns the SCOP2 version"""
        return self.__version

    def getFamilyIds(self, pdbId, authAsymId):
        try:
            return list(
                set([tup[1]
                     for tup in self.__fD[(pdbId.upper(), authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyIds(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    tup[1] for tup in self.__sfD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getFamilyNames(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    self.__nD[tup[1]]
                    for tup in self.__fD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyNames(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    self.__nD[tup[1]]
                    for tup in self.__sfD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getFamilyResidueRanges(self, pdbId, authAsymId):
        try:
            # s/fD.setdefault((pdbId, authAsymId), []).append((domSuperFamilyId, authAsymId, authSeqBeg, authSeqEnd))
            return [(tup[0], tup[1], tup[2], tup[3], tup[4])
                    for tup in self.__fD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyResidueRanges(self, pdbId, authAsymId):
        try:
            return [(tup[0], tup[1], tup[2], tup[3], tup[4])
                    for tup in self.__sfD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyNames2B(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    self.__nD[tup[1]]
                    for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyIds2B(self, pdbId, authAsymId):
        try:
            return list(
                set([
                    tup[1] for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]
                ]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getSuperFamilyResidueRanges2B(self, pdbId, authAsymId):
        try:
            return [(tup[0], tup[1], tup[2], tup[3], tup[4])
                    for tup in self.__sf2bD[(pdbId.upper(), authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))
        return []

    def getName(self, domId):
        try:
            return self.__nD[domId]
        except Exception:
            logger.debug("Undefined SCOP2 id %r", domId)
        return None

    def getNameType(self, domId):
        qD = {
            "TP": "Protein Type",
            "CL": "Protein Class",
            "CF": "Fold",
            "SF": "Superfamily",
            "FA": "Family"
        }
        try:
            return qD[self.__ntD[domId]]
        except Exception:
            logger.debug("Undefined ECOD id %r", domId)
        return None

    def getIdLineage(self, domId):
        pS = set()
        try:
            pS.add(domId)
            pt = self.__pAD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pAD[pt]
            #
            pt = self.__pBD[domId]
            while (pt is not None) and (pt != 0):
                pS.add(pt)
                pt = self.__pBD[pt]
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        #
        return sorted(pS)

    def getNameLineage(self, domId):
        try:
            nL = []
            for dId in self.getIdLineage(domId):
                tN = self.getName(dId)
                tN = tN if tN else "Unnamed"
                nL.append(tN)
            return nL
        except Exception as e:
            logger.debug("Failing for %r with %s", domId, str(e))
        return None

    def getTreeNodeList(self):
        tnL = self.__exportTreeNodeList(self.__nD, self.__pAD, self.__pBRootD)
        return tnL

    def __getAssignmentFileName(self, fmt="json"):
        ext = "json" if fmt == "json" else "pic"
        fn = "scop2_domain_assignments.%s" % ext
        return fn

    def __reload(self, useCache=True, fmt="json"):
        nD = ntD = pAD = pBD = pBRootD = fD = sfD = sf2bD = {}
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        self.__mU.mkdir(self.__dirPath)
        #
        if useCache and self.__mU.exists(assignmentPath):
            sD = self.__mU.doImport(assignmentPath, fmt=fmt)
            logger.debug("Domain name count %d", len(sD["names"]))
            self.__version = sD["version"]
            nD = sD["names"]
            ntD = sD["nametypes"]
            pAD = sD["parentsType"]
            pBD = sD["parentsClass"]
            pBRootD = sD["parentsClassRoot"]
            fD = sD["families"]
            sfD = sD["superfamilies"]
            sf2bD = sD["superfamilies2b"]

        elif not useCache:
            nmL, dmL, scop2bL, _ = self.__fetchFromSource()
            #
            ok = False
            nD = self.__extractNames(nmL)
            logger.info("Domain name dictionary (%d)", len(nD))
            pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD = self.__extractDomainHierarchy(
                dmL)
            #
            logger.info("Domain node parent hierarchy (protein type) (%d)",
                        len(pAD))
            logger.info("Domain node parent hierarchy (structural class) (%d)",
                        len(pBD))
            logger.info(
                "Domain node parent hierarchy (structural class root) (%d)",
                len(pBRootD))
            logger.info("SCOP2 core domain assignments (family %d) (sf %d)",
                        len(fD), len(sfD))
            #
            sf2bD = self.__extractScop2bSuperFamilyAssignments(
                scop2bL, domToSfD)
            logger.info("SCOP2B SF domain assignments (%d)", len(sf2bD))
            #
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            vS = self.__version
            sD = {
                "version": vS,
                "created": tS,
                "names": nD,
                "nametypes": ntD,
                "parentsType": pAD,
                "parentsClass": pBD,
                "parentsClassRoot": pBRootD,
                "families": fD,
                "superfamilies": sfD,
                "superfamilies2b": sf2bD
            }
            ok = self.__mU.doExport(assignmentPath, sD, fmt=fmt, indent=3)
            logger.info("Cache save status %r", ok)
            #
        return nD, ntD, pAD, pBD, pBRootD, fD, sfD, sf2bD

    def __fetchFromBackup(self, fmt="json"):
        urlTarget = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP2"
        #
        fn = self.__getAssignmentFileName(fmt=fmt)
        assignmentPath = os.path.join(self.__dirPath, fn)
        urlPath = os.path.join(urlTarget, fn)
        self.__mU.mkdir(self.__dirPath)
        #
        logger.info("Using backup URL %r", urlPath)
        fU = FileUtil()
        ok = fU.get(urlPath, assignmentPath)
        return ok

    def __fetchFromSource(self):
        """Fetch the classification names and domain assignments from SCOP2 and SCOP2B resources.

        SCOP2 domain names:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-des-latest.txt

        SCOP2 domain hierarchy:
            https://scop.mrc-lmb.cam.ac.uk/files/scop-cla-latest.txt

        SIFTS extrapolated SCOP2 and SCOP2B assignments:
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2b_sf_uniprot.tsv.gz
            https://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv/pdb_chain_scop2_uniprot.tsv.gz

        """
        urlTargetScop2 = "https://scop.mrc-lmb.cam.ac.uk/files"
        encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
        fn = "scop-des-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        desL = self.__mU.doImport(url,
                                  fmt="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(desL))
        #
        fn = "scop-cla-latest.txt"
        url = os.path.join(urlTargetScop2, fn)
        claL = self.__mU.doImport(url,
                                  fmt="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(claL))
        #
        headerLines = self.__mU.doImport(url,
                                         fmt="list",
                                         uncomment=False,
                                         encoding=encoding)
        self.__version = headerLines[0].split(
            " ")[3] if headerLines else "2021-05-27"
        # JDW note cert issues with this site
        urlTargetSifts = "http://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/tsv"
        fn = "pdb_chain_scop2b_sf_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2bL = self.__mU.doImport(url,
                                     fmt="tdd",
                                     rowFormat="dict",
                                     uncomment=True,
                                     encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(scop2bL))
        #
        fn = "pdb_chain_scop2_uniprot.tsv.gz"
        url = os.path.join(urlTargetSifts, fn)
        scop2L = self.__mU.doImport(url,
                                    fmt="tdd",
                                    rowFormat="dict",
                                    uncomment=True,
                                    encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(scop2bL))
        #
        return desL, claL, scop2bL, scop2L

    def __extractNames(self, nmL):
        """ """
        rD = {}
        logger.info("Length of input name list %d", len(nmL))
        for nm in nmL:
            ff = nm.split(" ")
            rD[ff[0]] = " ".join(ff[1:])
        # self.__mU.doExport(os.path.join(self.__dirPath, "scop2-names.json"), rD, fmt="json", indent=3)
        return rD

    def __extractDomainHierarchy(self, dmL):
        """Extract the domain node identifier hierarchy from the SCOP2 representative assignment file ...

        Returns:
            dict, dict, dict, dict, dict: parent and name type dictionaries, family and superfamily assignments, and
                                          domain to superfamily mapping

            ntD[domainId] = name type TP=protein type, CL=protein class, CF=fold, SF=superfamily, FA=family
            pD[child domain identifier] = parent domain identifier
            fD[(pdbId, authAsymId)] = [(faDomId, faId, authAsymId, resBeg, resEnd),]
            sfD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),]
            domToSfD[domSfid] = sfId

        Example assignment file:

        # SCOP release 2021-05-27
        # http://scop.mrc-lmb.cam.ac.uk
        # based on PDB release 2021-05-14
        # based on UniProt realese 2021-04-08
        # based on SIFTS release 2021-05-19
        # FA-DOMID FA-PDBID FA-PDBREG FA-UNIID FA-UNIREG SF-DOMID SF-PDBID SF-PDBREG SF-UNIID SF-UNIREG SCOPCLA
        8045703 3H8D C:1143-1264 Q64331 1143-1264 8091604 3H8D C:1143-1264 Q64331 1143-1264 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627
        8094330 6J56 A:1158-1282 Q9UM54 1167-1291 8094331 6J56 A:1158-1282 Q9UM54 1167-1291 TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627
        #

        """
        # Build the parent dictionary and name node type
        ntD = {}
        pAD = {}
        pBD = {}
        pBRootD = {}
        fD = {}
        sfD = {}
        domToSfD = {}
        #
        logger.info("Length of input domain assignment list %d", len(dmL))
        for dm in dmL:
            try:
                ff = dm.split(" ")
                domFamilyId = ff[0]
                domSuperFamilyId = ff[5]
                rngL = ff[10].split(",")
                tD = {}
                for rng in rngL:
                    tL = rng.split("=")
                    tD[tL[0]] = tL[1]
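                # e.g. "TP=1,CL=1000003,CF=2001470,SF=3002524,FA=4004627" ->
                #   tD = {"TP": "1", "CL": "1000003", "CF": "2001470", "SF": "3002524", "FA": "4004627"}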
                #
                # -
                # pD[tD["TP"]] = 0
                # pD[tD["CL"]] = tD["TP"]
                # pD[tD["CF"]] = tD["CL"]
                # pD[tD["SF"]] = tD["CF"]
                # pD[tD["FA"]] = tD["SF"]
                # pD[domFamilyId] = tD["FA"]
                # pD[domSuperFamilyId] = tD["SF"]
                #
                #  Represent as two trees separately rooted in protein type  and structural class
                pAD[tD["TP"]] = 0
                pAD[tD["CF"]] = tD["TP"]
                pAD[tD["SF"]] = tD["CF"]
                pAD[tD["FA"]] = tD["SF"]
                pAD[domFamilyId] = tD["FA"]
                pAD[domSuperFamilyId] = tD["SF"]
                #
                # Use this complete pBD here only for generating ID lineages, but NOT for merging with pAD
                pBD[tD["CL"]] = 0
                pBD[tD["CF"]] = tD["CL"]
                pBD[tD["SF"]] = tD["CF"]
                pBD[tD["FA"]] = tD["SF"]
                pBD[domFamilyId] = tD["FA"]
                pBD[domSuperFamilyId] = tD["SF"]
                #
                # Use pBRootD for creating tree node lists; Don't capture any lower branches to avoid re-creating redundant key:values already in pAD
                pBRootD[tD["CL"]] = 0
                pBRootD[tD["CF"]] = tD["CL"]
                #
                ntD[tD["FA"]] = "FA"
                ntD[tD["SF"]] = "SF"
                ntD[tD["CF"]] = "CF"
                ntD[tD["CL"]] = "CL"
                ntD[tD["TP"]] = "TP"
                #
                pdbId = ff[1]
                authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment(
                    ff[2])
                if authAsymId is not None:
                    fD.setdefault((pdbId, authAsymId), []).append(
                        (domFamilyId, tD["FA"], authAsymId, authSeqBeg,
                         authSeqEnd))
                pdbId = ff[6]
                authAsymId, authSeqBeg, authSeqEnd = self.__parseAssignment(
                    ff[7])
                if authAsymId is not None:
                    sfD.setdefault((pdbId, authAsymId), []).append(
                        (domSuperFamilyId, tD["SF"], authAsymId, authSeqBeg,
                         authSeqEnd))
                #
                domToSfD[domSuperFamilyId] = tD["SF"]
            except Exception as e:
                logger.exception("Failing for case %r: %s", dm, str(e))
        #
        logger.info("pAD (%d) pBD (%d) pBRootD (%d) ntD (%d)", len(pAD),
                    len(pBD), len(pBRootD), len(ntD))
        logger.info("fD (%d) sfD (%d)", len(fD), len(sfD))
        return pAD, pBD, pBRootD, ntD, fD, sfD, domToSfD

    def __parseAssignment(self, tS):
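        """Parse an author residue range of the form '<authAsymId>:<begin>-<end>' (e.g. 'C:1143-1264'),
        tolerating a negative begin residue (e.g. 'A:-5-120'). Returns (None, None, None) on failure."""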
        authAsymId = authSeqBeg = authSeqEnd = None
        try:
            fL = tS.split(":")
            authAsymId = fL[0]
            rS = fL[1]
            if rS[0] == "-":
                authSeqBeg = -int(rS[1:].split("-")[0])
                authSeqEnd = int(rS[1:].split("-")[1])
            else:
                authSeqBeg = int(rS.split("-")[0])
                authSeqEnd = int(rS.split("-")[1])
        except Exception:
            pass
        return authAsymId, authSeqBeg, authSeqEnd

    def __extractScop2bSuperFamilyAssignments(self, scop2bL, domToSfD):
        """
        Extract the SCOP2B  SIFTS superfamily domain assignments for PDB structure entries.

        Returns:

         aD[(pdbId, authAsymId)] = [(sfDomId, sfId, authAsymId, resBeg, resEnd),]

        Example:

        # 2021/06/12 - 05:52 | PDB: 23.21 | UniProt: 2021.03
          PDB     CHAIN   SF_DOMID        SP_PRIMARY      RES_BEG RES_END PDB_BEG PDB_END SP_BEG  SP_END
          5id7    B       8033045 P02768  197     388     197     388     221     412
          1o9x    A       8033045 P02768  197     388     197     388     221     412
        """
        sfD = {}
        try:
            for rowD in scop2bL:
                if rowD["SF_DOMID"] in domToSfD:
                    sfD.setdefault(
                        (rowD["PDB"].upper(), rowD["CHAIN"]), []).append(
                            (rowD["SF_DOMID"], domToSfD[rowD["SF_DOMID"]],
                             rowD["CHAIN"], rowD["PDB_BEG"], rowD["PDB_END"]))
                else:
                    logger.warning("Missing SCOP2B SF ID mapping for %r",
                                   rowD["SF_DOMID"])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return sfD

    def __exportTreeNodeList(self, nD, pAD, pBRootD):
        """Create node list from the SCOP2 parent and name/description dictionaries.

        Exclude the root node from the tree.

        """
        #
        rootId = 0
        pL = [rootId]
        #
        logger.info("nD %d pAD %d pBRootD %d pL %r", len(nD), len(pAD),
                    len(pBRootD), pL)
        # create child dictionary
        cD = {}
        for ctId, ptId in pAD.items():
            cD.setdefault(ptId, []).append(ctId)
        for ctId, ptId in pBRootD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.debug("cD %d", len(cD))
        #
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.warning("No children for scop tId %r", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = nD[tId] if tId in nD else None
            ptIdL = []
            if tId in pAD:
                ptIdL.append(pAD[tId])
            if tId in pBRootD:
                ptIdL.append(pBRootD[tId])
            lL = self.getIdLineage(tId)[1:]
            #
            # d = {'id': str(tId), 'name': displayName, 'lineage': [str(t) for t in lL], 'parents': [str(ptId)], 'depth': len(lL)}
            if tId == rootId:
                continue
            elif any([ptId == rootId for ptId in ptIdL]):
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                displayName = displayName if displayName else "Domain %s" % str(
                    tId)
                dD = {
                    "id": str(tId),
                    "name": displayName,
                    "parents": ptIdL,
                    "depth": len(lL)
                }
            dL.append(dD)

        return dL
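
A minimal usage sketch for the provider defined above (illustrative only; the cache directory and the example entry/chain identifiers are placeholders, and lookups return empty lists when no assignment is available):

scopP = Scop2ClassificationProvider(cachePath="./CACHE", useCache=True)
if scopP.testCache():
    logger.info("SCOP2 version %r", scopP.getVersion())
    famIdL = scopP.getFamilyIds("4HHB", "A")          # family identifiers assigned to entry 4HHB chain A
    sfNameL = scopP.getSuperFamilyNames("4HHB", "A")  # human-readable superfamily names
    logger.info("Families %r superfamily names %r", famIdL, sfNameL)
    if famIdL:
        # Walk the two-rooted classification hierarchy for the first assigned family
        logger.info("Lineage %r", scopP.getNameLineage(famIdL[0]))
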
Esempio n. 23
0
class IMGTTargetFeatureProvider(StashableBase):
    """Accessors for IMGT (The International Immunogenetic Information System) target features"""

    # Link out using the IMGT -
    # http://www.imgt.org/3Dstructure-DB/cgi/details.cgi?pdbcode=5w5m&Part=Chain
    #
    def __init__(self, **kwargs):
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirName = "IMGT-features"
        super(IMGTTargetFeatureProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__fD = self.__reload(self.__dirPath, useCache)
        #

    def testCache(self, minCount=20000):
        logger.info(
            "IMGT feature count %d",
            len(self.__fD["features"]) if "features" in self.__fD else 0)
        if self.__fD and "features" in self.__fD and len(
                self.__fD["features"]) > minCount:
            return True
        else:
            return False

    def hasFeatures(self, rcsbInstanceId):
        """Return if features exist for the input instance identifier (auth_asym_id)

        Args:
            rcsbInstanceId (str): <pdbId (lower case)>.<auth_asym_id (case sensitive)>

        Returns:
            bool: True for success or False otherwise
        """
        return rcsbInstanceId in self.__fD["features"]

    def getFeatures(self, rcsbInstanceId):
        """Return features for the instance identifier (auth_asym_id)

        Args:
            rcsbInstanceId (str): <pdbId (lower case)>.<auth_asym_id (case sensitive)>

        Returns:
            list: list of feature dictionaries
        """
        try:
            return self.__fD["features"][rcsbInstanceId]
        except Exception:
            return []

    def __getFeatureDataPath(self):
        return os.path.join(self.__dirPath, "IMGT-feature-data.json")

    def reload(self):
        self.__fD = self.__reload(self.__dirPath, True)
        return True

    def __reload(self, dirPath, useCache):
        startTime = time.time()
        fD = {}
        featurePath = self.__getFeatureDataPath()
        #
        logger.info("useCache %r featurePath %r", useCache, featurePath)
        if useCache and self.__mU.exists(featurePath):
            fD = self.__mU.doImport(featurePath, fmt="json")
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload (useCache %r) at %s (%.4f seconds)",
                    useCache,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return fD

    def buildFeatureList(self, useCache=True):
        """Build polymer instance feature list for IMGT annotations.

        Returns:
            bool: True for success or False otherwise

                    5w5m_B": {
                    "description": "FUSION-TNFRSF1B-GAMMA-1",
                    "domains": {
                        "C-DOMAIN|CH2|1": {
                        "geneAlleles": [
                            {
                                "taxName": "H**o sapiens",
                                "geneAllele": "IGHG4*01"
                            },
                            {
                                "taxName": "H**o sapiens",
                                "geneAllele": "IGHG4*03"
                            },
                            {
                                "taxName": "H**o sapiens",
                                "geneAllele": "IGHG4*04"
                            }
                        ],
                        "alignment": {
                            "begEntitySeqId": 7,
                            "endEntitySeqId": 116,
                            "begIMGTSeqId": "1",
                            "endIMGTSeqId": "105"
                        }
                        },
                        "C-DOMAIN|CH3|2": {
                        "geneAlleles": [
                            {
                                "taxName": "H**o sapiens",
                                "geneAllele": "IGHG4*01"
                            },
                            {
                                "taxName": "H**o sapiens",
                                "geneAllele": "IGHG4*04"
                            }
                        ],
                        "alignment": {
                            "begEntitySeqId": 117,
                            "endEntitySeqId": 221,
                            "begIMGTSeqId": "106",
                            "endIMGTSeqId": "209"
                        }
                        }
                    },
                    "proteinName": "IgG4 Sigma1 Fc",
                    "receptorType": "IG",
                    "receptorDescription": "FUSION-[TNFRSF1B]2-FC-GAMMA-1",
                    "species": "H**o sapiens (human)"
                },
        """
        rDL = []
        imgtP = IMGTTargetProvider(cachePath=self.__cachePath,
                                   useCache=useCache)
        #
        provenanceSource = "IMGT"
        refScheme = "PDB entity"
        assignVersion = imgtP.getVersion()
        #
        chainD = imgtP.getChains()
        #
        fTupL = [
            ("description", "IMGT_ANTIBODY_DESCRIPTION"),
            ("proteinName", "IMGT_ANTIBODY_PROTEIN_NAME"),
            ("receptorType", "IMGT_ANTIBODY_RECEPTOR_TYPE"),
            ("receptorDescription", "IMGT_ANTIBODY_RECEPTOR_DESCRIPTION"),
            ("species", "IMGT_ANTIBODY_ORGANISM_NAME"),
        ]
        ii = 1
        #
        for chainId, chD in chainD.items():
            entryId = chainId[:4]
            authAsymId = chainId.split("_")[1]
            # descriptive features -
            for fTup in fTupL:
                rD = {
                    "entry_id": entryId,
                    "auth_asym_id": authAsymId,
                    "type": fTup[1],
                    "feature_id": "IMGT_" + str(ii),
                    "name": chD[fTup[0]] if fTup[0] in chD else None,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "feature_positions": [],
                }
                rDL.append(rD)
                ii += 1
            # domain features -
            if "domains" not in chD:
                continue
            for domainId, dD in chD["domains"].items():
                dIdL = domainId.split("|")
                domainName = dIdL[0] + " " + dIdL[1]
                begSeqId = endSeqId = None
                if "alignment" in dD:
                    begSeqId = dD["alignment"]["begEntitySeqId"]
                    endSeqId = dD["alignment"]["endEntitySeqId"]
                else:
                    logger.debug("%r missing alignment for in %r", chainId, dD)
                #
                gaL = []
                if "geneAlleles" in dD:
                    for gD in dD["geneAlleles"]:
                        gaL.append(gD["geneAllele"])
                else:
                    logger.debug("%r missing gene and alleles for in %r",
                                 chainId, dD)
                #
                #
                rD = {
                    "entry_id": entryId,
                    "auth_asym_id": authAsymId,
                    "type": "IMGT_ANTIBODY_DOMAIN_NAME",
                    "feature_id": "IMGT_" + str(ii),
                    "name": domainName,
                    "provenance_source": provenanceSource,
                    "reference_scheme": refScheme,
                    "assignment_version": assignVersion,
                    "feature_positions": [{"beg_seq_id": begSeqId, "end_seq_id": endSeqId}],
                }
                rDL.append(rD)
                ii += 1
                #
                for ga in gaL:
                    rD = {
                        "entry_id": entryId,
                        "auth_asym_id": authAsymId,
                        "type": "IMGT_ANTIBODY_GENE_ALLELE_NAME",
                        "feature_id": "IMGT_" + str(ii),
                        "name": ga,
                        "provenance_source": provenanceSource,
                        "reference_scheme": refScheme,
                        "assignment_version": assignVersion,
                        "feature_positions": [{"beg_seq_id": begSeqId, "end_seq_id": endSeqId}],
                    }
                    rDL.append(rD)
                    ii += 1
        #
        qD = {}
        for rD in rDL:
            eId = rD["entry_id"] + "." + rD["auth_asym_id"]
            qD.setdefault(eId, []).append(rD)
        #
        logger.info("IMGT antibody chain features (%d)", len(qD))
        #
        fp = self.__getFeatureDataPath()
        tS = datetime.datetime.now().isoformat()
        vS = assignVersion
        ok = self.__mU.doExport(fp, {
            "version": vS,
            "created": tS,
            "features": qD
        },
                                fmt="json",
                                indent=3)
        return ok
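
A short usage sketch for the feature provider above (illustrative only; the cache path and the instance identifier "5w5m.B" are placeholder assumptions, and the JSON cache must already have been built with buildFeatureList()):

imgtFp = IMGTTargetFeatureProvider(cachePath=".", useCache=True)
if imgtFp.testCache():
    instanceId = "5w5m.B"  # <pdbId (lower case)>.<auth_asym_id (case sensitive)>
    if imgtFp.hasFeatures(instanceId):
        for fObj in imgtFp.getFeatures(instanceId):
            logger.info("%s: %r", fObj["type"], fObj["name"])
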
Esempio n. 24
0
class RepoHoldingsDataPrep(object):
    """Consolidate legacy data describing repository content updates and repository entry status."""
    def __init__(self, **kwargs):
        self.__cfgOb = kwargs.get("cfgOb", None)
        self.__cachePath = kwargs.get("cachePath", None)
        self.__sandboxPath = kwargs.get("sandboxPath", None)
        self.__filterType = kwargs.get("filterType", "")
        self.__assignDates = "assign-dates" in self.__filterType
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__currentCacheD = None
        #

    def getHoldingsCombinedEntry(self, updateId, dirPath=None):
        dList = []
        retD = self.__getHoldingsCombined(dirPath=dirPath)
        for entryId, qD in retD.items():
            tD = {
                "rcsb_id": entryId,
                "entry_id": entryId,
                "update_id": updateId
            }
            rD = {
                "rcsb_id": entryId,
                "rcsb_repository_holdings_combined_entry_container_identifiers":
                tD,
                "rcsb_repository_holdings_combined": qD,
            }
            dList.append(rD)
        return dList

    def __getHoldingsCombined(self, dirPath=None):
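        # Combine the legacy holdings lists into a single per-entry status dictionary:
        #   current entries      -> {"status": "CURRENT",    "status_code": "REL"}
        #   unreleased entries   -> {"status": "UNRELEASED", "status_code": <processing status code>}
        #   transferred entries  -> {"status": "REMOVED",    "status_code": "TRSF"}
        #   removed entries      -> {"status": "REMOVED",    "status_code": "OBS"}, following the
        #                           superseded-by chain to the latest current entry when possible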
        retD = {}
        dirPath = dirPath if dirPath else self.__sandboxPath
        currentD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent(
            dirPath=dirPath)
        for entryId, tD in currentD.items():
            retD[entryId] = {"status": "CURRENT", "status_code": "REL"}
        logger.debug("Released entries %d", len(retD))
        #
        unRelD = self.__getHoldingsUnreleased(dirPath=dirPath)
        # logger.info("@@@ unRelD %r", unRelD)
        for entryId, tD in unRelD.items():
            if entryId not in retD and tD["status_code"] in [
                    "AUCO", "AUTH", "HOLD", "HPUB", "POLC", "PROC", "REFI",
                    "REPL", "WAIT", "WDRN"
            ]:
                retD[entryId] = {
                    "status": "UNRELEASED",
                    "status_code": tD["status_code"]
                }
        logger.debug("Released & unreleased entries %d", len(retD))
        #
        trfD, _ = self.__getHoldingsTransferred(dirPath=dirPath)
        for entryId, tD in trfD.items():
            if entryId not in retD and tD["status_code"] in ["TRSF"]:
                retD[entryId] = {
                    "status": "REMOVED",
                    "status_code": tD["status_code"]
                }
        #
        logger.debug("Released & unreleased & transferred entries %d",
                     len(retD))
        #
        rmvD, _, replacesD = self.__getHoldingsRemoved(dirPath=dirPath)
        #
        # for entryId in rmvD:
        #    if entryId not in retD:
        #        retD[entryId] = {"status": "REMOVED", "status_code": "OBS"}
        #
        replacedByD = {}
        for entryId, tD in replacesD.items():
            for sId in tD["id_codes_superseded"]:
                replacedByD[sId.strip().upper()] = entryId.strip().upper()
        #
        logger.info("replacedbyD (%d) rmvD (%d) currentD (%d) retD (%d)",
                    len(replacedByD), len(rmvD), len(currentD), len(retD))
        for entryId in rmvD:
            if entryId in currentD:
                continue
            tId = entryId
            if tId in replacedByD:
                if tId == replacedByD[tId]:
                    logger.info("Inconsistent obsolete entry info for %r", tId)
                while tId in replacedByD and tId != replacedByD[tId]:
                    # logger.debug("tId %r replacedByD[tId] %r", tId, replacedByD[tId])
                    tId = replacedByD[tId]
                if tId in currentD:
                    retD[entryId] = {
                        "status": "REMOVED",
                        "status_code": "OBS",
                        "id_code_replaced_by_latest": tId
                    }
                else:
                    logger.debug("%r missing replacedby entry %r", entryId,
                                 tId)
            else:
                retD[entryId] = {"status": "REMOVED", "status_code": "OBS"}
        #
        logger.debug(
            "Released & unreleased & transferred & removed entries %d",
            len(retD))
        return retD

    def getHoldingsCurrentEntry(self, updateId, dirPath=None):
        dList = []
        retD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent(
            dirPath=dirPath)
        self.__currentCacheD = retD
        for entryId, qD in retD.items():
            tD = ({
                "rcsb_id": entryId,
                "entry_id": entryId,
                "update_id": updateId,
                "assembly_ids": qD["assembly_ids"]
            } if "assembly_ids" in qD else {
                "rcsb_id": entryId,
                "entry_id": entryId,
                "update_id": updateId
            })
            rD = {
                "rcsb_id": entryId,
                "rcsb_repository_holdings_current_entry_container_identifiers":
                tD,
                "rcsb_repository_holdings_current": {
                    "repository_content_types": qD["repository_content_types"]
                },
            }
            dList.append(rD)
        return dList

    def getHoldingsUpdateEntry(self, updateId, dirPath=None):
        dList = []
        retD = self.__getHoldingsUpdate(dirPath=dirPath)
        for entryId, qD in retD.items():
            tD = {
                "rcsb_id": entryId,
                "entry_id": entryId,
                "update_id": updateId
            }
            rD = {
                "rcsb_id": entryId,
                "rcsb_repository_holdings_update_entry_container_identifiers":
                tD,
                "rcsb_repository_holdings_update": qD,
            }
            dList.append(rD)
        return dList

    def getHoldingsUnreleasedEntry(self, updateId, dirPath=None):
        dList = []
        retD = self.__getHoldingsUnreleased(dirPath=dirPath)
        prD = self.__getHoldingsPrerelease(dirPath=dirPath)
        currentD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent(
            dirPath=dirPath)
        self.__currentCacheD = currentD
        for entryId, qD in retD.items():
            if entryId in currentD:
                continue
            rD = {"rcsb_id": entryId}
            rD["rcsb_repository_holdings_unreleased_entry_container_identifiers"] = {
                "rcsb_id": entryId,
                "entry_id": entryId,
                "update_id": updateId
            }
            if entryId in prD:
                rD["rcsb_repository_holdings_prerelease"] = prD[entryId]
                qD["prerelease_sequence_available_flag"] = "Y"
            else:
                qD["prerelease_sequence_available_flag"] = "N"
            rD["rcsb_repository_holdings_unreleased"] = qD
            #
            dList.append(rD)
        return dList

    def getHoldingsRemovedEntry(self, updateId, dirPath=None):
        dList = []
        rmvD, aaD, spsD = self.__getHoldingsRemoved(dirPath=dirPath)
        trfD, insD = self.__getHoldingsTransferred(dirPath=dirPath)
        currentD = self.__currentCacheD if self.__currentCacheD else self.__getHoldingsCurrent(
            dirPath=dirPath)
        self.__currentCacheD = currentD
        #
        # Get the list of candidate keys for removed entries -
        #
        entryIdL = sorted(set(list(insD.keys()) + list(rmvD.keys())))
        for entryId in entryIdL:
            if entryId in currentD:
                continue
            rD = {"rcsb_id": entryId}
            rD["rcsb_repository_holdings_removed_entry_container_identifiers"] = {
                "rcsb_id": entryId,
                "entry_id": entryId,
                "update_id": updateId
            }
            #
            if entryId in rmvD:
                rD["rcsb_repository_holdings_removed"] = rmvD[entryId]
            if entryId in aaD:
                rD["rcsb_repository_holdings_removed_audit_author"] = aaD[
                    entryId]
            if entryId in spsD:
                rD["rcsb_repository_holdings_superseded"] = spsD[entryId]
            if entryId in trfD:
                rD["rcsb_repository_holdings_transferred"] = trfD[entryId]
            if entryId in insD:
                rD["rcsb_repository_holdings_insilico_models"] = insD[entryId]
            dList.append(rD)
        return dList

    def __getHoldingsTransferred(self, dirPath=None):
        """Parse legacy lists defining the repository contents transferred to alternative repositories

        Args:
            dirPath (str, optional): directory path containing the status list files (defaults to the sandbox path)

        Returns:
            (dict): dictionaries containing data for rcsb_repository_holdings_transferred
            (dict): dictionaries containing data for rcsb_repository_holdings_insilico_models

        Example input data:

        ma-czyyf : 262D - TITLE A THREE-DIMENSIONAL MODEL OF THE REV BINDING ELEMENT OF HIV- TITLE 2 1 DERIVED FROM ANALYSES OF IN VITRO SELECTED VARIANTS
        ma-cfqla : 163D - TITLE A THREE-DIMENSIONAL MODEL OF THE REV BINDING ELEMENT OF HIV- TITLE 2 1 DERIVED FROM ANALYSES OF IN VITRO SELECTED VARIANTS

        and -

        1DX2    REL 1999-12-16  2000-12-15  Tumour Targetting Human ...  Beiboer, S.H.W., Reurs, A., Roovers, R.C., Arends, J., Whitelegg, N.R.J., Rees, A.R., Hoogenboom, H.R.

        and -

        1APD    OBSLTE  1992-10-15      2APD
        1BU0    OBSLTE  1998-10-07      2BU0
        1CLJ    OBSLTE  1998-03-04      2CLJ
        1DU8    OBSLTE  2001-01-31      1GIE
        1I2J    OBSLTE  2001-01-06      1JA5

        """
        trsfD = {}
        insD = {}
        dirPath = dirPath if dirPath else self.__sandboxPath

        try:
            fp = os.path.join(dirPath, "status",
                              "theoretical_model_obsolete.tsv")
            lineL = self.__mU.doImport(fp, "list")  # pylint: disable=no-member
            #
            obsDateD = {}
            obsIdD = {}
            for line in lineL:
                fields = line.split("\t")
                if len(fields) < 3:
                    continue
                entryId = str(fields[0]).strip().upper()
                obsDateD[entryId] = dateutil.parser.parse(
                    fields[2]) if self.__assignDates else fields[2]
                if len(fields) > 3 and len(fields[3]) > 3:
                    obsIdD[entryId] = str(fields[3]).strip().upper()
            logger.debug("Read %d obsolete insilico id codes", len(obsDateD))
            # ---------  ---------  ---------  ---------  ---------  ---------  ---------
            fp = os.path.join(dirPath, "status",
                              "model-archive-PDB-insilico-mapping.list")
            lineL = self.__mU.doImport(fp, "list")
            #
            trD = {}
            for line in lineL:
                fields = line.split(":")
                if len(fields) < 2:
                    continue
                entryId = str(fields[1]).strip().upper()[:4]
                maId = str(fields[0]).strip()
                trD[entryId] = maId
            logger.debug("Read %d model archive id codes", len(trD))
            #
            # ---------  ---------  ---------  ---------  ---------  ---------  ---------
            fp = os.path.join(dirPath, "status", "theoretical_model_v2.tsv")
            lineL = self.__mU.doImport(fp, "list")
            #
            logger.debug("Read %d insilico id codes", len(lineL))
            for line in lineL:
                fields = str(line).split("\t")
                if len(fields) < 6:
                    continue
                depDate = dateutil.parser.parse(
                    fields[2]) if self.__assignDates else fields[2]
                relDate = None
                if len(fields[3]) >= 10 and not fields[3].startswith("0000"):
                    relDate = dateutil.parser.parse(
                        fields[3]) if self.__assignDates else fields[3]

                statusCode = "TRSF" if fields[1] == "REL" else fields[1]

                entryId = str(fields[0]).upper()
                title = fields[4]
                #
                auditAuthors = [t.strip() for t in fields[5].split(";")]
                repId = None
                if entryId in trD:
                    repName = "Model Archive"
                    repId = trD[entryId]

                #
                dD = {
                    "status_code": statusCode,
                    "deposit_date": depDate,
                    "repository_content_types": ["coordinates"],
                    "title": title,
                    "audit_authors": auditAuthors,
                }
                #
                if relDate:
                    dD["release_date"] = relDate
                #
                if repId:
                    dD["remote_accession_code"] = repId
                    dD["remote_repository_name"] = repName
                if statusCode == "TRSF":
                    trsfD[entryId] = dD
                #
                #
                dD = {
                    "status_code": statusCode,
                    "deposit_date": depDate,
                    "title": title,
                    "audit_authors": auditAuthors
                }
                #
                if relDate:
                    dD["release_date"] = relDate
                #
                if entryId in obsDateD:
                    dD["remove_date"] = relDate
                #
                if entryId in obsIdD:
                    dD["id_codes_replaced_by"] = [obsIdD[entryId]]
                #
                insD[entryId] = dD
            #
            logger.info("Transferred entries %d - insilico models %d",
                        len(trsfD), len(insD))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return trsfD, insD

    def __getHoldingsUpdate(self, dirPath=None):
        """Parse legacy lists defining the contents of the repository update

        Args:
            updateId (str): update identifier (e.g. 2018_32)
            dirPath (str): directory path containing update list files
            **kwargs: unused

        Returns:
            list: List of dictionaries containing rcsb_repository_holdings_update
        """
        retD = {}
        dirPath = dirPath if dirPath else self.__sandboxPath
        try:
            updateTypeList = ["added", "modified", "obsolete"]
            contentTypeList = ["entries", "mr", "cs", "sf", "nef", "nmr-str"]
            contentNameD = {
                "entries": "coordinates",
                "mr": "NMR restraints",
                "cs": "NMR chemical shifts",
                "sf": "structure factors",
                "nef": "Combined NMR data (NEF)",
                "nmr-str": "Combined NMR data (NMR-STAR)",
            }
            #
            for updateType in updateTypeList:
                uD = {}
                for contentType in contentTypeList:
                    fp = os.path.join(dirPath, "update-lists",
                                      updateType + "-" + contentType)
                    if not self.__mU.exists(fp):
                        continue
                    entryIdL = self.__mU.doImport(fp, "list")
                    #
                    for entryId in entryIdL:
                        entryId = entryId.strip().upper()
                        uD.setdefault(entryId,
                                      []).append(contentNameD[contentType])
                for entryId in uD:
                    uType = "removed" if updateType == "obsolete" else updateType
                    # retD[entryId] = {"update_id": updateId, "entry_id": entryId, "update_type": uType, "repository_content_types": uD[entryId]}
                    retD[entryId] = {
                        "update_type": uType,
                        "repository_content_types": uD[entryId]
                    }
            return retD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return retD

    def __getHoldingsCurrent(self, dirPath=None):
        """Parse legacy lists defining the current contents of the repository update

        Args:
            updateId (str): update identifier (e.g. 2018_32)
            dirPath (str): directory path containing update list files
            **kwargs: unused

        Returns:
            list: List of dictionaries containing data for rcsb_repository_holdings_current
        """
        rD = {}
        retD = {}
        dirPath = dirPath if dirPath else self.__sandboxPath
        try:
            updateTypeList = ["all"]
            contentTypeList = [
                "pdb", "pdb-format", "mr", "cs", "sf", "nef", "nmr-str"
            ]
            contentNameD = {
                "pdb": "coordinates",
                "pdb-format": "PDB format coordinates",
                "mr": "NMR restraints",
                "cs": "NMR chemical shifts",
                "sf": "structure factors",
                "nef": "Combined NMR data (NEF)",
                "nmr-str": "Combined NMR data (NMR-STAR)",
            }
            #
            tD = {}
            for updateType in updateTypeList:
                for contentType in contentTypeList:
                    fp = os.path.join(dirPath, "update-lists",
                                      updateType + "-" + contentType + "-list")
                    if not self.__mU.exists(fp):
                        continue
                    entryIdL = self.__mU.doImport(fp, "list")
                    #
                    for entryId in entryIdL:
                        entryId = entryId.strip().upper()
                        if entryId not in tD:
                            tD[entryId.upper()] = {}
                        tD[entryId.upper()][contentNameD[contentType]] = True
            #
            fp = os.path.join(dirPath, "status", "biounit_file_list.tsv")
            lines = self.__mU.doImport(fp, "list")
            assemD = {}
            for line in lines:
                fields = line.split("\t")
                entryId = fields[0].strip().upper()
                assemId = fields[1].strip()
                if entryId not in assemD:
                    assemD[entryId.upper()] = []
                assemD[entryId.upper()].append(assemId)
            #
            #
            fp = os.path.join(dirPath, "status", "pdb_bundle_index_list.tsv")
            bundleIdList = self.__mU.doImport(fp, "list")
            bundleD = {}
            for entryId in bundleIdList:
                bundleD[entryId.strip().upper()] = True
            #
            fp = os.path.join(dirPath, "status",
                              "validation_report_list_new.tsv")
            vList = self.__mU.doImport(fp, "list")
            valD = {}
            valImageD = {}
            valCifD = {}
            for line in vList:
                fields = line.split("\t")
                entryId = fields[0].strip().upper()
                imageFlag = fields[1].strip().upper()
                valD[entryId] = True
                valImageD[entryId] = imageFlag == "Y"
                if len(fields) > 2:
                    valCifD[entryId] = fields[2].strip().upper() == "Y"
            #
            #
            fp = os.path.join(dirPath, "status",
                              "entries_without_polymers.tsv")
            pList = self.__mU.doImport(fp, "list")
            pD = {}
            for entryId in pList:
                pD[entryId.strip().upper()] = False
            #
            #
            fp = os.path.join(dirPath, "status", "nmr_restraints_v2_list.tsv")
            nmrV2List = self.__mU.doImport(fp, "list")
            nmrV2D = {}
            for entryId in nmrV2List:
                nmrV2D[entryId.strip().upper()] = False
            #
            if self.__cfgOb:
                configName = self.__cfgOb.getDefaultSectionName()
                fp = self.__cfgOb.getPath("RCSB_EDMAP_LIST_PATH",
                                          sectionName=configName)
            else:
                fp = os.path.join(dirPath, "status", "edmaps.json")
            qD = self.__mU.doImport(fp, "json")
            edD = {}
            for entryId in qD:
                edD[entryId.upper()] = qD[entryId]
            #
            fp = os.path.join(dirPath, "status", "obsolete_entry.json_2")
            oL = self.__mU.doImport(fp, "json")
            obsD = {}
            for dD in oL:
                obsD[dD["entryId"].upper()] = True
            logger.info("Removed entry length %d", len(obsD))
            #
            #
            # Revise content types bundles and assemblies
            #
            for qId, dD in tD.items():
                entryId = qId.strip().upper()
                if entryId in obsD:
                    continue
                rD[entryId] = []
                if entryId in bundleD:
                    rD[entryId].append("entry PDB bundle")
                if "coordinates" in dD:
                    rD[entryId].append("entry mmCIF")
                    rD[entryId].append("entry PDBML")
                if "PDB format coordinates" in dD:
                    rD[entryId].append("entry PDB")
                if entryId in assemD:
                    if entryId in bundleD:
                        rD[entryId].append("assembly mmCIF")
                    else:
                        rD[entryId].append("assembly PDB")
                #
                for cType in dD:
                    if cType not in [
                            "coordinates", "PDB format coordinates",
                            "NMR restraints"
                    ]:
                        rD[entryId].append(cType)
                    if cType == "NMR restraints":
                        rD[entryId].append("NMR restraints V1")

                if entryId in nmrV2D:
                    rD[entryId].append("NMR restraints V2")
                #
                if entryId in valD:
                    rD[entryId].append("validation report")
                if entryId in valImageD and valImageD[entryId]:
                    rD[entryId].append("validation slider image")
                if entryId in valCifD and valCifD[entryId]:
                    rD[entryId].append("validation data mmCIF")
                if entryId in edD:
                    rD[entryId].append("2fo-fc Map")
                    rD[entryId].append("fo-fc Map")
                    rD[entryId].append("Map Coefficients")
                if entryId not in pD:
                    rD[entryId].append("FASTA sequence")
            #
            for entryId in rD:
                if entryId in assemD:
                    retD[entryId] = {
                        "assembly_ids": assemD[entryId],
                        "repository_content_types": rD[entryId]
                    }
                else:
                    retD[entryId] = {"repository_content_types": rD[entryId]}
            return retD
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return retD

    def __getHoldingsUnreleased(self, dirPath=None):
        """Parse the legacy exchange status file containing details for unreleased entries:

        Args:
            updateId (str): update identifier (e.g. 2018_32)
            dirPath (str): directory path containing update list files
            **kwargs: unused

        Returns:
            list: List of dictionaries containing data for rcsb_repository_holdings_unreleased

        """
        retD = {}
        fields = []
        dirPath = dirPath if dirPath else self.__sandboxPath
        try:
            #
            fp = os.path.join(dirPath, "status", "status_v2.txt")
            lines = self.__mU.doImport(fp, "list")
            for line in lines:
                fields = line.split("\t")
                if len(fields) < 15:
                    continue
                entryId = fields[1]
                dD = {
                    "status_code": fields[2]
                    # 'sg_project_name': fields[14],
                    # 'sg_project_abbreviation_': fields[15]}
                }
                if fields[11] and fields[11].strip():
                    dD["title"] = fields[11]
                if fields[10] and fields[10].strip():
                    dD["audit_authors"] = [
                        t.strip() for t in fields[10].split(";")
                    ]
                    # d['audit_authors'] = fields[10]
                if fields[12] and fields[12].strip():
                    dD["author_prerelease_sequence_status"] = str(
                        fields[12]).strip().replace("REALEASE", "RELEASE")
                dTupL = [
                    ("deposit_date", 3),
                    ("deposit_date_coordinates", 4),
                    ("deposit_date_structure_factors", 5),
                    ("hold_date_structure_factors", 6),
                    ("deposit_date_nmr_restraints", 7),
                    ("hold_date_nmr_restraints", 8),
                    ("release_date", 9),
                    ("hold_date_coordinates", 13),
                ]
                for dTup in dTupL:
                    fN = dTup[1]
                    if fields[fN] and len(fields[fN]) >= 4:
                        dD[dTup[0]] = dateutil.parser.parse(
                            fields[fN]) if self.__assignDates else fields[fN]
                #
                retD[entryId] = {k: v for k, v in dD.items() if v}
        except Exception as e:
            logger.error("Fields: %r", fields)
            logger.exception("Failing with %s", str(e))

        return retD

    def __getHoldingsRemoved(self, dirPath=None):
        """Parse the legacy exchange file containing details of removed entries:

            {
                "entryId": "125D",
                "obsoletedDate": "1998-04-15",
                "title": "SOLUTION STRUCTURE OF THE DNA-BINDING DOMAIN OF CD=2=-GAL4 FROM S. CEREVISIAE",
                "details": "",
                "depositionAuthors": [
                    "Baleja, J.D.",
                    "Wagner, G."
                ],
                "depositionDate": "1993-05-05",
                "releaseDate": "1994-01-31",
                "obsoletedBy": [
                    "1AW6"
                ],
                "content_type": [
                    "entry mmCIF",
                    "entry PDB",
                    "entry PDBML",
                    "structure factors"
                ]},

        Returns:
            (dict) : dictionaries for rcsb_repository_holdings_removed
            (dict) : dictionaries for rcsb_repository_holdings_removed_audit_authors
            (dict) : dictionaries for rcsb_repository_holdings_superseded

        """
        # rcsb_repository_holdings_removed
        rL1D = {}
        # rcsb_repository_holdings_removed_audit_authors
        rL2D = {}
        # rcsb_repository_holdings_superseded
        rL3D = {}
        #
        sD = {}
        dirPath = dirPath if dirPath else self.__sandboxPath
        try:
            fp = os.path.join(dirPath, "status", "obsolete_entry.json_2")
            dD = self.__mU.doImport(fp, "json")
            for dT in dD:
                # ---
                ctL = dT["content_type"] if "content_type" in dT else []
                # ---
                rbL = dT["obsoletedBy"] if "obsoletedBy" in dT else []
                d1 = {
                    "title": dT["title"],
                    "details": dT["details"],
                    "audit_authors": dT["depositionAuthors"]
                }
                if rbL:
                    d1["id_codes_replaced_by"] = [t.upper() for t in rbL]
                if ctL:
                    d1["repository_content_types"] = ctL

                dTupL = [("deposit_date", "depositionDate"),
                         ("remove_date", "obsoletedDate"),
                         ("release_date", "releaseDate")]
                for dTup in dTupL:
                    fN = dTup[1]
                    if dT[fN] and len(dT[fN]) > 4:
                        d1[dTup[0]] = dateutil.parser.parse(
                            dT[fN]) if self.__assignDates else dT[fN]

                rL1D[dT["entryId"]] = {k: v for k, v in d1.items() if v}
                #
                for ii, author in enumerate(dT["depositionAuthors"]):
                    d2 = {"ordinal_id": ii + 1, "audit_author": author}
                    rL2D.setdefault(dT["entryId"], []).append(d2)
                if "obsoletedBy" in dT:
                    for pdbId in dT["obsoletedBy"]:
                        if pdbId not in sD:
                            sD[pdbId] = []
                        sD[pdbId].append(dT["entryId"])
            #
            for pdbId in sD:
                if sD[pdbId]:
                    rL3D[pdbId] = {"id_codes_superseded": sD[pdbId]}

            logger.debug("Computed data lengths  %d %d %d", len(rL1D),
                         len(rL2D), len(rL3D))
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return rL1D, rL2D, rL3D

    def __getHoldingsPrerelease(self, dirPath=None):
        """Parse the legacy exchange status file containing prerelease sequence data.

        Args:
            updateId (str): update identifier (e.g. 2018_32)
            dirPath (str): directory path containing update list files
            **kwargs: unused

        Returns:
            list: List of dictionaries containing data for rcsb_repository_holdings_prerelease

        >6I99 Entity 1
        HHHHHHENLYFQGELKREEITLLKELGSGQFGVVKLGKWKGQYDVAVKMIKEG....
        >6JKE Entity 1
        GRVTNQLQYLHKVVMKALWKHQFAWPFRQPVDAVKLGLPDYHKIIKQPMDMGTI....

        """
        retD = {}
        fields = []
        dirPath = dirPath if dirPath else self.__sandboxPath
        try:
            # Get prerelease sequence data
            fp = os.path.join(dirPath, "sequence", "pdb_seq_prerelease.fasta")
            sD = self.__mU.doImport(fp, "fasta", commentStyle="prerelease")
            seqD = {}
            for sid in sD:
                fields = sid.split("_")
                entryId = str(fields[0]).upper()
                entityId = str(fields[1])
                if entryId not in seqD:
                    seqD[entryId] = []
                seqD[entryId].append((entityId, sD[sid]["sequence"]))
            logger.debug("Loaded prerelease sequences for %d entries",
                         len(seqD))
            #
            for entryId, seqTupL in seqD.items():
                # dD = {"seq_one_letter_code": seqL}
                logger.debug("Adding prerelease sequences for %s", entryId)
                for entityId, seqS in seqTupL:
                    if not seqS:
                        continue
                    retD.setdefault(entryId, []).append({
                        "entity_id":
                        entityId,
                        "seq_one_letter_code":
                        seqS
                    })
                #
                # retD[entryId] = {k: v for k, vTup in dD.items() if vTup[1]}
        except Exception as e:
            logger.error("Fields: %r", fields)
            logger.exception("Failing with %s", str(e))

        return retD
class IMGTTargetProvider(StashableBase):
    """Accessors for IMGT target annotations."""
    def __init__(self, cachePath, useCache, **kwargs):
        #
        self.__cachePath = cachePath
        self.__dirName = "IMGT-targets"
        imgtDumpUrl = kwargs.get("IMGTDumpUrl", None)
        super(IMGTTargetProvider, self).__init__(self.__cachePath,
                                                 [self.__dirName])
        self.__dirPath = os.path.join(self.__cachePath, self.__dirName)
        self.__version = None
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__imgtD = self.__reload(self.__dirPath,
                                     useCache=useCache,
                                     imgtDumpUrl=imgtDumpUrl)
        #

    def testCache(self, minCount=1000):
        if self.__imgtD and "chains" in self.__imgtD and len(
                self.__imgtD["chains"]) > minCount:
            return True
        else:
            return False

    def getVersion(self):
        return self.__version

    def getChains(self):
        return self.__imgtD["chains"]

    def __reload(self,
                 dirPath,
                 useCache=False,
                 imgtDumpUrl=None,
                 testList=None,
                 maxCount=None):
        imgtD = {}
        startTime = time.time()

        fU = FileUtil()
        fU.mkdir(dirPath)
        #
        imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json")
        #
        logger.info("useCache %r imgtFeaturePath %r", useCache, imgtDataPath)
        if useCache and self.__mU.exists(imgtDataPath):
            imgtD = self.__mU.doImport(imgtDataPath, fmt="json")
            self.__version = imgtD["version"]
        else:
            imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz"
            imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE"
            imgtDumpFileName = fU.getFileName(imgtDumpUrl)
            imgtDumpPath = os.path.join(dirPath, imgtDumpFileName)
            imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt")
            _, fn = os.path.split(imgtDumpUrl)
            imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4])
            #
            logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath)
            ok1 = fU.get(imgtDumpUrl, imgtDumpPath)
            ok2 = fU.get(imgtReadmeUrl, imgtReleasePath)
            fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath)
            logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1
                        and ok2,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
            # ---
            readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list")
            self.__version = readmeLines[0].strip() if readmeLines else None
            logger.info("IMGT version %r", self.__version)
            # ---
            chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath,
                                                        maxCount=maxCount,
                                                        testList=testList)
            # ---
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            if testList:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD,
                    "raw": rawD
                }
            else:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD
                }
            ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3)
            logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)",
                        ok, time.strftime("%Y %m %d %H:%M:%S",
                                          time.localtime()),
                        time.time() - startTime)
        return imgtD

    def exportFasta(self, withGaps=False):
        """
        Example:
            The IMGT/GENE-DB FASTA header contains 15 fields separated by '|':

            1. IMGT/LIGM-DB accession number(s)
            2. IMGT gene and allele name
            3. species (may be followed by an "_" and the name of the strain, breed or isolate, if defined)
            4. IMGT gene and allele functionality
            5. exon(s), region name(s), or extracted label(s)
            6. start and end positions in the IMGT/LIGM-DB accession number(s)
            7. number of nucleotides in the IMGT/LIGM-DB accession number(s)
            8. codon start, or 'NR' (not relevant) for non coding labels
            9. +n: number of nucleotides (nt) added in 5' compared to the corresponding label extracted from IMGT/LIGM-DB
            10. +n or -n: number of nucleotides (nt) added or removed in 3' compared to the corresponding label extracted from IMGT/LIGM-DB
            11. +n, -n, and/or nS: number of added, deleted, and/or substituted nucleotides to correct sequencing errors, or 'not corrected' if non corrected sequencing errors
            12. number of amino acids (AA): this field indicates that the sequence is in amino acids
            13. number of characters in the sequence: nt (or AA)+IMGT gaps=total
            14. partial (if it is)
            15. reverse complementary (if it is)

        """
        # --
        fU = FileUtil()
        fU.mkdir(self.__dirPath)
        if withGaps:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithGaps-F+ORF+inframeP"
        else:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithoutGaps-F+ORF+inframeP"
        imgtTargetFileName = fU.getFileName(imgtTargetUrl)
        rawFastaPath = os.path.join(self.__dirPath, imgtTargetFileName)
        # --
        logger.debug("Fetching url %s path %s", imgtTargetUrl, rawFastaPath)
        ok = fU.get(imgtTargetUrl, rawFastaPath)
        logger.info("Fetch status (%r) url %s path %s", ok, imgtTargetUrl,
                    rawFastaPath)
        # --
        fastaPath = os.path.join(self.__dirPath, "imgt-reference.fa")
        taxonPath = os.path.join(self.__dirPath, "imgt-reference-taxon.tdd")
        tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=True)
        ok = tP.testCache()
        if not ok:
            tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=False)

        rawQD = self.__mU.doImport(rawFastaPath,
                                   fmt="fasta",
                                   commentStyle="default")
        oD = {}
        taxonL = []
        for queryId, sD in rawQD.items():
            qL = queryId.split("|")
            tL = qL[2].split("_")
            taxName = tL[0]
            taxVar = tL[1].replace(" ", "_") if len(tL) > 1 else None
            taxId = tP.getTaxId(taxName)
            if taxId:
                tD = {
                    "seqId": qL[0],
                    "imgtGene": qL[1],
                    "functionality": qL[3],
                    "labels": qL[4],
                    "taxId": taxId
                }
                if taxVar:
                    tD["taxVar"] = taxVar
                sD.update(tD)
            else:
                logger.info("Unknown taxonomy %r (taxName=%r)", queryId,
                            taxName)
            sD["sequence"].replace(".", "-")
            seqId = ""
            cL = []
            for k, v in sD.items():
                if k in ["sequence"]:
                    continue
                cL.append(str(v))
                cL.append(str(k))
            seqId = "|".join(cL)
            oD[seqId] = sD
            taxonL.append("%s\t%s" % (seqId, taxId))
        #
        ok1 = self.__mU.doExport(taxonPath, taxonL, fmt="list")
        ok2 = self.__mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
        return ok1 and ok2

    def __imgtFlatFileProcessor(self,
                                flatFilePath,
                                maxCount=None,
                                testList=None):
        chainD = {}
        rawD = {}
        failures = []
        idList = []
        ic = 0
        filePattern = os.path.join(flatFilePath, "*.pdb.gz")
        logger.info("Collecting flat files with pattern %r", filePattern)
        for fp in glob.glob(filePattern):
            ic += 1
            if maxCount and ic > maxCount:
                break
            logger.debug("Processing file %r", fp)
            _, fn = os.path.split(fp)
            pdbId = fn[5:9].lower()
            if testList and pdbId not in testList:
                continue
            idList.append(pdbId)
            cD = {}
            tmpD = {}
            with gzip.open(fp, "rb") as ifh:
                try:
                    cD, tmpD = self.__imgtRemarkParser(pdbId, ifh)
                except Exception as e:
                    failures.append(pdbId)
                    logger.exception("Failing for %r with %s", pdbId, str(e))
                    continue
            # --
            chainD.update(cD)
            rawD[pdbId] = tmpD
        #
        logger.info("ID List (%d)", len(set(idList)))
        sL = list(rawD.keys())
        logger.info("Successes (%d) chains (%d)", len(sL), len(chainD))
        logger.info("Exceptions (%d) %r", len(failures), failures)
        mL = list(set(idList) - set(sL))
        logger.info("Missing (%d) %r", len(mL), mL)
        #
        return chainD, rawD

    def __imgtRemarkParser(self, pdbId, ifh):
        """IMGT REMARK 410 Parser

        Args:
            pdbId (str): input PDB ID [description]
            ifh (obj): input file handle

        Returns:
            dict: content dictionary of parsed details
        """
        sD = {
            "IMGT protein name": {
                "section": "proteins"
            },
            "ligand(s)": {
                "section": "ligands"
            },
            "Chain ID  ": {
                "section": "chains"
            },
        }
        pD = {
            "Chain ID  ": {
                "ky": "chain_data",
                "action": "appendAll"
            },
            #
            "ligand(s)": {
                "ky": "ligands",
                "action": "appendLine"
            },
            "IMGT protein name": {
                "ky": "proteinName",
                "action": "appendLine"
            },
            "IMGT receptor type": {
                "ky": "receptorType",
                "action": "appendLine"
            },
            "IMGT receptor description": {
                "ky": "receptorDescription",
                "action": "appendLine"
            },
            "Species": {
                "ky": "species",
                "action": "appendLine"
            },
            "Chain ID": {
                "ky": "chain_ids",
                "action": "appendLine"
            },
            #
        }
        cD = {}
        oD = {}
        curSection = None
        action = None
        curKy = None
        curChain = None
        for ul in ifh.readlines():
            line = ul.decode("utf-8")
            if not line.startswith("REMARK 410 "):
                continue
            #
            curLine = line[11:-1]

            for section, sectionD in sD.items():
                if curLine.startswith(section):
                    logger.debug("%r Detected section %r", pdbId, section)
                    curSection = sectionD["section"]
                    first = True
                    break
            #
            for label, labelD in pD.items():
                if curLine.startswith(label):
                    logger.debug("%r detected label %r", pdbId, label)
                    curKy = labelD["ky"]
                    action = labelD["action"]
                    first = True
                    break
            #
            if action == "appendLine":
                if first:
                    first = False
                    logger.debug("Skipped %r", curKy)
                    continue
                logger.debug(">> SECTION %r KEY %r Adding %r", curSection,
                             curKy, curLine.strip())
                oD.setdefault(curSection,
                              {}).setdefault(curKy, []).append(curLine.strip())
            elif action == "appendAll":
                if first:
                    tL = [t for t in curLine.split(" ") if t]
                    curChain = tL[2]
                    first = False
                    logger.debug("%r current chain key %r", pdbId, curChain)
                    continue
                oD.setdefault(curSection, {}).setdefault(curChain,
                                                         []).append(curLine)
        # --  rD raw extracted REM 410 content
        #  Post-process the domain annotations and alignments

        for chId, cL in oD["chains"].items() if "chains" in oD else {}:
            logger.debug("%r chainId %r (%d)", pdbId, chId, len(cL))
            tD = {}
            tD["description"] = self.__getField(
                cL, label="IMGT chain description  ")
            tD["domains"] = self.__splitDomains(pdbId, cL)
            #
            aD = self.__getAlignment(pdbId, cL)
            if aD and (len(aD["alignMapDL"]) == len(tD["domains"])):
                aL = aD["alignMapDL"]
                for ii, dD in enumerate(tD["domains"].values()):
                    if aL and len(aL) > ii:
                        dD["alignment"] = aL[ii]
            # --
            #  Integrate raw "proteins"  content
            if "proteins" in oD:
                paD = self.__getProteinAnnotations(chId, oD["proteins"])
                logger.debug("paD %r", paD)
                tD.update(paD)
            # --
            cD[chId] = tD
        #
        return cD, oD
        #

    def __getProteinAnnotations(self, chainId, pLD):
        """
        Example:

         "proteins": {
            "proteinName": [
               "IgG4 Sigma1 Fc"
            ],
            "receptorType": [
               "IG"
            ],
            "receptorDescription": [
               "FUSION-[TNFRSF1B]2-FC-GAMMA-1"
            ],
            "species": [
               "H**o sapiens (human)"
            ],
            "chain_ids": [
               "5w5m_A,5w5m_B"
            ]
         },
        """
        retD = {}
        try:
            ind = -1
            if "chain_ids" in pLD:
                for ii, chS in enumerate(pLD["chain_ids"]):
                    if chainId in chS:
                        ind = ii
                        break
                if ind >= 0:
                    for ky in [
                            "proteinName", "receptorType",
                            "receptorDescription", "species"
                    ]:
                        if ky in pLD and len(pLD[ky]) > ind:
                            retD[ky] = pLD[ky][ind]
                else:
                    logger.info("missing chain %r in %r", chainId,
                                pLD["chain_ids"])
            else:
                logger.info("missing chain details for %r in %r", chainId, pLD)
        except Exception as e:
            logger.exception("Failing for %r with %s", chainId, str(e))

        return retD

    def __getField(self, lineList, label):
        label = "IMGT chain description  "
        ret = None
        for line in lineList:
            if line.startswith(label):
                ret = line[len(label):]
                break
        return ret

    def __splitDomains(self, pdbId, lineList):
        retD = {}
        startLabel1 = "-DOMAIN      IMGT domain description  "
        startLabel2 = "-LIKE-DOMAIN IMGT domain description  "
        startLabel3 = "-LIKE-DOMAIN IMGT domain description "
        #
        geneLabel1 = "-DOMAIN      IMGT gene and allele     "
        geneLabel2 = "-LIKE-DOMAIN IMGT gene and allele     "
        geneLabel3 = "-LIKE-DOMAIN IMGT gene and allele    "
        tD = {}
        domain = None
        numD = 0
        for line in lineList:
            if line[1:].startswith(startLabel1):
                numD += 1
                domain = line.split(" ")[0].strip()
                description = line[len(startLabel1) + 1:].strip()
                continue
            if line[1:].startswith(startLabel2):
                numD += 1
                domain = line.split(" ")[0].strip()
                description = line[len(startLabel2) + 1:].strip()
                continue
            if line[2:].startswith(startLabel3):
                numD += 1
                domain = line.split(" ")[0].strip()
                description = line[len(startLabel3) + 1:].strip()
                continue
            if domain and line.startswith(domain):
                tD.setdefault((domain, description, numD), []).append(line)
        #
        qD = {}
        for (domain, description, numD), cL in tD.items():
            for line in cL:
                if line[1:].startswith(geneLabel1):
                    qD.setdefault(
                        (domain + "|" + description + "|" + str(numD)),
                        []).append(line[len(geneLabel1) + 1:])
                if line[1:].startswith(geneLabel2):
                    qD.setdefault(
                        (domain + "|" + description + "|" + str(numD)),
                        []).append(line[len(geneLabel2) + 1:])
                if line[2:].startswith(geneLabel3):
                    qD.setdefault(
                        (domain + "|" + description + "|" + str(numD)),
                        []).append(line[len(geneLabel3) + 1:])
        #
        # "H**o sapiens IGHG4*01 (96.4%), H**o sapiens IGHG4*03 (96.4%), H**o sapiens IGHG4*04 (96.4%)",
        for ky, cL in qD.items():
            logger.debug("cL %r", cL)
            tS = "".join(cL)
            tS = " ".join(tS.split())
            logger.debug("tS %r", tS)
            #  handle some missing commas in the raw data -
            tS = tS.replace(") ", "),")
            logger.debug("TAX> %r tS %r", pdbId, tS)
            gnSL = tS.split(",")
            gDL = []
            for gnS in gnSL:
                tL = gnS.strip().split()
                logger.debug("tL %r", tL)
                geneAllele = tL[-2]
                taxName = " ".join(tL[:-2])
                gDL.append({"taxName": taxName, "geneAllele": geneAllele})
            retD[ky] = {"geneAlleles": gDL}

        return retD

    def __getAlignment(self, pdbId, lineList):
        mLine = ""
        try:
            startPat = "Chain amino acid sequence"
            endPat1 = "-DOMAIN"
            endPat2 = "-LIKE-DOMAIN"
            aL = []
            keep = False
            for line in lineList:
                if line.startswith(startPat):
                    keep = True
                    continue
                if line[1:].startswith(endPat1):
                    break
                if line[1:].startswith(endPat2):
                    break
                if line[2:].startswith(endPat2):
                    break
                if keep:
                    aL.append(line)
            #
            sLine = "".join(aL[1::2])
            mLine = "".join(aL[0::2])
            # Lots of cases with (UNK) sequences where REM 410 format is corrupt --
            # if "(UNK)" in sLine:
            #    logger.error("%r unknown or modified residue in one-letter-code sequence %r", pdbId, sLine[:30] + "...")
            #    return {}
            ok, indD = self.__findMatchingGroups(mLine,
                                                 startGroup="[",
                                                 endGroup="]")
            if not ok:
                logger.error("%r determining alignment boundaries fails",
                             pdbId)
                return {}
            pdbRangeL = []
            for iBeg, iEnd in indD.items():
                if iEnd - iBeg <= 3:
                    continue
                pdbRangeL.append({
                    "begEntitySeqId": iBeg + 1,
                    "endEntitySeqId": iEnd + 1
                })
            ok, indD = self.__findMatchingGroups(mLine,
                                                 startGroup="(",
                                                 endGroup=")")
            if not ok:
                logger.error("%r determining alignment boundaries fails",
                             pdbId)
                return {}
            imgtRangeL = []
            try:
                for k, v in indD.items():
                    tS = mLine[k + 1:v]
                    tL = tS.split("-")
                    iBeg = tL[0]
                    iEnd = tL[1]
                    imgtRangeL.append({
                        "begIMGTSeqId": iBeg,
                        "endIMGTSeqId": iEnd
                    })
            except Exception as e:
                logger.error("%r parsing boundaries fails with %r for %r",
                             pdbId, str(e), mLine[:30] + "...")
                return {}
            #
            alignMapDL = []
            for pdbD, imgtD in zip(pdbRangeL, imgtRangeL):
                dD = pdbD
                dD.update(imgtD)
                alignMapDL.append(dD)
            return {
                "mapping": mLine,
                "pdbSeq": sLine,
                "alignMapDL": alignMapDL
            }
        except Exception as e:
            logger.exception("Failing %r with %r with %s", pdbId, mLine,
                             str(e))
        return {}

    def __findMatchingGroups(self, strIn, startGroup="[", endGroup="]"):
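        # Return (ok, {openIndex: closeIndex}) mapping each matched opening group to its
        # closing group. Illustrative example for well-formed input:
        #   "ab[cd]e" with startGroup="[", endGroup="]"  ->  (True, {2: 5})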
        retD = {}
        dStack = []
        ok = True
        try:
            for i, cS in enumerate(strIn):
                if cS == startGroup:
                    dStack.append(i)
                elif cS == endGroup:
                    if len(dStack) == 0:
                        logger.error(
                            "No matching closing group at position: %r",
                            str(i))
                        ok = False
                    retD[dStack.pop()] = i

            if len(dStack) > 0:
                logger.error("No matching opening group at: %r",
                             strIn(dStack.pop()))
                ok = False
        except Exception:
            pass
        return ok, retD
Esempio n. 26
0
class ScopClassificationProvider(StashableBase):
    """Extract SCOPe assignments, term descriptions and SCOP classifications
    from SCOP flat files.

    """
    def __init__(self, **kwargs):
        #
        self.__dirName = "scop"
        if "cachePath" in kwargs:
            self.__cachePath = os.path.abspath(kwargs.get("cachePath", None))
            self.__scopDirPath = os.path.join(self.__cachePath, self.__dirName)
        else:
            self.__scopDirPath = kwargs.get("scopDirPath", ".")
            self.__cachePath, self.__dirName = os.path.split(
                os.path.abspath(self.__scopDirPath))
        super(ScopClassificationProvider,
              self).__init__(self.__cachePath, [self.__dirName])
        #
        useCache = kwargs.get("useCache", True)
        # urlTarget = kwargs.get("scopTargetUrl", "http://scop.berkeley.edu/downloads/update")
        # self.__version = kwargs.get("scopVersion", "2.07-2019-07-23")
        # self.__version = kwargs.get("scopVersion", "2.07-2020-01-23")
        # self.__version = kwargs.get("scopVersion", "2.07-2020-05-07")
        # self.__version = kwargs.get("scopVersion", "2.07-2021-07-07")
        urlTarget = kwargs.get("scopTargetUrl",
                               "http://scop.berkeley.edu/downloads/parse")
        self.__version = kwargs.get("scopVersion", "2.08-stable")
        #
        urlBackupPath = kwargs.get(
            "scopUrlBackupPath",
            "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP"
        )
        #
        self.__mU = MarshalUtil(workPath=self.__scopDirPath)
        self.__nD, self.__pD, self.__pdbD = self.__reload(
            urlTarget,
            self.__scopDirPath,
            useCache=useCache,
            version=self.__version)
        #
        if not useCache and not self.testCache():
            ok = self.__fetchFromBackup(urlBackupPath, self.__scopDirPath)
            if ok:
                self.__nD, self.__pD, self.__pdbD = self.__reload(
                    urlTarget,
                    self.__scopDirPath,
                    useCache=True,
                    version=self.__version)

    def testCache(self):
        logger.info("SCOP lengths nD %d pD %d pdbD %d", len(self.__nD),
                    len(self.__pD), len(self.__pdbD))
        if (len(self.__nD) > 100) and (len(self.__pD) > 100) and (len(
                self.__pdbD) > 100):
            return True
        return False

    def __fetchFromBackup(self, urlBackupPath, scopDirPath):
        pyVersion = sys.version_info[0]
        fn = "scop_domains-py%s.pic" % str(pyVersion)
        scopDomainPath = os.path.join(scopDirPath, fn)
        self.__mU.mkdir(scopDirPath)
        #
        backupUrl = urlBackupPath + "/" + fn
        logger.info("Using backup URL %r", backupUrl)
        fU = FileUtil()
        ok = fU.get(backupUrl, scopDomainPath)
        return ok

    def getScopVersion(self):
        return self.__version

    def getScopSunIds(self, pdbId, authAsymId):
        """
        Get the sunid of the domain assignment for the assignment -

        aD[(pdbId, authAsymId)] = [(sunId, domainId, (authAsymId, resBeg, resEnd))]

        aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))]
        """
        try:
            return list(
                set([tup[0] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopDomainNames(self, pdbId, authAsymId):
        try:
            return list(
                set([tup[1] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopSccsNames(self, pdbId, authAsymId):
        try:
            return list(
                set([tup[2] for tup in self.__pdbD[(pdbId, authAsymId)]]))
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopResidueRanges(self, pdbId, authAsymId):
        try:
            return [(tup[0], tup[1], tup[2], tup[3][0], tup[3][1], tup[3][2])
                    for tup in self.__pdbD[(pdbId, authAsymId)]]
        except Exception as e:
            logger.debug("Failing for %r %r with %s", pdbId, authAsymId,
                         str(e))

        return []

    def getScopName(self, sunId):
        try:
            return self.__nD[sunId]
        except Exception:
            logger.debug("Undefined SCOP sunId %r", sunId)
        return None

    def getIdLineage(self, sunId):
        pList = []
        try:
            pList.append(sunId)
            pt = self.__pD[sunId]
            while (pt is not None) and (pt != 0):
                pList.append(pt)
                pt = self.__pD[pt]
        except Exception as e:
            logger.exception("Failing for %r with %s", sunId, str(e))
        #
        pList.reverse()
        return pList

    def getNameLineage(self, sunId):
        try:
            return [self.getScopName(cId) for cId in self.getIdLineage(sunId)]
        except Exception as e:
            logger.exception("Failing for %r with %s", sunId, str(e))
        return None

    def getTreeNodeList(self):
        return self.__exportTreeNodeList(self.__nD, self.__pD)

    #
    ###
    ###
    #
    def __reload(self, urlTarget, scopDirPath, useCache=True, version=None):
        nD = pD = pdbD = {}
        pyVersion = sys.version_info[0]
        scopDomainPath = os.path.join(scopDirPath,
                                      "scop_domains-py%s.pic" % str(pyVersion))
        self.__mU.mkdir(scopDirPath)
        #
        # scopDomainPath = os.path.join(scopDirPath, "scop_domains.json")
        #
        if useCache and self.__mU.exists(scopDomainPath):
            sD = self.__mU.doImport(scopDomainPath, fmt="pickle")
            logger.debug(
                "SCOPe name length %d parent length %d assignments %d",
                len(sD["names"]), len(sD["parents"]), len(sD["assignments"]))
            nD = sD["names"]
            pD = sD["parents"]
            pdbD = sD["assignments"]

        elif not useCache:
            ok = False
            minLen = 1000
            logger.info(
                "Fetch SCOPe name and domain assignment data using target URL %s",
                urlTarget)
            desL, claL, hieL = self.__fetchFromSource(urlTarget,
                                                      version=version)
            #
            nD = self.__extractDescription(desL)
            dmD = self.__extractAssignments(claL)
            pD = self.__extractHierarchy(hieL, nD)
            pdbD = self.__buildAssignments(dmD)
            logger.info("nD %d dmD %d pD %d", len(nD), len(dmD), len(pD))
            scopD = {"names": nD, "parents": pD, "assignments": pdbD}
            if (len(nD) > minLen) and (len(pD) > minLen) and (len(pdbD) >
                                                              minLen):
                ok = self.__mU.doExport(scopDomainPath, scopD, fmt="pickle")
            logger.debug("Cache save status %r", ok)
            #
        return nD, pD, pdbD

    def __fetchFromSource(self, urlTarget, version="2.07-2019-07-23"):
        """Fetch the classification names and domain assignments from SCOPe repo.
        #
                dir.des.scope.2.07-2019-03-07.txt
                dir.cla.scope.2.07-2019-03-07.txt
                dir.hie.scope.2.07-2019-03-07.txt
        """
        encoding = "utf-8-sig" if sys.version_info[0] > 2 else "ascii"
        fn = "dir.des.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        desL = self.__mU.doImport(url,
                                  fmt="tdd",
                                  rowFormat="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(desL))
        #
        fn = "dir.cla.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        claL = self.__mU.doImport(url,
                                  fmt="tdd",
                                  rowFormat="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(claL))
        #
        fn = "dir.hie.scope.%s.txt" % version
        url = os.path.join(urlTarget, fn)
        hieL = self.__mU.doImport(url,
                                  fmt="tdd",
                                  rowFormat="list",
                                  uncomment=True,
                                  encoding=encoding)
        logger.info("Fetched URL is %s len %d", url, len(hieL))
        #
        return desL, claL, hieL

    def __extractDescription(self, desL):
        """
        From  dir.des.scope.2.07-2019-03-07.txt:

        # dir.des.scope.txt
        # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07)  [File format version 1.02]
        # http://scop.berkeley.edu/
        # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about
        46456   cl      a       -       All alpha proteins
        46457   cf      a.1     -       Globin-like
        46458   sf      a.1.1   -       Globin-like
        46459   fa      a.1.1.1 -       Truncated hemoglobin
        46460   dm      a.1.1.1 -       Protozoan/bacterial hemoglobin

        116748  sp      a.1.1.1 -       Bacillus subtilis [TaxId: 1423]
        113449  px      a.1.1.1 d1ux8a_ 1ux8 A:
        46461   sp      a.1.1.1 -       Ciliate (Paramecium caudatum) [TaxId: 5885]
        14982   px      a.1.1.1 d1dlwa_ 1dlw A:
        100068  px      a.1.1.1 d1uvya_ 1uvy A:
        46462   sp      a.1.1.1 -       Green alga (Chlamydomonas eugametos) [TaxId: 3054]
        14983   px      a.1.1.1 d1dlya_ 1dly A:
        100067  px      a.1.1.1 d1uvxa_ 1uvx A:
        63437   sp      a.1.1.1 -       Mycobacterium tuberculosis, HbN [TaxId: 1773]
        164742  px      a.1.1.1 d2gkma_ 2gkm A:
        164743  px      a.1.1.1 d2gkmb_ 2gkm B:

        """
        nD = {}

        for fields in desL:
            if fields[1] in ["cl", "cf", "sf", "fa", "dm"]:
                nD[int(fields[0])] = str(fields[4]).strip()
        logger.debug("Length of name dictionary %d", len(nD))
        nD[0] = "root" if 0 not in nD else nD[0]

        return nD

    def __extractAssignments(self, claL):
        """
        returns:

            aD[sunId] = [(), ... ]
        From dir.cla.scope.2.07-2019-03-07.txt:

        # dir.cla.scope.txt
        # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07)  [File format version 1.02]
        # http://scop.berkeley.edu/
        # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about
        #
        old_sunId                  sccs  sunid
        d1ux8a_ 1ux8    A:      a.1.1.1 113449  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=116748,px=113449
        d1dlwa_ 1dlw    A:      a.1.1.1 14982   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46461,px=14982
        d1uvya_ 1uvy    A:      a.1.1.1 100068  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46461,px=100068
        d1dlya_ 1dly    A:      a.1.1.1 14983   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=14983
        d1uvxa_ 1uvx    A:      a.1.1.1 100067  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=46462,px=100067
        d2gkma_ 2gkm    A:      a.1.1.1 164742  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164742
        d2gkmb_ 2gkm    B:      a.1.1.1 164743  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164743
        d2gl3a_ 2gl3    A:      a.1.1.1 164754  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164754
        d2gl3b_ 2gl3    B:      a.1.1.1 164755  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=164755
        d1idra_ 1idr    A:      a.1.1.1 62301   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=62301
        d1idrb_ 1idr    B:      a.1.1.1 62302   cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=62302
        d1rtea_ 1rte    A:      a.1.1.1 105096  cl=46456,cf=46457,sf=46458,fa=46459,dm=46460,sp=63437,px=105096

        """
        dmD = {}
        logger.info("Length of class list %d", len(claL))
        rng = rngL = tL = None
        for fields in claL:
            try:
                rngL = str(fields[2]).strip().split(",")
                # dmTupL = [(tt[0], tt[1]) for tt in for rng.split(":") in rngL]
                #
                dmTupL = []
                for rng in rngL:
                    tL = [t for t in str(rng).strip().split(":") if len(t)]
                    if len(tL) > 1:
                        rL = tL[1].split("-")
                        tt = (tL[0], rL[0], rL[1])
                    else:
                        tt = (tL[0], None, None)

                    dmTupL.append(tt)
                #
                # Get the sid of the domain  -
                #
                sfL = str(fields[5]).strip().split(",")
                dmfL = sfL[4].split("=")
                dmf = int(dmfL[1])

                #                                         old domid      sccs    sunid for domain assignment
                dmD[int(fields[4])] = (fields[1], dmTupL, fields[0], fields[3],
                                       dmf)
                #
            except Exception as e:
                logger.exception(
                    "Failing fields %r rngL %r rng %r tL %r with %s", fields,
                    rngL, rng, tL, str(e))

        #
        #
        logger.info("Length of domain assignments %d", len(dmD))
        return dmD

    def __buildAssignments(self, dmD):
        """
        Input internal data structure with domain assignments -

        dmD[sunId] = (pdbId, [(authAsymId, begRes, endRes), ...], domain_name, sccs, sid_domain_assigned)

        Returns:

           aD[(pdbId, authAsymId)] = [(domSunId, domainId, sccs, (authAsymId, resBeg, resEnd))]


        """
        pdbD = {}
        for _, dTup in dmD.items():
            for rTup in dTup[1]:
                pdbD.setdefault((dTup[0], rTup[0]), []).append(
                    (dTup[4], dTup[2], dTup[3], rTup))
        return pdbD

    def __extractHierarchy(self, hieL, nD):
        """
        From dir.hie.scope.2.07-2019-03-07.txt:

        # dir.hie.scope.txt
        # SCOPe release 2.07 (2018-03-02, last updated 2019-03-07)  [File format version 1.01]
        # http://scop.berkeley.edu/
        # Copyright (c) 1994-2019 the SCOP and SCOPe authors; see http://scop.berkeley.edu/about
        0       -       46456,48724,51349,53931,56572,56835,56992,57942,58117,58231,58788,310555
        46456   0       46457,46556,46625,46688,46928,46954,46965,46996,47004,47013,47026,47039,47044,47049,47054,47059,47071,...,...
        46457   46456   46458,46548
        46458   46457   46459,46463,46532,74660,191420
        46459   46458   46460,190322

        """
        pD = {}
        logger.debug("Length of input hierarchy list %d", len(hieL))
        for fields in hieL:
            chId = int(fields[0])
            #
            if chId not in nD:
                continue
            pId = int(fields[1]) if fields[1].isdigit() else None
            pD[chId] = pId
        #
        logger.info("Length of domain parent dictionary %d", len(pD))
        return pD
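
    # Illustrative note (not part of the original source): for the hierarchy row
    # "46457   46456   46458,46548" shown above, fields[0] is the child sunid and
    # fields[1] its parent, so pD[46457] = 46456; the root row uses "-" for the
    # parent and is stored with a parent of None.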

    def __exportTreeNodeList(self, nD, pD):
        """Create node list from the SCOPe (sunid) parent and name/description dictionaries.

        Exclude the root node from the tree.

        """
        #
        rootId = 0
        pL = [rootId]
        logger.info("nD %d pD %d", len(nD), len(pD))
        # create child dictionary
        cD = {}
        for ctId, ptId in pD.items():
            cD.setdefault(ptId, []).append(ctId)
        #
        logger.debug("cD %d", len(cD))
        #
        idL = []
        for rootId in sorted(pL):
            visited = set([rootId])
            queue = collections.deque(visited)
            while queue:
                tId = queue.popleft()
                idL.append(tId)
                if tId not in cD:
                    # logger.warning("No children for scop tId %r", tId)
                    continue
                for childId in cD[tId]:
                    if childId not in visited:
                        queue.append(childId)
                        visited.add(childId)
        #
        dL = []
        for tId in idL:
            displayName = nD[tId] if tId in nD else None
            ptId = pD[tId] if tId in pD else None
            lL = self.getIdLineage(tId)[1:]
            #
            # d = {'id': str(tId), 'name': displayName, 'lineage': [str(t) for t in lL], 'parents': [str(ptId)], 'depth': len(lL)}
            if tId == rootId:
                continue
            elif ptId == rootId:
                dD = {"id": str(tId), "name": displayName, "depth": 0}
            else:
                dD = {
                    "id": str(tId),
                    "name": displayName,
                    "parents": [str(ptId)],
                    "depth": len(lL)
                }
            dL.append(dD)

        return dL
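
# A minimal, self-contained sketch (not part of the original source) of the
# breadth-first node-list construction performed by __exportTreeNodeList()
# above, applied to a three-level toy hierarchy keyed by integer ids.
if __name__ == "__main__":
    from collections import deque

    nD = {0: "root", 1: "class a", 2: "fold a.1"}  # sunid -> display name
    pD = {1: 0, 2: 1}                              # child sunid -> parent sunid
    cD = {}
    for ctId, ptId in pD.items():
        cD.setdefault(ptId, []).append(ctId)
    idL, visited, queue = [], {0}, deque([0])
    while queue:
        tId = queue.popleft()
        idL.append(tId)
        for childId in cD.get(tId, []):
            if childId not in visited:
                visited.add(childId)
                queue.append(childId)
    # idL == [0, 1, 2]; every node except the root would then be emitted as a
    # {"id": ..., "name": ..., "parents": [...], "depth": ...} dictionary.
    print(idL)
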
class EntryInfoProvider(StashableBase):
    """Accessors (only) for entry-level annotations."""
    def __init__(self, **kwargs):
        #
        self.__version = "0.50"
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirName = "rcsb_entry_info"
        self.__dirPath = os.path.join(cachePath, self.__dirName)
        super(EntryInfoProvider, self).__init__(cachePath, [self.__dirName])
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__entryInfoD = self.__reload(fmt="json", useCache=useCache)
        #

    def testCache(self, minCount=1):
        if minCount == 0:
            return True
        if self.__entryInfoD and minCount and "entryInfo" in self.__entryInfoD and len(
                self.__entryInfoD["entryInfo"]) > minCount:
            logger.info("Entry annotations for (%d) entries",
                        len(self.__entryInfoD["entryInfo"]))
            return True
        return False

    def getEntryInfo(self, entryId):
        """Return a dictionary of entry-level annotations.

        Returns:
            (dict): of entry-level annotations
        """
        try:
            eiD = self.__entryInfoD["entryInfo"]
            return eiD[entryId.upper()] if entryId.upper() in eiD else {}
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return {}

    def getEntriesByPolymerEntityCount(self, count):
        oL = []
        try:
            for entryId, eD in self.__entryInfoD["entryInfo"].items():
                if eD["polymer_entity_count"] == count:
                    oL.append(entryId)
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return oL

    def __getEntryInfoFilePath(self, fmt="json"):
        baseFileName = "entry_info_details"
        fExt = ".json" if fmt == "json" else ".pic"
        fp = os.path.join(self.__dirPath, baseFileName + fExt)
        return fp

    def reload(self):
        """Reload from the current cache file."""
        ok = False
        try:
            self.__entryInfoD = self.__reload(fmt="json", useCache=True)
            ok = self.__entryInfoD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __reload(self, fmt="json", useCache=True):
        entryInfoFilePath = self.__getEntryInfoFilePath(fmt=fmt)
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        pcD = {"version": self.__version, "created": tS, "identifiers": {}}

        if useCache and self.__mU.exists(entryInfoFilePath):
            logger.info("Reading entry-info cached path %r", entryInfoFilePath)
            pcD = self.__mU.doImport(entryInfoFilePath, fmt=fmt)
        return pcD
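
# Hedged usage sketch (not part of the original source): exercising the
# accessor-only EntryInfoProvider defined above.  It assumes a previously
# populated cache file "<cachePath>/rcsb_entry_info/entry_info_details.json";
# "4HHB" is only an illustrative entry identifier.
if __name__ == "__main__":
    eiP = EntryInfoProvider(cachePath="./CACHE", useCache=True)
    if eiP.testCache(minCount=1):
        infoD = eiP.getEntryInfo("4HHB")
        monomericIdL = eiP.getEntriesByPolymerEntityCount(count=1)
        logger.info("4HHB annotations %r monomeric entries %d", infoD, len(monomericIdL))
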
class OeMoleculeProvider(object):
    """Utilities build and deliver OE molecule databases from PDB chemical component definition data"""
    def __init__(self, **kwargs):
        """Utilities build and deliver OE molecule databases from PDB chemical component definition data
        Args:
            cachePath (str, optional): path to the directory containing cache files (default: '.')
            molBuildType (str,optional): data source for building OE molecules (default: "model-xyz")
            oeFileNamePrefix (str, optional) file name prefix for all generated databases (default: "oe")

        """
        # Database file names will be prefixed with the base prefix plus the molecular build type and perception options
        oeFileNamePrefixBase = kwargs.get("oeFileNamePrefix", "oe")
        limitPerceptions = kwargs.get("limitPerceptions", False)
        molBuildType = kwargs.get("molBuildType", "model-xyz")
        if limitPerceptions and molBuildType in [
                "oe-smiles", "oe-iso-smiles", "inchi"
        ]:
            self.__oeFileNamePrefix = oeFileNamePrefixBase + "-" + molBuildType + "-limit"
        else:
            self.__oeFileNamePrefix = oeFileNamePrefixBase + "-" + molBuildType
        #
        cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(cachePath, "oe_mol")
        #
        self.__fpDbD = {}
        self.__ssDb = None
        self.__oeMolD = {}
        self.__oeMolDb = None
        self.__oeMolDbTitleD = None
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__molCount = self.__reload(**kwargs)

    def testCache(self):
        return self.__mU.exists(
            os.path.join(self.__dirPath,
                         self.__getOeMolFileName())) and self.__mU.exists(
                             os.path.join(self.__dirPath,
                                          self.__getOeMolDbFileName()))

    def getSubSearchDb(self,
                       screenType="SMARTS",
                       numProc=1,
                       forceRefresh=False):
        if not self.__ssDb or forceRefresh:
            oeIo = OeIoUtils()
            fp = os.path.join(self.__dirPath,
                              self.__getSubSearchFileName(screenType))
            logger.info("Opening screened substructure search database %r", fp)
            self.__ssDb = oeIo.loadOeSubSearchDatabase(fp,
                                                       screenType,
                                                       numProc=numProc)
        return self.__ssDb

    def getFingerPrintDb(self, fpType, fpDbType="STANDARD", rebuild=False):
        if fpType not in self.__fpDbD or rebuild:
            oeIo = OeIoUtils()
            fastFpDbPath = os.path.join(self.__dirPath,
                                        self.__getFastFpDbFileName(fpType))
            oeMolDbFilePath = os.path.join(self.__dirPath,
                                           self.__getOeMolDbFileName())
            fpDb = oeIo.loadOeFingerPrintDatabase(oeMolDbFilePath,
                                                  fastFpDbPath,
                                                  inMemory=True,
                                                  fpType=fpType,
                                                  fpDbType=fpDbType)
            if fpDb:
                self.__fpDbD[fpType] = fpDb
        #
        return self.__fpDbD[fpType]

    def __getOeMolDbTitleIndex(self):
        oeMolDbTitleD = {}
        try:
            for idx in range(self.__oeMolDb.GetMaxMolIdx()):
                oeMolDbTitleD[self.__oeMolDb.GetTitle(idx)] = idx
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return oeMolDbTitleD

    def getOeMolDatabase(self):
        if not self.__oeMolDb:
            oeIo = OeIoUtils()
            self.__oeMolDb = oeIo.loadOeBinaryDatabaseAndIndex(
                os.path.join(self.__dirPath, self.__getOeMolDbFileName()))
            self.__oeMolDbTitleD = self.__getOeMolDbTitleIndex()
        return self.__oeMolDb, self.__oeMolDbTitleD

    def getOeMolD(self):
        try:
            if not self.__oeMolD:
                oeIo = OeIoUtils()
                self.__oeMolD = oeIo.readOeBinaryMolCache(
                    os.path.join(self.__dirPath, self.__getOeMolFileName()))
                logger.info("Loading OE binary molecule cache length %d",
                            len(self.__oeMolD))
            return self.__oeMolD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def getMol(self, ccId):
        try:
            if not self.__oeMolD:
                oeIo = OeIoUtils()
                self.__oeMolD = oeIo.readOeBinaryMolCache(
                    os.path.join(self.__dirPath, self.__getOeMolFileName()))
                logger.info("Loading OE binary molecule cache length %d",
                            len(self.__oeMolD))
            return self.__oeMolD[ccId]
        except Exception as e:
            logger.exception("Get molecule %r failing with %s", ccId, str(e))
        return None

    def __getFastFpDbFileName(self, fpType):
        return "%s-fast-fp-database-%s.fpbin" % (self.__oeFileNamePrefix,
                                                 fpType)

    def __getSubSearchFileName(self, screenType):
        return "%s-ss-database-%s.oeb" % (self.__oeFileNamePrefix, screenType)

    def __getOeMolDbFileName(self):
        return "%s-mol-db-components.oeb" % self.__oeFileNamePrefix

    def __getOeMolFileName(self):
        return "%s-mol-components.oeb" % self.__oeFileNamePrefix

    def __reload(self, **kwargs):
        """Reload the dictionary of OE molecules and related data artifacts for chemical component definitions.

        Args:
            molBuildType (str): coordinates to use in building OE molecules from CIF components (model, ideal or None)
            limitPerceptions (bool): process input descriptors in essentially verbatim mode (default: False)
            fpTypeList (list): fingerprint types (TREE, PATH, MACCS, CIRCULAR, LINGO)
            screenTypeList (list): fast substructure search screen types (MOLECULE, SMARTS, MDL, ...)
            useCache (bool, optional): flag to use cached files. Defaults to True.
            cachePath (str): path to the top cache directory. Defaults to '.'.
            numProc (int): number of processors to use when generating the substructure screen database.
            molLimit (int, optional): limiting number of molecules in the data store (default: 0, no limit)
            suppressHydrogens (bool, optional): flag to suppress explicit hydrogens in the OE data store.

        Returns:
            (dict): dictionary of constructed OE molecules

        """
        useCache = kwargs.get("useCache", True)
        cachePath = kwargs.get("cachePath", ".")
        numProc = kwargs.get("numProc", 2)
        molLimit = kwargs.get("molLimit", 0)
        fpTypeList = kwargs.get("fpTypeList",
                                ["TREE", "PATH", "MACCS", "CIRCULAR", "LINGO"])
        # screenTypeList = kwargs.get("screenTypeList", ["SMARTS"])
        screenTypeList = kwargs.get("screenTypeList", [])
        molBuildType = kwargs.get("molBuildType", "model-xyz")
        limitPerceptions = kwargs.get("limitPerceptions", False)
        quietFlag = kwargs.get("quietFlag", True)
        suppressHydrogens = kwargs.get("suppressHydrogens", False)
        logSizes = kwargs.get("logSizes", False)
        fpDbType = "STANDARD"
        #
        ccCount = 0
        oeCount = 0
        errCount = 0
        failIdList = []
        oeIo = OeIoUtils(quietFlag=quietFlag)
        # --------
        oeMolFilePath = os.path.join(self.__dirPath, self.__getOeMolFileName())
        if not useCache or (useCache and not self.__mU.exists(oeMolFilePath)):
            cmpKwargs = {
                k: v
                for k, v in kwargs.items()
                if k not in ["cachePath", "useCache", "molLimit"]
            }
            ccmP = ChemCompMoleculeProvider(cachePath=cachePath,
                                            useCache=True,
                                            molLimit=molLimit,
                                            **cmpKwargs)
            ok = ccmP.testCache(minCount=molLimit, logSizes=logSizes)
            ccObjD = ccmP.getMolD() if ok else {}
            ccCount = len(ccObjD)
            # -------
            startTime = time.time()
            oeCount, errCount, failIdList = oeIo.buildOeBinaryMolCache(
                oeMolFilePath,
                ccObjD,
                molBuildType=molBuildType,
                quietFlag=quietFlag,
                fpTypeList=fpTypeList,
                limitPerceptions=limitPerceptions,
                suppressHydrogens=suppressHydrogens)
            logger.info(
                "Stored %d/%d OeMols (suppressH = %r) created with molBuildType %r (unconverted %d)",
                oeCount, ccCount, suppressHydrogens, molBuildType, errCount)
            if failIdList:
                logger.info("%r failures %r", molBuildType, failIdList)
            endTime = time.time()
            logger.info("Constructed %d/%d cached oeMols (%.4f seconds)",
                        oeCount, ccCount, endTime - startTime)
        # --------
        oeMolDbFilePath = os.path.join(self.__dirPath,
                                       self.__getOeMolDbFileName())
        if not useCache or (useCache
                            and not self.__mU.exists(oeMolDbFilePath)):
            startTime = time.time()
            molCount = oeIo.createOeBinaryDatabaseAndIndex(
                oeMolFilePath, oeMolDbFilePath)
            endTime = time.time()
            logger.info(
                "Created and stored %d indexed OeMols in OE database format (%.4f seconds)",
                molCount, endTime - startTime)

        # --------
        if fpDbType == "FAST":
            for fpType in fpTypeList:
                startTime = time.time()
                #  Fast FP search database file names
                fpPath = os.path.join(self.__dirPath,
                                      self.__getFastFpDbFileName(fpType))
                if not useCache or (useCache and not self.__mU.exists(fpPath)):
                    ok = oeIo.createOeFingerPrintDatabase(oeMolDbFilePath,
                                                          fpPath,
                                                          fpType=fpType)
                    endTime = time.time()
                    logger.info(
                        "Created and stored %s fingerprint database (%.4f seconds)",
                        fpType, endTime - startTime)
        # --------
        if molBuildType in ["oe-iso-smiles"]:
            for screenType in screenTypeList:
                startTime = time.time()
                fp = os.path.join(self.__dirPath,
                                  self.__getSubSearchFileName(screenType))
                if not useCache or (useCache and not self.__mU.exists(fp)):
                    ok = oeIo.createOeSubSearchDatabase(oeMolFilePath,
                                                        fp,
                                                        screenType=screenType,
                                                        numProc=numProc)
                    endTime = time.time()
                    logger.info(
                        "Constructed screened substructure database (status %r) with screenType %s (%.4f seconds)",
                        ok, screenType, endTime - startTime)
                    # ---------
                    ssDb = oeIo.loadOeSubSearchDatabase(fp,
                                                        screenType=screenType,
                                                        numProc=numProc)
                    ok = ssDb.NumMolecules() == oeCount
                    # ----------
        return oeCount
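
# Hedged usage sketch (not part of the original source): building and querying
# the OE molecule store with the provider defined above.  Running it requires a
# licensed OpenEye toolkit installation and a populated chemical component
# cache; "ATP" is only an illustrative component identifier.
if __name__ == "__main__":
    oemP = OeMoleculeProvider(cachePath="./CACHE", molBuildType="model-xyz", useCache=True)
    if oemP.testCache():
        oeMol = oemP.getMol("ATP")
        molDb, titleD = oemP.getOeMolDatabase()
        fpDb = oemP.getFingerPrintDb(fpType="TREE")
        logger.info("ATP atoms %d indexed molecules %d", oeMol.NumAtoms(), molDb.NumMols())
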
Esempio n. 29
0
class OeIoUtils(object):
    """Utility methods to manage OE specific IO and format conversion operations."""
    def __init__(self, **kwargs):
        self.__dirPath = kwargs.get("dirPath", ".")
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__oeErrorLevel = oechem.OEErrorLevel_Info
        if kwargs.get("quietFlag", False):
            self.setQuiet()
        #

    def setQuiet(self):
        """Suppress OE warnings and processing errors"""
        oechem.OEThrow.SetLevel(oechem.OEErrorLevel_Quiet)
        self.__oeErrorLevel = oechem.OEErrorLevel_Quiet

    def getComponentDefinitions(self, ccdFilePath):
        rdCcObjL = []
        try:
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            logger.info("Read %s with %d definitions", ccdFilePath,
                        len(rdCcObjL))
        except Exception as e:
            logger.exception("Loading %s failing with %s", ccdFilePath, str(e))
        return rdCcObjL

    def suppressHydrogens(self, oeMol):
        tMol = oechem.OEMol(oeMol) if oeMol else None
        if tMol:
            oechem.OESuppressHydrogens(tMol)
        return tMol

    def chemCompToMol(self,
                      ccdFilePath,
                      molBuildType="model-xyz",
                      quietFlag=False):
        retMolL = []
        try:
            rdCcObjL = self.__mU.doImport(ccdFilePath, fmt="mmcif")
            logger.info("Read %s with %d definitions", ccdFilePath,
                        len(rdCcObjL))
            oemf = OeMoleculeFactory()
            if quietFlag:
                oemf.setQuiet()
            for ccObj in rdCcObjL:
                ccId = oemf.setChemCompDef(ccObj)
                if ccId:
                    ok = oemf.build(molBuildType=molBuildType)
                    if ok:
                        oeMol = oemf.getMol()
                        retMolL.append(oeMol)
        except Exception as e:
            logger.exception("Loading %s failing with %s", ccdFilePath, str(e))
        return retMolL

    def descriptorToSmiles(self,
                           descr,
                           descrType,
                           limitPerceptions=False,
                           messageTag=None):
        """Parse the input descriptor string and return an OE smiles.

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            str: SMILES string
        """
        try:
            if "SMILES" in descrType.upper() and "ISO" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateIsoSmiString(oeMol)
                else:
                    return None
            if "SMILES" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateCanSmiString(oeMol)
                else:
                    return None
            elif "INCHI" in descrType.upper():
                oeMol = self.inchiToMol(descr,
                                        limitPerceptions=limitPerceptions,
                                        messageTag=messageTag)
                if oeMol:
                    return oechem.OECreateIsoSmiString(oeMol)
            else:
                return None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None
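
    # Illustrative call patterns (not part of the original source); descriptor type
    # strings follow chemical component dictionary conventions:
    #   descriptorToSmiles("c1ccccc1", "SMILES_CANONICAL")   -> canonical SMILES
    #   descriptorToSmiles("InChI=1S/CH4/h1H4", "InChI")     -> isomeric SMILES via InChI parsing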

    def descriptorToMol(self,
                        descr,
                        descrType,
                        limitPerceptions=False,
                        messageTag=None):
        """Parse the input descriptor string and return a molecule object (OeGraphMol/OeQMol).

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            object: OeGraphMol()/OeQmol() object or None for failure

        """
        try:
            if "SMILES" in descrType.upper() and "ISO" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    isoSmiles = oechem.OECreateIsoSmiString(oeMol)
                    return self.smilesToMol(isoSmiles,
                                            limitPerceptions=limitPerceptions,
                                            messageTag=messageTag)
                else:
                    return None
            if "SMILES" in descrType.upper():
                oeMol = self.smilesToMol(descr,
                                         limitPerceptions=limitPerceptions,
                                         messageTag=messageTag)
                if oeMol:
                    smiles = oechem.OECreateCanSmiString(oeMol)
                    return self.smilesToMol(smiles,
                                            limitPerceptions=limitPerceptions,
                                            messageTag=messageTag)
                else:
                    return None
            elif "INCHI" in descrType.upper():
                oeMol = self.inchiToMol(descr,
                                        limitPerceptions=limitPerceptions,
                                        messageTag=messageTag)
                if oeMol:
                    isoSmiles = oechem.OECreateIsoSmiString(oeMol)
                    return self.smilesToMol(isoSmiles,
                                            limitPerceptions=limitPerceptions,
                                            messageTag=messageTag)
            elif "SMARTS" in descrType.upper():
                return self.smartsToQmol(descr, messageTag=messageTag)
            else:
                return None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def smilesToMol(self, smiles, limitPerceptions=False, messageTag=None):
        """Parse the input SMILES string and return a molecule object (OeGraphMol).

        Args:
            smiles (str): SMILES string
            limitPerceptions (bool): flag to limit the perceptions/transformations of input SMILES

        Returns:
            object: OeGraphMol() object or None for failure
        """
        try:
            label = messageTag if messageTag else ""
            mol = oechem.OEGraphMol()
            smiles = smiles.strip()
            if limitPerceptions:
                # convert the SMILES string into a molecule
                if oechem.OEParseSmiles(mol, smiles, False, False):
                    return mol
                else:
                    logger.debug(
                        "%s parsing failed for input SMILES string %s", label,
                        smiles)
                    logger.error("%s parsing failed for input SMILES string",
                                 label)
            else:
                if oechem.OESmilesToMol(mol, smiles):
                    return mol
                else:
                    logger.debug(
                        "%s converting failed for input SMILES string %s",
                        label, smiles)
                    logger.error(
                        "%s converting failed for input SMILES string", label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def inchiToMol(self, inchi, limitPerceptions=False, messageTag=None):
        """Parse the input InChI string and return a molecule object (OeGraphMol).

        Args:
            inchi (str): InChI string

        Returns:
            object: OeGraphMol() object or None for failure

        """
        try:
            label = messageTag if messageTag else ""
            mol = oechem.OEGraphMol()
            inchi = inchi.strip()
            if limitPerceptions:
                if oechem.OEParseInChI(mol, inchi):
                    return mol
                else:
                    logger.debug("%s parsing failed for InChI string %r",
                                 label, inchi)
                    logger.error("%s parsing failed for InChI string", label)
            else:
                if oechem.OEInChIToMol(mol, inchi):
                    return mol
                else:
                    logger.debug("%s converting failed for InChI string %r",
                                 label, inchi)
                    logger.error("%s converting failed for InChI string",
                                 label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def descriptorToQMol(self,
                         descr,
                         descrType,
                         limitPerceptions=False,
                         messageTag=None):
        """Parse the input descriptor string and return a query molecule object (OeQMol).

        Args:
            descr (str): descriptor
            descrType (str): descriptor type
            limitPerceptions (bool): flag to limit the perceptions/transformations of input descriptor
            messageTag (str, optional): prefix string for error messages. Defaults to None.

        Returns:
            object: OeQmol() object or None for failure

        """
        oeQMol = label = None
        try:
            label = messageTag if messageTag else ""
            tMol = self.descriptorToMol(descr,
                                        descrType,
                                        limitPerceptions=limitPerceptions,
                                        messageTag=messageTag)
            if tMol:
                oeQMol = oechem.OEQMol(tMol)

        except Exception as e:
            logger.error("%s Failing for with %s", label, str(e))
        return oeQMol if oeQMol else None

    def smartsToQmol(self, smarts, messageTag=None):
        """Parse the input SMARTS query string and return a query molecule object (OeQMol).

        Args:
            smarts (str): SMARTS query string

        Returns:
            object : OeQMol() object or None for failure
        """
        try:
            label = messageTag if messageTag else ""
            qmol = oechem.OEQMol()
            if oechem.OEParseSmarts(qmol, smarts):
                return qmol
            else:
                logger.debug("%s parsing failed for SMARTS string %s", label,
                             smarts)
                logger.error("%s parsing failed for SMARTS string", label)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def fileToMols(self, filePath, use3D=False, largestPart=False):
        """Parse the input path returning a list of molecule objects (OeGraphMol).

        Args:
            filePath (str): file path; must have a standard recognized extension ('mol', 'sdf', 'smi', 'oeb').

        Returns:
            list : list of OeGraphMol() objects

        """
        mL = []
        oemf = OeMoleculeFactory()
        try:
            ifs = oechem.oemolistream()
            if ifs.open(filePath):
                for tMol in ifs.GetOEGraphMols():
                    oeMol = oechem.OEGraphMol(tMol)
                    # if oechem.OEReadMolecule(ifs, oeMol):
                    if largestPart:
                        molL = oemf.getParts(oeMol)
                        if len(molL) > 0:
                            oeMol = molL[0]
                            logger.info(
                                "Using largest bonded molecule part (%d/%d)",
                                len(molL), oeMol.NumAtoms())
                    if use3D:
                        mL.append(
                            oemf.updateOePerceptions3D(
                                oeMol, aromaticModel=oechem.OEAroModelOpenEye))
                    else:
                        mL.append(
                            oemf.updateOePerceptions2D(
                                oeMol, aromaticModel=oechem.OEAroModelOpenEye))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mL
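
    # Illustrative call (not part of the original source): read a multi-record SD
    # file, keep only the largest bonded fragment of each entry, and apply 3D
    # perceptions; "ligands.sdf" is a hypothetical input path.
    #   molL = OeIoUtils().fileToMols("ligands.sdf", use3D=True, largestPart=True)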

    def stringToMols(self, txt, sType="mol2", use3D=False):
        """Parse the input string as input format type (sType) returning a list of
        molecule objects (OeGraphMol)

        Args:
            txt (str): string text of molecule data
            sType (str, optional): string data format (mol2, sdf, smiles) . Defaults to "mol2".

        Returns:
            list: list of OeGraphMol() objects
        """
        #
        mL = []
        oemf = OeMoleculeFactory()
        try:
            if sType not in ["mol2", "sdf", "smiles"]:
                logger.error("Unsupported string data format")
                return None
            fD = {
                "mol2": oechem.OEFormat_MOL2,
                "sdf": oechem.OEFormat_SDF,
                "smiles": oechem.OEFormat_SMI
            }
            ifs = oechem.oemolistream()
            ifs.SetFormat(fD["sType"])
            if not ifs.openstring(txt):
                logger.error("Unable open string data for molecule reader")
                return None
            for tMol in ifs.GetOEGraphMols():
                oeMol = oechem.OEGraphMol(tMol)
                if use3D:
                    mL.append(
                        oemf.updateOePerceptions3D(
                            oeMol, aromaticModel=oechem.OEAroModelOpenEye))
                else:
                    mL.append(
                        oemf.updateOePerceptions2D(
                            oeMol, aromaticModel=oechem.OEAroModelOpenEye))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return mL

    def readOeBinaryMolCache(self, filePath):
        """Return a list of OeGraphMol() objects read from the cached binary file.

        Args:
            filePath (str): file path for the binary OeMol cache

        Returns:
            dict: dictionary of OeGraphMol()'s {<ccId>: OeGraphMol(), ... }
        """
        retD = {}
        startTime = time.time()
        try:
            ifs = oechem.oemolistream()
            if ifs.open(filePath):
                for oeMol in ifs.GetOEGraphMols():
                    tMol = oechem.OEGraphMol(oeMol)
                    retD[tMol.GetTitle()] = tMol
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return retD

    def createOeFingerPrintDatabase(self,
                                    oeMolDbFilePath,
                                    oeFpDbFilePath,
                                    fpType="TREE",
                                    dbType="FAST"):
        if dbType == "FAST":
            return self.__createOeFastFingerPrintDatabase(oeMolDbFilePath,
                                                          oeFpDbFilePath,
                                                          fpType=fpType)
        else:
            return True

    def __createOeFastFingerPrintDatabase(self,
                                          oeMolDbFilePath,
                                          oeFpDbFilePath,
                                          fpType="TREE"):
        """Create fast search fingerprint database from the input molecular database.

        Args:
            oeMolDbFilePath (str): path to the input molecular database
            oeFpDbFilePath (str): path to the output fingerprint database
            fpType (str):  finger print type

        Returns:
            bool: True for success or False otherwise

        Supports:
            OEFPType_Circular
            OEFPType_Path
            OEFPType_Tree

        Not currently supported by OE fp search -
            OEFPType_MACCS166
            OEFPType_Lingo
        """
        startTime = time.time()
        ok = False
        try:
            _ = fpType
            fpD = {
                "TREE": oegraphsim.OEFPType_Tree,
                "CIRCULAR": oegraphsim.OEFPType_Circular,
                "PATH": oegraphsim.OEFPType_Path
            }
            myFpType = fpD[
                fpType] if fpType in fpD else oegraphsim.OEFPType_Tree
            opts = oegraphsim.OECreateFastFPDatabaseOptions(
                oegraphsim.OEGetFPType(myFpType))
            ok = oegraphsim.OECreateFastFPDatabaseFile(oeFpDbFilePath,
                                                       oeMolDbFilePath, opts)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return ok

    def loadOeFingerPrintDatabase(self,
                                  oeMolDbFilePath,
                                  oeFpDbFilePath,
                                  inMemory=False,
                                  fpType="TREE",
                                  fpDbType="FAST"):
        if fpDbType == "FAST":
            return self.__loadOeFastFingerPrintDatabase(oeFpDbFilePath,
                                                        inMemory=inMemory,
                                                        fpType=fpType)
        else:
            return self.__loadOeFingerPrintDatabase(oeMolDbFilePath,
                                                    fpType=fpType)

    def __loadOeFingerPrintDatabase(self, oeMolDbFilePath, fpType="TREE"):
        """Create conventional search fingerprint database from the input molecular database.

        Args:
            oeMolDbFilePath (str): path to the input molecular database
            oeFpDbFilePath (str): path to the output fingerprint database
            fpType (str):  finger print type

        Returns:
            bool: True for success or False otherwise

        Supports:
            OEFPType_Circular
            OEFPType_Path
            OEFPType_Tree
            OEFPType_MACCS166
            OEFPType_Lingo
        """
        fpDb = None
        ok = False
        startTime = time.time()
        try:
            fpD = {
                "TREE": oegraphsim.OEFPType_Tree,
                "CIRCULAR": oegraphsim.OEFPType_Circular,
                "PATH": oegraphsim.OEFPType_Path,
                "MACCS": oegraphsim.OEFPType_MACCS166,
                "LINGO": oegraphsim.OEFPType_Lingo,
            }
            fpType = fpType if fpType and fpType in fpD else "TREE"
            tag = "FP_" + fpType
            oeFpType = fpD[
                fpType] if fpType in fpD else oegraphsim.OEFPType_Tree
            oeMolDb = self.loadOeBinaryDatabaseAndIndex(oeMolDbFilePath)
            #
            fpDb = oegraphsim.OEFPDatabase(oeFpType)
            numMols = oeMolDb.GetMaxMolIdx()
            logger.debug("fpType %r tag %r oeFpType %r", fpType, tag, oeFpType)
            oeMol = oechem.OEGraphMol()
            for idx in range(0, numMols):
                if oeMolDb.GetMolecule(oeMol, idx):
                    if oeMol.HasData(tag):
                        tfp = oeMol.GetData(tag)
                        fpDb.AddFP(tfp)
                    else:
                        fpDb.AddFP(oeMol)
                else:
                    logger.info("Missing molecule at index %r", idx)

            numFp = fpDb.NumFingerPrints()
            ok = numMols == numFp
            logger.info(
                "Loaded molecules  %d %s fingerprints %d (%.4f seconds)",
                numMols, fpType, numFp,
                time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            fpDb = None
        endTime = time.time()
        logger.debug("Completed with status %r operation at %s (%.4f seconds)",
                     ok, time.strftime("%Y %m %d %H:%M:%S",
                                       time.localtime()), endTime - startTime)
        return fpDb

    def __loadOeFastFingerPrintDatabase(self,
                                        oeFpDbFilePath,
                                        inMemory=False,
                                        fpType="TREE"):
        #
        _ = fpType
        startTime = time.time()
        if inMemory:
            memType = oegraphsim.OEFastFPDatabaseMemoryType_InMemory
        else:
            memType = oegraphsim.OEFastFPDatabaseMemoryType_MemoryMapped
        if not self.__mU.exists(oeFpDbFilePath):
            logger.error("Missing fingerprint database file %r",
                         oeFpDbFilePath)
        fpDb = oegraphsim.OEFastFPDatabase(oeFpDbFilePath, memType)
        if not fpDb.IsValid():
            logger.error("Cannot open fingerprint database %r", oeFpDbFilePath)
        #
        lenFp = fpDb.NumFingerPrints()
        memTypeStr = fpDb.GetMemoryTypeString()
        endTime = time.time()
        logger.info(
            "Read fingerprint database length %d loaded %s (%.4f seconds)",
            lenFp, memTypeStr, endTime - startTime)
        return fpDb

    def loadOeBinaryDatabaseAndIndex(self, oeMolDbFilePath):
        molDb = None
        try:
            molDb = oechem.OEMolDatabase()
            if not molDb.Open(oeMolDbFilePath):
                logger.error("Unable to open %r", oeMolDbFilePath)
            molCount = molDb.NumMols()
            logger.info("Loaded OE database file containing %d molecules",
                        molCount)
        except Exception as e:
            logger.exception("Loading %r failing with %s", oeMolDbFilePath,
                             str(e))
        return molDb

    def createOeBinaryDatabaseAndIndex(self, oebMolFilePath, oeMolDbFilePath):
        """Create OE binary database file and associated index from the input serial
        binary data file.

        Args:
            oebMolFilePath (str): input OeMol stream binary file path
            oeMolDbFilePath (str): output OeMolDatabase file path

        Returns:
           int:  number of molecules processed in the database.
        """
        molCount = 0
        try:
            startTime = time.time()
            moldb = oechem.OEMolDatabase()
            if not moldb.Open(oebMolFilePath):
                logger.error("Read fails for %r", oebMolFilePath)
                return molCount
            #
            logger.info(
                "Opened database in format %r num mols %d max index %d",
                moldb.GetFormat(), moldb.NumMols(), moldb.GetMaxMolIdx())
            moldb.Save(oeMolDbFilePath)
            tL = list(moldb.GetTitles())
            logger.info("First and last titles: %r %r", tL[0], tL[-1])
            molCount = moldb.NumMols()
            endTime = time.time()
            logger.info("Completed operation at %s (%.4f seconds)",
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        endTime - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return molCount

    def buildOeBinaryMolCache(self,
                              filePath,
                              ccObjD,
                              molBuildType="model-xyz",
                              quietFlag=False,
                              fpTypeList=None,
                              limitPerceptions=False,
                              suppressHydrogens=False):
        """Build cache of OEMol() objects from the input chemical component definition list.

        Args:
            filePath (str): output cache file path
            ccObjD (dict):  chemical component object dictionary
            molBuildType (str, optional): coordinate/descriptor source used to build each OE molecule. Defaults to "model-xyz".
            quietFlag (bool, optional): suppress OE warnings and processing errors. Defaults to False.
            fpTypeList (list, optional): fingerprint type list. Defaults to None.
            limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
            suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

        Returns:
            (int, int, list): chem comp success count, error count, chem comp identifier failure list

        """
        ok = False
        startTime = time.time()
        failIdList = []
        ccCount = 0
        errCount = 0
        try:
            ofs = oechem.oemolostream()
            ofs.SetFormat(oechem.OEFormat_OEB)
            if ofs.open(filePath):
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for ccId, ccObj in ccObjD.items():
                    tId = oemf.setChemCompDef(ccObj)
                    if tId and tId == ccId:
                        ok = oemf.build(molBuildType=molBuildType,
                                        limitPerceptions=limitPerceptions)
                        if ok and fpTypeList:
                            fpOk = oemf.addFingerPrints(fpTypeList)
                            if not fpOk:
                                logger.info(
                                    "Fingerprint generation fails for %r",
                                    ccId)
                        if ok:
                            oeMol = oemf.getMol(
                                suppressHydrogens=suppressHydrogens)
                            oechem.OEWriteMolecule(ofs, oeMol)
                            ccCount += 1
                    if not ok or not tId:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(ccId)
            else:
                logger.error("Unable to open cache database %s", filePath)
                errCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return ccCount, errCount, failIdList

    #
    def buildOeBinaryMolCacheFromIndex(self,
                                       filePath,
                                       ccIdxD,
                                       quietFlag=False,
                                       fpTypeList=None,
                                       limitPerceptions=False,
                                       suppressHydrogens=False):
        """Build cache of OEGraphMol() objects from the input chemical component search index.

        Args:
            filePath (str): output cache file path
            ccIdxD (dict): search index dictionary
            quietFlag (bool, optional): suppress OE output. Defaults to False.
            fpTypeList (list, optional): list of fingerprint types. Defaults to None.
            limitPerceptions (bool, optional): suppress automatic chemical perceptions. Defaults to False.
            suppressHydrogens (bool, optional): suppress explicit hydrogen count. Defaults to False.

        Returns:
            (int, int, list): chem comp success count, error count, chem comp identifier failure list
        """
        failIdList = []
        ccCount = 0
        errCount = 0
        startTime = time.time()
        try:
            ofs = oechem.oemolostream()
            ofs.SetFormat(oechem.OEFormat_OEB)
            if ofs.open(filePath):
                oemf = OeMoleculeFactory()
                if quietFlag:
                    oemf.setQuiet()
                for searchCcId, ccIdx in ccIdxD.items():
                    oemf.setDescriptor(ccIdx["smiles"], "oe-iso-smiles",
                                       searchCcId)
                    ok = oemf.build(molBuildType="oe-iso-smiles",
                                    limitPerceptions=limitPerceptions)
                    if ok and fpTypeList:
                        fpOk = oemf.addFingerPrints(fpTypeList)
                        if not fpOk:
                            logger.info("Fingerprint generation fails for %r",
                                        searchCcId)
                    if ok:
                        if not suppressHydrogens:
                            oemf.addExplicitHydrogens()
                            oemf.setSimpleAtomNames()
                        oeMol = oemf.getMol(
                            suppressHydrogens=suppressHydrogens)
                        oechem.OEWriteMolecule(ofs, oeMol)
                        ccCount += 1
                    if not ok:
                        # build failed incomplete component (e.g. missing atoms or bonds)
                        errCount += 1
                        failIdList.append(searchCcId)
            else:
                logger.error("Unable to open cache database %s", filePath)
                errCount += 1
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        endTime = time.time()
        logger.info("Completed operation at %s (%.4f seconds)",
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - startTime)
        return ccCount, errCount, failIdList

    def createOeSubSearchDatabase(self,
                                  oebMolFilePath,
                                  oeSubSearchFilePath,
                                  screenType="SMARTS",
                                  numProc=2):
        sort = True
        keepTitle = True
        myScreenType = None
        if screenType == "MOLECULE":
            myScreenType = oechem.OEGetSubSearchScreenType(
                oechem.OESubSearchScreenType_Molecule)
        elif screenType == "MDL":
            myScreenType = oechem.OEGetSubSearchScreenType(
                oechem.OESubSearchScreenType_MDL)
        elif screenType == "SMARTS":
            myScreenType = oechem.OEGetSubSearchScreenType(
                oechem.OESubSearchScreenType_SMARTS)

        opts = oechem.OECreateSubSearchDatabaseOptions(myScreenType)
        opts.SetSortByBitCounts(sort)
        opts.SetKeepTitle(keepTitle)
        opts.SetNumProcessors(numProc)

        screenStr = myScreenType.GetName()
        logger.info("Using %d processor(s) to generate database with %s",
                    numProc, screenStr)

        tracer = oechem.OEConsoleProgressTracer()
        ok = oechem.OECreateSubSearchDatabaseFile(oeSubSearchFilePath,
                                                  oebMolFilePath, opts, tracer)
        return ok

    def loadOeSubSearchDatabase(self,
                                oeSubSearchFilePath,
                                screenType=None,
                                numProc=1):
        ssDb = None
        try:
            _ = screenType
            ssDb = oechem.OESubSearchDatabase(
                oechem.OESubSearchDatabaseType_Default, numProc)
            tracer = oechem.OEConsoleProgressTracer()
            if not ssDb.Open(oeSubSearchFilePath, tracer):
                logger.error("Unable to open %r", oeSubSearchFilePath)
            logger.info("Opened %r with %r molecules", oeSubSearchFilePath,
                        ssDb.NumMolecules())
        except Exception as e:
            logger.exception("Loading %r failing with %s", oeSubSearchFilePath,
                             str(e))
        return ssDb

    def write(self, filePath, oeMol, constantMol=False, addSdTags=True):
        """Write an oeMol with format type inferred from the filePath extension (e.g. .mol)

        Args:
            filePath (str): file path with a chemical format extension (e.g. .mol, .sdf, .mol2)
            oeMol (object): OE molecule object to write
            constantMol (bool, optional): copy the molecule before performing format-specific perceptions. Defaults to False.
            addSdTags (bool, optional): add SD data tags to the output molecule. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        try:
            molId = os.path.splitext(os.path.basename(filePath))[0]
            fmt = os.path.splitext(os.path.basename(filePath))[1][1:].lower()
            #
            if addSdTags:
                oemf = OeMoleculeFactory()
                oemf.setOeMol(oeMol, molId)
                oemf.addSdTags()
                oeMol = oemf.getMol()
            #
            self.__mU.mkdir(os.path.dirname(filePath))
            ofs = oechem.oemolostream()
            ofs.open(filePath)
            logger.debug("Writing (fmt=%s) molId %s path %s title %s", fmt,
                         molId, filePath, oeMol.GetTitle())
            #
            if constantMol:
                oechem.OEWriteConstMolecule(ofs, oeMol)
            else:
                oechem.OEWriteMolecule(ofs, oeMol)
            #
            # If this is a mol2 file, we need to replace the resname
            if fmt.startswith("mol2"):
                # If this is a mol2/mol2h substitute the default substructure id
                with open(filePath, "r", encoding="utf-8") as ifh:
                    lines = ifh.readlines()
                lines = [line.replace("<0>", molId) for line in lines]
                with open(filePath, "w", encoding="utf-8") as ofh:
                    ofh.writelines(lines)
            return True
        except Exception as e:
            logger.exception("Failing for %s with %s", filePath, str(e))
        return False
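
    # Illustrative call (not part of the original source): the output format is
    # inferred from the file extension, so writing "./out/ATP.mol2" emits a mol2
    # file and then rewrites the default "<0>" substructure id to "ATP".
    #   ok = OeIoUtils().write("./out/ATP.mol2", oeMol, constantMol=True)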

    def serializeOe(self, oeMol):
        """Create a string representing the content of the current OE molecule.   This
        serialization uses the OE internal binary format.
        """
        try:
            oms = oechem.oemolostream()
            oms.SetFormat(oechem.OEFormat_OEB)
            oms.openstring()
            oechem.OEWriteMolecule(oms, oeMol)
            logger.debug("SMILES %s", oechem.OECreateCanSmiString(oeMol))
            logger.debug("Atoms = %d", oeMol.NumAtoms())
            return oms.GetString()
        except Exception as e:
            logger.exception("Failing with %s", str(e))

    def deserializeOe(self, oeS):
        """Reconstruct an OE molecule from the input string serialization (OE binary).

        The deserialized molecule is used to initialize the internal OE molecule
        within this object.

        Returns:
            list:  OE GraphMol list
        """
        molList = []
        try:
            ims = oechem.oemolistream()
            ims.SetFormat(oechem.OEFormat_OEB)
            ims.openstring(oeS)
            for mol in ims.GetOEGraphMols():
                logger.debug("SMILES %s", oechem.OECreateCanSmiString(mol))
                logger.debug("title  %s", mol.GetTitle())
                logger.debug("atoms  %d", mol.NumAtoms())
                molList.append(oechem.OEGraphMol(mol))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return molList
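
# Hedged usage sketch (not part of the original source): a SMILES -> OEGraphMol
# -> binary serialization round trip using the OeIoUtils methods defined above.
# Running it requires a licensed OpenEye toolkit installation.
if __name__ == "__main__":
    oeioU = OeIoUtils(quietFlag=True)
    aspirinMol = oeioU.smilesToMol("CC(=O)Oc1ccccc1C(=O)O", messageTag="aspirin")
    if aspirinMol:
        oebS = oeioU.serializeOe(aspirinMol)
        molL = oeioU.deserializeOe(oebS)
        logger.info("Round trip recovered %d molecule(s) with %d atoms", len(molL), molL[0].NumAtoms())
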
Esempio n. 30
0
class EntryInfoProvider(StashableBase):
    """Accessors and generators for entry-level annotations."""
    def __init__(self, **kwargs):
        #
        self.__version = "0.50"
        cachePath = kwargs.get("cachePath", ".")
        useCache = kwargs.get("useCache", True)
        self.__dirName = "rcsb_entry_info"
        self.__dirPath = os.path.join(cachePath, self.__dirName)
        super(EntryInfoProvider, self).__init__(cachePath, [self.__dirName])
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__entryInfoD = self.__reload(fmt="json", useCache=useCache)
        #

    def testCache(self, minCount=1):
        if minCount == 0:
            return True
        if self.__entryInfoD and minCount and "entryInfo" in self.__entryInfoD and len(
                self.__entryInfoD["entryInfo"]) > minCount:
            logger.info("Entry annotations for (%d) entries",
                        len(self.__entryInfoD["entryInfo"]))
            return True
        return False

    def getEntryInfo(self, entryId):
        """Return a dictionary of entry-level annotations.

        Returns:
            (dict): of entry-level annotations
        """
        try:
            eiD = self.__entryInfoD["entryInfo"]
            return eiD[entryId.upper()] if entryId.upper() in eiD else {}
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return {}

    def getEntriesByPolymerEntityCount(self, count):
        oL = []
        try:
            for entryId, eD in self.__entryInfoD["entryInfo"].items():
                if eD["polymer_entity_count"] == count:
                    oL.append(entryId)
        except Exception as e:
            logger.error("Failing with %r", str(e))
        return oL

    def __getEntryInfoFilePath(self, fmt="json"):
        baseFileName = "entry_info_details"
        fExt = ".json" if fmt == "json" else ".pic"
        fp = os.path.join(self.__dirPath, baseFileName + fExt)
        return fp

    def update(self, cfgOb, fmt="json", indent=3):
        """Update branched entity glycan accession mapping cache.

        Args:
            cfgObj (object): ConfigInfo() object instance

        Returns:
            (bool): True for success for False otherwise
        """
        ok = False
        try:
            entryInfoD = self.__updateEntryInfo(cfgOb)

            logger.info("Got entry_info for (%d)", len(entryInfoD))
            #
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            self.__entryInfoD = {
                "version": self.__version,
                "created": tS,
                "entryInfo": entryInfoD
            }
            #
            infoFilePath = self.__getEntryInfoFilePath(fmt=fmt)
            kwargs = {"indent": indent} if fmt == "json" else {}
            ok = self.__mU.doExport(infoFilePath,
                                    self.__entryInfoD,
                                    fmt=fmt,
                                    **kwargs)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reload(self):
        """Reload from the current cache file."""
        ok = False
        try:
            self.__entryInfoD = self.__reload(fmt="json", useCache=True)
            ok = self.__entryInfoD is not None
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __reload(self, fmt="json", useCache=True):
        entryInfoFilePath = self.__getEntryInfoFilePath(fmt=fmt)
        tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        pcD = {"version": self.__version, "created": tS, "identifiers": {}}

        if useCache and self.__mU.exists(entryInfoFilePath):
            logger.info("Reading entry-info cached path %r", entryInfoFilePath)
            pcD = self.__mU.doImport(entryInfoFilePath, fmt=fmt)
        return pcD

    def __updateEntryInfo(self, cfgOb):
        """Get entry_info data"""
        rD = {}
        try:
            obEx = ObjectExtractor(
                cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_entry",
                useCache=False,
                keyAttribute="entry",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={},
                selectionList=[
                    "rcsb_id", "rcsb_entry_info.polymer_entity_count"
                ],
            )
            #
            eCount = obEx.getCount()
            logger.info("Entry count is %d", eCount)

            objD = obEx.getObjects()
            for _, eD in objD.items():
                rcsbId = eD["rcsb_id"]
                try:
                    rD[rcsbId] = eD["rcsb_entry_info"]
                except Exception:
                    pass
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return rD
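
# Hedged usage sketch (not part of the original source): the generator variant of
# EntryInfoProvider defined above rebuilds its cache from the "pdbx_core" document
# store and then serves lookups from the refreshed file.  It assumes a ConfigInfo-style
# configuration object (cfgOb) and access to the underlying document database;
# "4HHB" is only an illustrative entry identifier.
#
#   eiP = EntryInfoProvider(cachePath="./CACHE", useCache=False)
#   ok = eiP.update(cfgOb, fmt="json", indent=3)
#   ok = ok and eiP.reload()
#   infoD = eiP.getEntryInfo("4HHB")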