Beispiel #1
0
    def __getPolymerReferenceSequenceAssignments(self, databaseName, collectionName, polymerType, fetchLimit):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Args:
            databaseName (str): name of the database to query
            collectionName (str): name of the collection to query
            polymerType (str): value matched against "entity_poly.rcsb_entity_polymer_type"
            fetchLimit (int): forwarded to the extractor as objectLimit

        Returns:
            (dict): the objects returned by the extractor, expected to look like
                    {"1abc_1": {"rcsb_polymer_entity_container_identifiers": {"reference_sequence_identifiers": []},
                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
                    An empty dictionary is returned when the extraction fails.
        """
        # Bind the result before the try block so that the final return cannot raise
        # UnboundLocalError when the extractor fails before getObjects() completes.
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=None,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=None,
                objectLimit=fetchLimit,
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType},
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_polymer_entity_container_identifiers.auth_asym_ids",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count type %s is %d", polymerType, eCount)
            objD = obEx.getObjects()
            logger.info("Reading polymer entity count %d reference accession length %d ", eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName, collectionName, str(e))
        return objD
Beispiel #2
0
 def __extractCitations(self):
     """Fetch the "rcsb_id" and "citation" content of every object in the configured collection.

     Returns:
         the result of the extractor's getObjects() call, or an empty dictionary
         when the extraction raises.
     """
     citationD = {}
     try:
         extractor = ObjectExtractor(
             self.__cfgOb,
             databaseName=self.__databaseName,
             collectionName=self.__collectionName,
             cacheFilePath=None,
             useCache=False,
             keyAttribute="entry",
             uniqueAttributes=["rcsb_id"],
             cacheKwargs=None,
             objectLimit=None,
             selectionQuery={},
             selectionList=["rcsb_id", "citation"],
         )
         logger.info("Entry count is %d", extractor.getCount())
         # Only rebound on success; a failure leaves the empty default in place.
         citationD = extractor.getObjects()
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return citationD
Beispiel #3
0
    def __updateEntryInfo(self, cfgOb):
        """Collect the "rcsb_entry_info" content of each pdbx_core entry keyed by its "rcsb_id".

        Args:
            cfgOb: configuration object handed to ObjectExtractor

        Returns:
            dict: {rcsb_id: rcsb_entry_info content, ...} for the objects carrying that
                  content; whatever was gathered before a failure is still returned.
        """
        entryInfoD = {}
        try:
            extractor = ObjectExtractor(
                cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_entry",
                useCache=False,
                keyAttribute="entry",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={},
                selectionList=["rcsb_id", "rcsb_entry_info.polymer_entity_count"],
            )
            logger.info("Entry count is %d", extractor.getCount())

            for entryD in extractor.getObjects().values():
                entryId = entryD["rcsb_id"]
                try:
                    entryInfoD[entryId] = entryD["rcsb_entry_info"]
                except Exception:
                    # objects without entry info are skipped silently
                    pass
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryInfoD
 def testExtractEntriesBefore(self):
     """Test case - extract entries subject to date restriction

     The selection keeps entries whose initial release date is greater than a
     timestamp computed with before={"days": 365 * 5}; at least 6 entries are expected.
     """
     try:
         timeUtil = TimeUtil()
         cutoffDt = timeUtil.getDateTimeObj(timeUtil.getTimestamp(useUtc=True, before={"days": 365 * 5}))
         dateQuery = {"rcsb_accession_info.initial_release_date": {"$gt": cutoffDt}}
         extractor = ObjectExtractor(
             self.__cfgOb,
             databaseName="pdbx_core",
             collectionName="pdbx_core_entry",
             useCache=False,
             keyAttribute="entry",
             uniqueAttributes=["rcsb_id"],
             selectionQuery=dateQuery,
             selectionList=["rcsb_id", "rcsb_accession_info"],
         )
         # objects are fetched before the count is read, as in the original call order
         entryD = extractor.getObjects()
         entryCount = extractor.getCount()
         logger.info("Entry count is %d", entryCount)
         logger.info("Entries are %r", list(entryD.keys()))
         self.assertGreaterEqual(entryCount, 6)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
    def __getPolymerReferenceSequenceAssignments(self, databaseName,
                                                 collectionName, polymerType,
                                                 **kwargs):
        """Get all accessions assigned to input reference sequence database for the input polymerType.

        Args:
            databaseName (str): name of the database to query
            collectionName (str): name of the collection to query
            polymerType (str): value matched against "entity_poly.rcsb_entity_polymer_type"
            **kwargs: optional settings read by this method
                cachePath (str): base directory of the cache file (default ".")
                cacheKwargs (dict): passed to the extractor (default {"fmt": "json", "indent": 3})
                useCache (bool): passed to the extractor (default True)
                fetchLimit (int): passed to the extractor as objectLimit (default None)

        Returns:
            (dict): the objects returned by the extractor, expected to look like
                    {"1abc_1": {"rcsb_entity_container_identifiers": {"reference_sequence_identifiers": []},
                                "rcsb_polymer_entity_align": [],
                                "rcsb_entity_source_organism": {"ncbi_taxonomy_id": []}}, ...}
                    An empty dictionary is returned when the extraction fails.
        """
        cachePath = kwargs.get("cachePath", ".")
        exDbDir = "exdb"
        cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "json", "indent": 3})
        useCache = kwargs.get("useCache", True)
        fetchLimit = kwargs.get("fetchLimit", None)
        cacheFilePath = os.path.join(cachePath, exDbDir,
                                     "entity-poly-ref-seq-assign-cache.json")
        #
        # Bind the result before the try block so that the final return cannot raise
        # UnboundLocalError when the extractor fails before getObjects() completes.
        objD = {}
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName=databaseName,
                collectionName=collectionName,
                cacheFilePath=cacheFilePath,
                useCache=useCache,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=cacheKwargs,
                objectLimit=fetchLimit,
                selectionQuery={
                    "entity_poly.rcsb_entity_polymer_type": polymerType
                },
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_container_identifiers.reference_sequence_identifiers",
                    "rcsb_entity_container_identifiers.auth_asym_ids",
                    "rcsb_polymer_entity_align",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                ],
            )
            eCount = obEx.getCount()
            logger.info("Entity count is %d", eCount)
            objD = obEx.getObjects()
            logger.info(
                "Reading polymer entity entity count %d ref accession length %d ",
                eCount, len(objD))
            #
        except Exception as e:
            logger.exception("Failing for %s (%s) with %s", databaseName,
                             collectionName, str(e))
        return objD
Beispiel #6
0
 def __getReferenceData(self, databaseName, collectionName, selectD=None):
     """Fetch the objects of a collection that match an optional selection query.

     Args:
         databaseName (str): database to search
         collectionName (str): collection to search
         selectD (dict, optional): selection query passed to the extractor. Defaults to None.

     Returns:
         the result of the extractor's getObjects() call
     """
     logger.info("Searching %s %s with selection query %r", databaseName, collectionName, selectD)
     extractor = ObjectExtractor(
         self.__cfgOb,
         databaseName=databaseName,
         collectionName=collectionName,
         keyAttribute="rcsb_id",
         uniqueAttributes=["rcsb_id"],
         selectionQuery=selectD,
     )
     logger.debug("Reference data match count %d", extractor.getCount())
     return extractor.getObjects()
 def __extractLigandNeighbors(self):
     """Gather the unique (ligand_comp_id, ligand_is_bound) pairs listed as ligand neighbors
     of the polymer entity instances, grouped by "<entry_id>_<entity_id>".

     Returns:
         dict: {"<entry_id>_<entity_id>": [(ligand_comp_id, ligand_is_bound), ...], ...}
               or None when the extraction as a whole fails.
     """
     try:
         dbName = "pdbx_core"
         collName = "pdbx_core_polymer_entity_instance"
         extractor = ObjectExtractor(
             self.__cfgOb,
             databaseName=dbName,
             collectionName=collName,
             cacheFilePath=None,
             useCache=False,
             keyAttribute="rcsb_id",
             uniqueAttributes=["rcsb_id"],
             cacheKwargs=None,
             objectLimit=None,
             selectionQuery=None,
             selectionList=[
                 "rcsb_id",
                 "rcsb_polymer_entity_instance_container_identifiers.entry_id",
                 "rcsb_polymer_entity_instance_container_identifiers.entity_id",
                 "rcsb_polymer_entity_instance_container_identifiers.asym_id",
                 "rcsb_ligand_neighbors.ligand_comp_id",
                 "rcsb_ligand_neighbors.ligand_is_bound",
             ],
         )
         logger.info("Total neighbor count (%d)", extractor.getCount())
         neighborSetD = {}
         for instD in extractor.getObjects().values():
             # a failure on one instance is logged and does not stop the remaining ones
             try:
                 idD = instD["rcsb_polymer_entity_instance_container_identifiers"]
                 entryId = idD["entry_id"]
                 entityId = idD["entity_id"]
                 groupKey = entryId + "_" + entityId
                 neighborL = instD["rcsb_ligand_neighbors"] if "rcsb_ligand_neighbors" in instD else []
                 for nbrD in neighborL:
                     if not ("ligand_comp_id" in nbrD and "ligand_is_bound" in nbrD):
                         logger.warning("%s %s missing details lnD %r", entryId, entityId, nbrD)
                         continue
                     neighborSetD.setdefault(groupKey, set()).add((nbrD["ligand_comp_id"], nbrD["ligand_is_bound"]))
             except Exception as e:
                 logger.exception("Failing with %s", str(e))
         # sets are converted to lists for the returned mapping
         resultD = {ky: list(pairS) for ky, pairS in neighborSetD.items()}
         logger.info("Unique instance %d", len(resultD))
         return resultD
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return None
Beispiel #8
0
    def getReferenceSequenceDetails(self):
        """Get reference protein sequence essential details (sequence, taxonomy, name, gene, ...)

        Returns:
            dict: {accession: {"accession": ..., "taxId": ..., "scientific_name": ...,
                               "gene": <primary gene name or None>,
                               "name": <recommended name or None>,
                               "sequence": ...}, ...}
                  An empty dictionary is returned when the extractor fails; entries
                  processed before a failure inside the loop are still returned.
        """
        # Bind the result before the try block: the original bound it only after
        # getObjects(), so an earlier failure made the final return raise UnboundLocalError.
        rD = {}
        # Tracks the object being processed so the failure log can report it.
        uD = None
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="uniprot_exdb",
                collectionName="reference_entry",
                useCache=False,
                keyAttribute="uniprot",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={},
                selectionList=[
                    "source_scientific",
                    "taxonomy_id",
                    "rcsb_id",
                    "gene",
                    "names",
                    "sequence",
                ],
            )
            #
            eCount = obEx.getCount()
            logger.info("Reference entry count is %d", eCount)
            objD = obEx.getObjects()
            for rId, uD in objD.items():
                taxId = uD["taxonomy_id"]
                sn = uD["source_scientific"]
                sequence = uD["sequence"]
                gn = None
                pn = None
                # "gene" is optional; the first gene of type "primary" is used
                if "gene" in uD:
                    for tD in uD["gene"]:
                        if tD["type"] == "primary":
                            gn = tD["name"]
                            break
                # the first name flagged as "recommendedName" is used
                for tD in uD["names"]:
                    if tD["nameType"] == "recommendedName":
                        pn = tD["name"]
                        break
                rD[rId] = {"accession": rId, "taxId": taxId, "scientific_name": sn, "gene": gn, "name": pn, "sequence": sequence}

        except Exception as e:
            logger.exception("Failing uD %r with %s", uD, str(e))
        #
        return rD
Beispiel #9
0
    def getBranchedDetails(self):
        """Get branched entity details (BIRD mapping and WURCS descriptors)

        Returns:
            dict: {rcsb_id: {"prdId": <prd_id or None>, "wurcs": <WURCS descriptor or None>}, ...}
                  holding only the entities for which at least one of the two values was found.
        """
        detailD = {}
        try:
            extractor = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_branched_entity",
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={},
                selectionList=["rcsb_id", "pdbx_entity_branch_descriptor", "rcsb_branched_entity_container_identifiers"],
            )
            for entityD in extractor.getObjects().values():
                entityId = entityD["rcsb_id"]
                #
                prdId = None
                try:
                    prdId = entityD["rcsb_branched_entity_container_identifiers"]["prd_id"]
                except Exception:
                    pass
                #
                wurcs = None
                try:
                    # no break: the last descriptor of type WURCS is the one kept
                    for descD in entityD["pdbx_entity_branch_descriptor"]:
                        if descD["type"] == "WURCS":
                            wurcs = descD["descriptor"]
                except Exception:
                    pass
                if not (prdId or wurcs):
                    continue
                detailD[entityId] = {"prdId": prdId, "wurcs": wurcs}

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return detailD
Beispiel #10
0
    def getChemCompAccessionMapping(self, referenceResourceName):
        """Map the accession codes of an external reference resource to chemical component identifiers.

        Args:
            referenceResourceName (str):  resource name (e.g. DrugBank, ChEMBL, CCDC)

        Returns:
            dict: {resource_accession_code: [comp_id, ...], ...}; empty or partial if the search fails

        """
        accessionD = {}
        try:
            dbName = "bird_chem_comp_core"
            collName = "bird_chem_comp_core"
            queryD = {"rcsb_chem_comp_related.resource_name": referenceResourceName}
            logger.info("Searching %s %s with selection query %r", dbName, collName, queryD)
            extractor = ObjectExtractor(
                self.__cfgOb,
                databaseName=dbName,
                collectionName=collName,
                keyAttribute="rcsb_id",
                uniqueAttributes=["rcsb_id"],
                selectionQuery=queryD,
                selectionList=["rcsb_id", "rcsb_chem_comp_related"],
                stripObjectId=True,
            )
            logger.info("Reference data object count %d", extractor.getCount())
            for docD in extractor.getObjects().values():
                if "rcsb_chem_comp_related" not in docD:
                    continue
                for relD in docD["rcsb_chem_comp_related"]:
                    # keep only records of the requested resource that carry an accession code
                    if relD["resource_name"] != referenceResourceName or "resource_accession_code" not in relD:
                        continue
                    accessionD.setdefault(relD["resource_accession_code"], []).append(relD["comp_id"])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return accessionD
    def testExtractEntityTaxonomyContent(self):
        """Test case - extract unique entity source and host taxonomies"""
        try:
            cachePath = os.path.join(self.__workPath, "entity-taxonomy-test-cache.json")
            extractor = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                cacheFilePath=cachePath,
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=self.__testEntryCacheKwargs,
                objectLimit=None,
                selectionQuery=None,
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    "rcsb_entity_host_organism.ncbi_taxonomy_id",
                ],
            )
            logger.info("Polymer entity count is %d", extractor.getCount())
            uniqueTaxIdS = set()
            for entityD in extractor.getObjects().values():
                # source organisms first, then host organisms; either may be absent
                for organismKey in ("rcsb_entity_source_organism", "rcsb_entity_host_organism"):
                    try:
                        for orgD in entityD[organismKey]:
                            uniqueTaxIdS.add(orgD["ncbi_taxonomy_id"])
                    except Exception:
                        pass

            logger.info("Unique taxons %d", len(uniqueTaxIdS))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #12
0
    def testExtractEntityTaxonomyContent(self):
        """Test case - export the unique source organism taxonomy identifiers of each polymer entity

        One tab-separated line "<rcsb_id> <taxId>|<taxId>|..." is built per entity that has
        source organism taxonomy identifiers, and the lines are exported in "list" format
        to the configured entity taxon path. Host organism identifiers are selected by the
        query but are not part of the export.
        """
        tL = []
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                selectionQuery=None,
                selectionList=["rcsb_id", "rcsb_entity_source_organism.ncbi_taxonomy_id", "rcsb_entity_host_organism.ncbi_taxonomy_id"],
            )
            eCount = obEx.getCount()
            logger.info("Polymer entity count is %d", eCount)
            objD = obEx.getObjects()
            sD = {}
            for rId, eD in objD.items():
                try:
                    for tD in eD["rcsb_entity_source_organism"]:
                        sD.setdefault(rId, []).append(str(tD["ncbi_taxonomy_id"]))
                except Exception:
                    # best effort: entities lacking source organism details are skipped
                    pass
            for rId, taxIdL in sD.items():
                tS = "|".join(sorted(set(taxIdL)))
                # Append inside the guard: appending outside it either raised on an
                # unbound line variable or re-exported the previous entity's line.
                if tS:
                    tL.append("%s\t%s" % (rId, tS))
            self.__mU.doExport(self.__entityTaxonPath, tL, fmt="list")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
    def testExtractEntities(self):
        """Test case - extract entities

        Fetches polymer entity objects up to the test object limit, builds the extractor's
        path list from them and, in verbose mode, logs the values extracted for each object.
        """
        try:
            extractor = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                cacheFilePath=os.path.join(self.__workPath, "entity-data-test-cache.json"),
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=self.__testEntryCacheKwargs,
                objectLimit=self.__objectLimitTest,
            )
            entityCount = extractor.getCount()
            logger.info("Entity count is %d", entityCount)
            self.assertGreaterEqual(entityCount, self.__objectLimitTest)

            entityD = extractor.getObjects()
            for entityObj in entityD.values():
                extractor.genPathList(entityObj, path=None)
            #
            logger.debug("Path list (unfiltered) %r", extractor.getPathList(filterList=False))
            #
            # the filtered path list is set back on the extractor before values are generated
            pathL = extractor.getPathList()
            logger.debug("Path list %r", pathL)
            extractor.setPathList(pathL)
            if self.__verbose:
                for ky, entityObj in entityD.items():
                    extractor.genValueList(entityObj, path=None)
                    valueD = extractor.getValues()
                    logger.info("Index object %r %s", ky, pprint.pformat(valueD, indent=3, width=120))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #14
0
 def __extractEntityTaxons(self):
     """Collect the unique NCBI taxonomy identifiers found in the source and host
     organism records of the objects in the configured collection.

     Returns:
         list: unique taxonomy identifiers, or None when the extraction fails
     """
     try:
         extractor = ObjectExtractor(
             self.__cfgOb,
             databaseName=self.__databaseName,
             collectionName=self.__collectionName,
             cacheFilePath=None,
             useCache=False,
             keyAttribute="entity",
             uniqueAttributes=["rcsb_id"],
             cacheKwargs=None,
             objectLimit=None,
             selectionQuery=None,
             selectionList=[
                 "rcsb_id",
                 "rcsb_entity_source_organism.ncbi_taxonomy_id",
                 "rcsb_entity_host_organism.ncbi_taxonomy_id",
             ],
         )
         logger.info("Polymer entity count is %d", extractor.getCount())
         uniqueTaxIdS = set()
         for entityD in extractor.getObjects().values():
             # source organisms first, then host organisms; either may be absent
             for organismKey in ("rcsb_entity_source_organism", "rcsb_entity_host_organism"):
                 try:
                     for orgD in entityD[organismKey]:
                         uniqueTaxIdS.add(orgD["ncbi_taxonomy_id"])
                 except Exception:
                     pass
         logger.info("Unique taxons %d", len(uniqueTaxIdS))
         return list(uniqueTaxIdS)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return None
Beispiel #15
0
 def __extractEntityAnnotationIdentifiers(self, annotationType):
     """Extract unique rcsb_polymer_entity_annotation ids for the input annotation type.

     Args:
         annotationType (str): value compared with the "type" of each annotation record

     Returns:
         list: unique annotation identifiers, or None when the extraction fails
     """
     try:
         dbName = "pdbx_core"
         collName = "pdbx_core_polymer_entity"
         extractor = ObjectExtractor(
             self.__cfgOb,
             databaseName=dbName,
             collectionName=collName,
             cacheFilePath=None,
             useCache=False,
             keyAttribute="entity",
             uniqueAttributes=["rcsb_id"],
             cacheKwargs=None,
             objectLimit=None,
             selectionQuery=None,
             selectionList=[
                 "rcsb_id",
                 "rcsb_polymer_entity_annotation.annotation_id",
                 "rcsb_polymer_entity_annotation.type",
             ],
         )
         logger.info("For type %r polymer entity annotation object count is %d", annotationType, extractor.getCount())
         annotationIdS = set()
         # all objects are fetched; the type filter is applied here rather than in the query
         for entityD in extractor.getObjects().values():
             try:
                 for annD in entityD["rcsb_polymer_entity_annotation"]:
                     if annD["type"] != annotationType:
                         continue
                     annotationIdS.add(annD["annotation_id"])
             except Exception:
                 pass
         logger.info("Unique identifiers %d", len(annotationIdS))
         return list(annotationIdS)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return None
    def testExtractSelectedEntityContent(self):
        """Test case - extract selected entity content

        Fetches the reference sequence identifiers of the polymer entities selected with
        polymer type "Protein" and compares, per entity, the UniProt accessions whose
        provenance source is "PDB" with those whose provenance source is "SIFTS".
        Summary counts are logged; the test fails only if an exception reaches the
        outer handler.

        Example of the content examined for each entity:

        "reference_sequence_identifiers": [
                    {
                        "database_name": "UniProt",
                        "database_accession": "Q5SHN1",
                        "provenance_source": "SIFTS"
                    },
                    {
                        "database_name": "UniProt",
                        "database_accession": "Q5SHN1",
                        "provenance_source": "PDB"
                    }
                    ]
        """
        try:
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                cacheFilePath=os.path.join(
                    self.__workPath,
                    "entity-selected-content-test-cache.json"),
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                cacheKwargs=self.__testEntryCacheKwargs,
                # objectLimit=self.__objectLimitTest,
                objectLimit=None,
                selectionQuery={
                    "entity_poly.rcsb_entity_polymer_type": "Protein"
                },
                selectionList=[
                    "rcsb_id",
                    "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"
                ],
            )
            eCount = obEx.getCount()
            logger.info("Entity count is %d", eCount)
            #
            # The count assertion and the path/value introspection below only run
            # when an object limit is configured for the test.
            if self.__objectLimitTest is not None:
                self.assertGreaterEqual(eCount, self.__objectLimitTest)
                objD = obEx.getObjects()
                for _, obj in objD.items():
                    obEx.genPathList(obj, path=None)
                #
                pL = obEx.getPathList(filterList=False)
                logger.debug("Path list (unfiltered) %r", pL)
                #
                pL = obEx.getPathList()
                logger.debug("Path list %r", pL)
                obEx.setPathList(pL)
                if self.__verbose:
                    for ky, obj in objD.items():
                        obEx.genValueList(obj, path=None)
                        tD = obEx.getValues()
                        logger.info("Index object %r %s", ky,
                                    pprint.pformat(tD, indent=3, width=120))

            # Fetched (again) here because the assignment above only happens inside
            # the conditional branch.
            objD = obEx.getObjects()
            # logger.info("objD.keys() %r", list(objD.keys()))
            # totCount: entities having both PDB and SIFTS UniProt assignments
            # difCount: those of them whose two accession sets differ
            totCount = 0
            difCount = 0
            # Occurrence counts per UniProt accession: PDB-sourced, SIFTS-sourced, and
            # PDB-sourced accessions of entities whose PDB and SIFTS sets differ.
            pdbUnpIdD = defaultdict(int)
            siftsUnpIdD = defaultdict(int)
            pdbDifUnpIdD = defaultdict(int)
            for entityKey, eD in objD.items():
                try:
                    siftsS = set()
                    pdbS = set()
                    for tD in eD["rcsb_polymer_entity_container_identifiers"][
                            "reference_sequence_identifiers"]:
                        if tD["database_name"] == "UniProt":
                            # UniProt records with any other provenance source are ignored
                            if tD["provenance_source"] == "SIFTS":
                                siftsS.add(tD["database_accession"])
                                siftsUnpIdD[tD["database_accession"]] += 1
                            elif tD["provenance_source"] == "PDB":
                                pdbS.add(tD["database_accession"])
                                pdbUnpIdD[tD["database_accession"]] += 1
                        else:
                            # record refers to a database other than UniProt
                            logger.debug(
                                "No UniProt for %r",
                                eD["rcsb_polymer_entity_container_identifiers"]
                            )
                    logger.debug("PDB assigned sequence length %d", len(pdbS))
                    logger.debug("SIFTS assigned sequence length %d",
                                 len(siftsS))

                    # only entities with assignments from both sources are compared
                    if pdbS and siftsS:
                        totCount += 1
                        if pdbS != siftsS:
                            difCount += 1
                            for idV in pdbS:
                                pdbDifUnpIdD[idV] += 1

                except Exception as e:
                    # e.g. the identifier content is missing for this entity; the
                    # entity is reported and the loop continues
                    logger.warning("No identifiers for %s with %s", entityKey,
                                   str(e))
            logger.info("Total %d differences %d", totCount, difCount)
            logger.info("Unique UniProt ids  PDB %d  SIFTS %d", len(pdbUnpIdD),
                        len(siftsUnpIdD))
            logger.info("Unique UniProt differences %d ", len(pdbDifUnpIdD))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #17
0
 def testUpdateSelectedEntityContent(self):
     """Test case - update of selected entity reference sequence content

     A fixed extra reference sequence identifier is appended to the identifier list of
     each selected protein entity, and the resulting updates are applied with ObjectUpdater.
     """
     try:
         dbName = "pdbx_core"
         collName = "pdbx_core_polymer_entity"
         refSeqPath = "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"
         extractor = ObjectExtractor(
             self.__cfgOb,
             databaseName=dbName,
             collectionName=collName,
             cacheFilePath=os.path.join(self.__workPath, "entity-selected-content-test-cache.json"),
             useCache=False,
             keyAttribute="entity",
             uniqueAttributes=["rcsb_id"],
             cacheKwargs=self.__testEntryCacheKwargs,
             objectLimit=self.__objectLimitTest,
             selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
             selectionList=["rcsb_id", refSeqPath],
         )
         logger.info("Entity count is %d", extractor.getCount())
         entityD = extractor.getObjects()
         updateL = []
         for entityKey, eD in entityD.items():
             try:
                 containerD = eD["rcsb_polymer_entity_container_identifiers"]
                 if "reference_sequence_identifiers" in containerD:
                     # existing list is extended in place
                     refSeqL = containerD["reference_sequence_identifiers"]
                 else:
                     refSeqL = []
                 refSeqL.append({"database_accession": "1111111", "database_name": "PDB", "provenance_source": "RCSB"})
                 #
                 updateL.append({"selectD": {"rcsb_id": entityKey}, "updateD": {refSeqPath: refSeqL}})
             except Exception as e:
                 logger.exception("Failing with %s", str(e))
         for ii, updD in enumerate(updateL):
             logger.debug(" >>>> (%d) selectD %r updateD %r", ii, updD["selectD"], updD["updateD"])
         #
         updater = ObjectUpdater(self.__cfgOb)
         numUpd = updater.update(dbName, collName, updateL)
         self.assertGreaterEqual(numUpd, len(updateL))
         logger.info("Update count is %d", numUpd)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
    def getProteinSequenceDetails(self, minSeqLen=0):
        """Get protein sequence and taxonomy data (required to build protein sequence fasta file)

        Protein polymer entities are read from the "pdbx_core_polymer_entity" collection of
        the "pdbx_core" database and reduced to their sequence, source organism parts and
        reference sequence alignments.

        Args:
            minSeqLen (int, optional): entities whose canonical one-letter-code sequence is
                shorter than this value are skipped. Defaults to 0 (no length filter).

        Returns:
            (dict, dict): tuple of

                rD: {rcsb_id: {"alignmentL": [...], "sourceOrgL": [...], "partCount": ...,
                               "taxCount": ..., "sequence": ..., "seqLen": ...}, ...}

                missingSrcD: {scientific_name: [rcsb_id, ...], ...} for source organisms
                    that have a scientific name but no NCBI taxonomy identifier.

            Failures are logged rather than raised; whatever was collected before a
            failure is returned.
        """
        missingSrcD = {}
        rD = {}
        # Alignments are reported only for these reference databases; any other database is just logged.
        refDbNames = ("UniProt", "GenBank", "PIR", "EMBL", "NORINE", "PRF")
        try:
            unpEx = UniProtExtractor(self.__cfgOb)
            # Used below as a mapping from accession to a dictionary of extra details
            # (presumably UniProt annotations - confirm against UniProtExtractor).
            unpD = unpEx.getReferenceSequenceDetails()
            #
            obEx = ObjectExtractor(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                useCache=False,
                keyAttribute="entity",
                uniqueAttributes=["rcsb_id"],
                selectionQuery={"entity_poly.rcsb_entity_polymer_type": "Protein"},
                selectionList=[
                    "rcsb_id",
                    "rcsb_entity_source_organism",
                    "rcsb_polymer_entity.rcsb_source_part_count",
                    "rcsb_polymer_entity.rcsb_source_taxonomy_count",
                    "rcsb_polymer_entity.src_method",
                    "entity_poly",
                    "rcsb_polymer_entity_align",
                ],
            )
            #
            eCount = obEx.getCount()
            logger.info("Polymer entity count is %d", eCount)
            objD = obEx.getObjects()
            for rId, eD in objD.items():
                try:
                    seqS = eD["entity_poly"]["pdbx_seq_one_letter_code_can"]
                    seqLen = len(seqS)
                except Exception:
                    # Skip the entity: carrying on would reuse the sequence of the previously
                    # processed entity (or fail outright on the first entity).
                    logger.warning("%s no one-letter-code sequence", rId)
                    continue
                #
                if seqLen < minSeqLen:
                    continue
                #
                srcMethod = None
                try:
                    srcMethod = eD["rcsb_polymer_entity"]["src_method"]
                except (KeyError, TypeError):
                    # The source method is used only in the diagnostic message below.
                    pass
                #
                if "rcsb_entity_source_organism" not in eD:
                    logger.debug("%s No source information (%r) skipping (seqLen %d)", rId, srcMethod, seqLen)
                    continue
                #
                # Reset the per-entity state that is read after the loop (and in the exception
                # handler) so that values never leak in from a previously processed entity.
                sL = []
                tD = None
                srcName = None
                srcType = None
                try:
                    for tD in eD["rcsb_entity_source_organism"]:
                        srcName = tD.get("scientific_name")
                        if "beg_seq_num" in tD and "end_seq_num" in tD:
                            begSeqNum = tD["beg_seq_num"]
                            # Clip the source part to the length of the sequence.
                            endSeqNum = min(tD["end_seq_num"], seqLen)
                        else:
                            begSeqNum = 1
                            endSeqNum = seqLen
                        srcId = tD["pdbx_src_id"]
                        srcType = tD["source_type"]
                        taxId = tD.get("ncbi_taxonomy_id", -1)
                        if srcName and taxId == -1:
                            missingSrcD.setdefault(srcName, []).append(rId)
                        orgName = tD.get("ncbi_scientific_name", "")
                        sL.append(
                            {
                                "srcId": srcId,
                                "taxId": taxId,
                                "orgName": orgName,
                                "entitySeqBeg": begSeqNum,
                                "entitySeqEnd": endSeqNum,
                            }
                        )
                    if len(sL) == 1:
                        # A single source part is made to span the full sequence whatever
                        # range was recorded for it.
                        sL[0]["entitySeqBeg"] = 1
                        sL[0]["entitySeqEnd"] = seqLen
                except Exception as e:
                    logger.exception("Failing for (%r) tD %r with %s", rId, tD, str(e))
                #
                try:
                    partCount = eD["rcsb_polymer_entity"]["rcsb_source_part_count"]
                except Exception:
                    logger.warning("%s no source part count", rId)
                    partCount = 1
                try:
                    taxCount = eD["rcsb_polymer_entity"]["rcsb_source_taxonomy_count"]
                except Exception:
                    # Fall back on the last source part read above (if any).
                    if srcType == "synthetic":
                        taxCount = 0
                    else:
                        logger.warning("%s (srcName %r) no source taxonomy count type %r", rId, srcName, srcType)
                        taxCount = 1 if srcName else 0
                #
                uDL = []
                # Alignment data is optional - a missing key simply yields an empty list.
                if "rcsb_polymer_entity_align" in eD:
                    try:
                        for aD in eD["rcsb_polymer_entity_align"]:
                            dbName = aD["reference_database_name"]
                            if dbName not in refDbNames:
                                logger.info("%s reference database %s", rId, dbName)
                                continue
                            dbAcc = aD["reference_database_accession"]
                            uD = {
                                "refDbId": dbAcc,
                                "refDbName": dbName,
                                "provSource": aD["provenance_source"],
                            }
                            if dbAcc in unpD:
                                uD.update(unpD[dbAcc])
                            aL = []
                            for qD in aD["aligned_regions"]:
                                begSeqId = qD["entity_beg_seq_id"]
                                alignLen = qD["length"]
                                # Clip aligned regions that run past the end of the sequence.
                                if begSeqId + alignLen - 1 > seqLen:
                                    alignLen = seqLen - begSeqId + 1
                                srcId = self.__getSourcePart(rId, sL, begSeqId, alignLen)
                                aL.append(
                                    {
                                        "srcId": srcId,
                                        "entitySeqBeg": begSeqId,
                                        "refSeqBeg": qD["ref_beg_seq_id"],
                                        "length": alignLen,
                                    }
                                )
                            uD["alignList"] = aL
                            uDL.append(uD)
                    except Exception as e:
                        # Best effort: keep the alignments collected so far but leave a trace
                        # of the failure instead of discarding it silently.
                        logger.debug("%s incomplete alignment data (%s)", rId, str(e))
                #
                rD[rId] = {
                    "alignmentL": uDL,
                    "sourceOrgL": sL,
                    "partCount": partCount,
                    "taxCount": taxCount,
                    "sequence": seqS,
                    "seqLen": seqLen,
                }

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return rD, missingSrcD