Ejemplo n.º 1
0
    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query.

        Args:
            databaseName (str): target database name
            collectionName (str): target collection name
            selectionQueryD (dict): query document restricting the selection; a
                falsy value (None or {}) results in an empty query

        Returns:
            list: the result of the fetch performed with the selection list ["_id"],
                or an empty list when the collection does not exist or a failure
                is logged before the fetch completes
        """
        # Bind the result before the try block so the final return cannot raise
        # UnboundLocalError when the collection is missing or an exception is caught.
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName,
                                collectionName,
                                mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName,
                                  collectionName,
                                  selectL,
                                  queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL,
                                len(dL))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL
Ejemplo n.º 2
0
    def update(self, databaseName, collectionName, updateDL):
        """Apply a list of selection/update pairs to a collection.

        Each update is issued with upsertFlag=True.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            updateDL (list): [{"selectD": ..., "updateD": ...}, ...] where
                selectD = {'ky1': 'val1', 'ky2': 'val2',  ...}
                updateD = {'key1.subkey1...': 'val1', 'key2.subkey2..': 'val2', ...}

        Returns:
            int: sum of the values returned by the individual update calls
                (0 when the collection does not exist)
        """
        numUpdated = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(databaseName, collectionName):
                    docCount = mongoUtil.count(databaseName, collectionName)
                    logger.debug("%s %s document count is %d", databaseName, collectionName, docCount)
                    for itemD in updateDL:
                        numUpdated += mongoUtil.update(
                            databaseName,
                            collectionName,
                            itemD["updateD"],
                            itemD["selectD"],
                            upsertFlag=True,
                        )
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numUpdated
Ejemplo n.º 3
0
 def getEntityIds(self, entryIdList):
     """Fetch the polymer entity container identifiers for each input entry.

     Args:
         entryIdList (list): entry identifiers to look up

     Returns:
         dict: {entryId: [rcsb_polymer_entity_container_identifiers, ...]} with one
             list element per fetched entity document; entries processed before a
             logged failure are retained
     """
     dbName = "pdbx_core"
     collectionName = "pdbx_core_polymer_entity"
     docD = {}
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             if mongoUtil.collectionExists(dbName, collectionName):
                 docCount = mongoUtil.count(dbName, collectionName)
                 logger.info("%s %s document count is %d", dbName, collectionName, docCount)
                 for entryId in entryIdList:
                     queryD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                     selectL = ["rcsb_polymer_entity_container_identifiers"]
                     resultL = mongoUtil.fetch(dbName, collectionName, selectL, queryD=queryD)
                     logger.debug("Selection %r fetch result count %d", selectL, len(resultL))
                     identifierL = []
                     for resultD in resultL:
                         identifierL.append(resultD["rcsb_polymer_entity_container_identifiers"])
                     docD[entryId] = identifierL
         logger.debug("docD is %r", docD)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return docD
Ejemplo n.º 4
0
    def getEntryInfo(self, **kwargs):
        """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit).

        Args:
            **kwargs:
                resLimit (float): upper bound applied to refine.0.ls_d_res_high (default 3.5)
                expMethod (str): value matched against rcsb_entry_info.experimental_method (default "X-ray")
                dbName (str): database name (default "pdbx_core")
                collectionName (str): collection name (default "pdbx_core_entry")

        Returns:
            dict: {entryId: {"polymer_composition": ..., "experimental_method": ..., "ls_d_res_high": ...}}
                where each key is present only when the source document provides it
        """
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        dbName = kwargs.get("dbName", "pdbx_core")
        expMethod = kwargs.get("expMethod", "X-ray")
        resLimit = kwargs.get("resLimit", 3.5)
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(dbName, collectionName):
                    docCount = mongoUtil.count(dbName, collectionName)
                    logger.info("%s %s document count is %d", dbName, collectionName, docCount)
                    queryD = {
                        "rcsb_entry_info.experimental_method": expMethod,
                        "refine.0.ls_d_res_high": {"$lte": resLimit},
                    }
                    selectL = ["rcsb_entry_container_identifiers", "rcsb_entry_info", "refine"]
                    resultL = mongoUtil.fetch(dbName, collectionName, selectL, queryD=queryD)
                    logger.info("Selection %r fetch result count %d", selectL, len(resultL))
                    #
                    for resultD in resultL:
                        if "rcsb_entry_container_identifiers" not in resultD:
                            continue
                        entryId = resultD["rcsb_entry_container_identifiers"]["entry_id"]
                        # Register the entry first so it is retained even without further details
                        entryD[entryId] = {}
                        if "rcsb_entry_info" in resultD:
                            infoD = resultD["rcsb_entry_info"]
                            if "polymer_composition" in infoD:
                                entryD[entryId] = {
                                    "polymer_composition": infoD["polymer_composition"],
                                    "experimental_method": infoD["experimental_method"],
                                }
                        refineL = resultD["refine"] if "refine" in resultD else None
                        if refineL and "ls_d_res_high" in refineL[0]:
                            resHigh = refineL[0]["ls_d_res_high"]
                            entryD[entryId]["ls_d_res_high"] = resHigh
                            logger.debug("Got res %r", resHigh)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
Ejemplo n.º 5
0
 def __selectObjects(self, **kwargs):
     """Return a dictionary of objects satisfying the input conditions (e.g. method, resolution limit).

     The identifiers of the selected documents are fetched first and each document
     is then retrieved individually by its "_id".

     Args:
         **kwargs:
             databaseName (str): database name (default "pdbx_core")
             collectionName (str): collection name (default "pdbx_core_entry")
             selectionQuery (dict): query document restricting the selection (default {})
             uniqueAttributes (list): attribute names whose values are joined with "."
                 to form the key of each returned object (default ["rcsb_id"])
             objectLimit (int): stop after this many candidate documents (default None, no limit)
             stripObjectId (bool): remove "_id" from each object rather than
                 converting it to a string (default False)
             logIncrement (int): progress logging interval (default 10000)

     Returns:
         dict: {"<uniqueAttribute values joined by '.'>": object, ...}; objects collected
             before a logged failure are retained
     """
     databaseName = kwargs.get("databaseName", "pdbx_core")
     collectionName = kwargs.get("collectionName", "pdbx_core_entry")
     selectionQueryD = kwargs.get("selectionQuery", {})
     #
     uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
     #
     tV = kwargs.get("objectLimit", None)
     objLimit = int(tV) if tV is not None else None
     stripObjectId = kwargs.get("stripObjectId", False)
     logIncrement = kwargs.get("logIncrement", 10000)
     #
     objectD = {}
     try:
         with Connection(cfgOb=self.__cfgOb,
                         resourceName=self.__resourceName) as client:
             mg = MongoDbUtil(client)
             if mg.collectionExists(databaseName, collectionName):
                 logger.info("%s %s document count is %d", databaseName,
                             collectionName,
                             mg.count(databaseName, collectionName))
                 qD = {}
                 if selectionQueryD:
                     qD.update(selectionQueryD)
                 selectL = ["_id"]
                 # Normalize a None/empty fetch result so that both the count and the
                 # iteration below are safe.
                 dL = mg.fetch(databaseName,
                               collectionName,
                               selectL,
                               queryD=qD) or []
                 numDoc = len(dL)
                 logger.info("Selection %r fetch result count %d", selectL,
                             numDoc)
                 #
                 for ii, dD in enumerate(dL, 1):
                     if "_id" not in dD:
                         continue
                     rObj = mg.fetchOne(databaseName, collectionName, "_id",
                                        dD["_id"])
                     if not rObj:
                         # A missing object must not abort the whole extraction.
                         logger.debug("No object returned for _id %r", dD["_id"])
                         continue
                     if stripObjectId:
                         rObj.pop("_id", None)
                     elif "_id" in rObj:
                         # Stringify the identifier in the returned object
                         rObj["_id"] = str(rObj["_id"])
                     #
                     stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                     objectD[stKey] = copy.copy(rObj)
                     if objLimit and ii >= objLimit:
                         break
                     logger.debug("Saving %d %s", ii, stKey)
                     # Guard the modulo so that logIncrement=0 disables interval logging
                     # instead of raising ZeroDivisionError.
                     if (logIncrement and ii % logIncrement == 0) or ii == numDoc:
                         logger.info("Extracting object (%d of %d)", ii,
                                     numDoc)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return objectD
Ejemplo n.º 6
0
 def count(self, databaseName, collectionName):
     """Return the document count reported for the input collection.

     Args:
         databaseName (str): target database name
         collectionName (str): target collection name

     Returns:
         int: value returned by the count call, or 0 when the collection does not
             exist or a failure is logged
     """
     numTotal = 0
     try:
         with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
             mongoUtil = MongoDbUtil(client)
             if mongoUtil.collectionExists(databaseName, collectionName):
                 numTotal = mongoUtil.count(databaseName, collectionName)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return numTotal
Ejemplo n.º 7
0
    def __select(self, **kwargs):
        """Return a dictionary of object content satisfying the input conditions
        (e.g. method, resolution limit) and selection options.

        Args:
            **kwargs:
                databaseName (str): database name (default "pdbx_core")
                collectionName (str): collection name (default "pdbx_core_entry")
                selectionQuery (dict): query document restricting the selection (default {})
                uniqueAttributes (list): attribute names whose values are joined with "."
                    to form the key of each returned object (default ["rcsb_id"])
                selectionList (list): attributes passed to the fetch as the selection list (default [])
                stripObjectId (bool): remove "_id" from each returned object (default False)
                objectLimit (int): stop after this many objects (default None, no limit)

        Returns:
            dict: {"<uniqueAttribute values joined by '.'>": object, ...}
        """
        stripObjectId = kwargs.get("stripObjectId", False)
        selectL = kwargs.get("selectionList", [])
        uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
        selectionQueryD = kwargs.get("selectionQuery", {})
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        databaseName = kwargs.get("databaseName", "pdbx_core")
        #
        limitValue = kwargs.get("objectLimit", None)
        objLimit = None if limitValue is None else int(limitValue)
        #
        objectD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(databaseName, collectionName):
                    docCount = mongoUtil.count(databaseName, collectionName)
                    logger.info("%s %s document count is %d", databaseName, collectionName, docCount)
                    queryD = {}
                    if selectionQueryD:
                        queryD.update(selectionQueryD)
                    resultL = mongoUtil.fetch(databaseName, collectionName, selectL, queryD=queryD, suppressId=True)
                    logger.info("Selection %r fetch result count %d", selectL, len(resultL))
                    #
                    for num, resultD in enumerate(resultL, 1):
                        # The key is computed before any "_id" removal
                        keyPartL = [resultD[attributeName] for attributeName in uniqueAttributes]
                        objectKey = ".".join(keyPartL)
                        if stripObjectId and resultD and "_id" in resultD:
                            resultD.pop("_id")
                        objectD[objectKey] = copy.copy(resultD)
                        if objLimit and num >= objLimit:
                            break

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return objectD
Ejemplo n.º 8
0
    def delete(self, databaseName, collectionName, selectD):
        """Delete the documents matching the input selection from a collection.

        Args:
            databaseName (str): Target database name
            collectionName (str): Target collection name
            selectD (dict): selection details, {'ky1': 'val1', 'ky2': 'val2',  ...}

        Returns:
            int: value returned by the delete call, or 0 when the collection does
                not exist or a failure is logged
        """
        numDeleted = 0
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mongoUtil = MongoDbUtil(client)
                if mongoUtil.collectionExists(databaseName, collectionName):
                    docCount = mongoUtil.count(databaseName, collectionName)
                    logger.info("%s %s document count is %d", databaseName, collectionName, docCount)
                    numDeleted = mongoUtil.delete(databaseName, collectionName, selectD)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return numDeleted
Ejemplo n.º 9
0
    def __selectPolymerEntities(self, entryD, **kwargs):
        """Skeleton entity selector recovering essential biological sequence mapping features
        for macromolecules (default type = protein).

        For each entry in entryD that does not already contain the result key, the matching
        polymer entity documents are fetched and a reduced dictionary for each entity is stored
        in entryD[entryId][resultKey][entityId].  In addition to the keys shown in the example
        below, each entity dictionary carries "original_taxonomy_ids" and, when entity_poly is
        present in the source document, "entity_polymer_length".

        Args:
            entryD (dict): dictionary keyed by entry identifier; updated in place
            **kwargs:
                dbName (str): database name (default "pdbx_core")
                collectionName (str): collection name (default "pdbx_core_polymer_entity")
                resultKey (str): key under which the entity selection is stored for each
                    entry (default "selected_polymer_entities")
                entryLimit (int): stop after processing this many entries (default None, no limit)
                entitySelectionQuery (dict): additional query conditions
                    (default {"entity_poly.rcsb_entity_polymer_type": "Protein"})

        Returns:
            dict: the input entryD including the entity selections added before any logged failure

        Example of an updated entry:

         "1CP9": {
             "polymer_entity_ids": [
                "1",
                "2"
             ],
             "selected_polymer_entities": {
                "1": {
                   "rcsb_multiple_source_flag": "N",
                   "asym_ids": [
                      "A"
                   ],
                   "auth_asym_ids": [
                      "A"
                   ],
                   "entity_id": "1",
                   "type": "polypeptide(L)",
                   "rcsb_entity_polymer_type": "Protein",
                   "rcsb_entity_source_organism": [
                      {
                         "ncbi_taxonomy_id": 587,
                         "beg_seq_num": 1,
                         "end_seq_num": 205,
                         "ncbi_scientific_name": "Providencia rettgeri"
                      }
                   ],
                   "struct_ref": [
                      {
                         "id": "1",
                         "db_name": "UNP",
                         "pdbx_db_accession": "Q7WZI9",
                         "entity_id": "1",
                         "pdbx_seq_one_letter_code": "QSTQIKIERDNYGVPHIYANDTYSLFYGYGYA...",
                         "alignD": {
                            "A": [
                               {
                                  "align_id": "1",
                                  "ref_id": "1",
                                  "pdbx_PDB_id_code": "1CP9",
                                  "pdbx_strand_id": "A",
                                  "seq_align_beg": 1,
                                  "seq_align_end": 205,
                                  "pdbx_db_accession": "Q7WZI9",
                                  "db_align_beg": 24,
                                  "db_align_end": 228,
                                  "pdbx_auth_seq_align_beg": "1",
                                  "pdbx_auth_seq_align_end": "205",
                                  "rcsb_entity_id": "1"
                               }
                            ]
                         }
                      }
                   ]
                },
            "2": {
                   "rcsb_multiple_source_flag": "N",
                   "asym_ids": [
                      "B"
                   ],
                   "auth_asym_ids": [
                      "B"
                   ],
                   "entity_id": "2",
                   "type": "polypeptide(L)",
                   "rcsb_entity_polymer_type": "Protein",
                   "rcsb_entity_source_organism": [
                      {
                         "ncbi_taxonomy_id": 587,
                         "beg_seq_num": 1,
                         "end_seq_num": 553,
                         "ncbi_scientific_name": "Providencia rettgeri"
                      }
                   ],
                   "struct_ref": [
                      {
                         "id": "2",
                         "db_name": "UNP",
                         "pdbx_db_accession": "Q7WZI9",
                         "entity_id": "2",
                         "pdbx_seq_one_letter_code": "SNVWLVGKTKASGAKAILLNGPQFGWFNPAYTYGIGLHG",
                         "alignD": {
                            "B": [
                               {
                                  "align_id": "2",
                                  "ref_id": "2",
                                  "pdbx_PDB_id_code": "1CP9",
                                  "pdbx_strand_id": "B",
                                  "seq_align_beg": 1,
                                  "seq_align_end": 553,
                                  "pdbx_db_accession": "Q7WZI9",
                                  "db_align_beg": 285,
                                  "db_align_end": 837,
                                  "pdbx_auth_seq_align_beg": "1",
                                  "pdbx_auth_seq_align_end": "553",
                                  "rcsb_entity_id": "2"
                               }
                            ]
                         }
                      }
                   ]
                }
             }
           },

        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")

        entryLimit = kwargs.get("entryLimit", None)
        selectionQueryD = kwargs.get("entitySelectionQuery", {"entity_poly.rcsb_entity_polymer_type": "Protein"})
        #
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity.rcsb_multiple_source_flag",
                        "entity_poly.type",
                        "entity_poly.rcsb_entity_polymer_type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "rcsb_entity_source_organism.beg_seq_num",
                        "rcsb_entity_source_organism.end_seq_num",
                        "struct_ref.id",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_align_begin",
                        "struct_ref_seq",
                        #
                        "entity_src_nat.pdbx_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id",
                        "entity_src_gen.pdbx_host_org_ncbi_taxonomy_id",
                        "pdbx_entity_src_syn.ncbi_taxonomy_id",
                    ]
                    iCount = 0
                    for entryId in entryD:
                        # Entries that already carry a selection are left untouched
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                        qD.update(selectionQueryD)
                        #
                        dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                        logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                        eD = {}
                        for ii, dD in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dD)
                            if "entity" in dD:
                                rD["rcsb_multiple_source_flag"] = dD["entity"].get("rcsb_multiple_source_flag", "N")
                            #
                            if "rcsb_polymer_entity_container_identifiers" in dD:
                                # Read from the same key that is tested and selected above; the
                                # previous lookup of "rcsb_entity_container_identifiers" raised
                                # KeyError because that key is never part of the selection.
                                idD = dD["rcsb_polymer_entity_container_identifiers"]
                                rD["asym_ids"] = idD.get("asym_ids", [])
                                rD["auth_asym_ids"] = idD.get("auth_asym_ids", [])
                                # Entities lacking an identifier are dropped by the check at the end of the loop
                                if "entity_id" in idD:
                                    rD["entity_id"] = idD["entity_id"]
                            #
                            if "entity_poly" in dD:
                                polyD = dD["entity_poly"]
                                rD["type"] = polyD.get("type")
                                rD["rcsb_entity_polymer_type"] = polyD.get("rcsb_entity_polymer_type")
                                rD["entity_polymer_length"] = len(polyD["pdbx_seq_one_letter_code_can"]) if "pdbx_seq_one_letter_code_can" in polyD else 0
                            #
                            rD["rcsb_entity_source_organism"] = list(dD["rcsb_entity_source_organism"]) if "rcsb_entity_source_organism" in dD else []
                            #
                            refDL = []
                            if "struct_ref" in dD:
                                for tD in dD["struct_ref"]:
                                    if "db_name" in tD:
                                        # Normalize the database name and fold TREMBL into UNP
                                        tD["db_name"] = str(tD["db_name"]).upper().strip()
                                        tD["db_name"] = "UNP" if tD["db_name"] in ["TREMBL"] else tD["db_name"]
                                    refDL.append(tD)
                                if "struct_ref_seq" in dD:
                                    # Attach to each reference its alignments organized by strand id
                                    for refD in refDL:
                                        refId = refD["id"]
                                        for alignD in dD["struct_ref_seq"]:
                                            if refId == alignD["ref_id"]:
                                                authAsymId = alignD["pdbx_strand_id"]
                                                refD.setdefault("alignD", {}).setdefault(authAsymId, []).append(alignD)

                            rD["struct_ref"] = refDL
                            #
                            taxIdL = []
                            if "entity_src_nat" in dD:
                                for tD in dD["entity_src_nat"]:
                                    if "pdbx_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_ncbi_taxonomy_id"])
                            if "entity_src_gen" in dD:
                                for tD in dD["entity_src_gen"]:
                                    if "pdbx_gene_src_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_gene_src_ncbi_taxonomy_id"])
                                    if "pdbx_host_org_ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["pdbx_host_org_ncbi_taxonomy_id"])
                            if "pdbx_entity_src_syn" in dD:
                                for tD in dD["pdbx_entity_src_syn"]:
                                    if "ncbi_taxonomy_id" in tD:
                                        taxIdL.append(tD["ncbi_taxonomy_id"])
                            qL = []
                            for taxId in taxIdL:
                                # Values may be comma separated lists; str() keeps a non-string
                                # value from aborting the whole selection.
                                qL.extend([int(t.strip()) for t in str(taxId).split(",") if t.strip().isdigit()])
                            logger.debug("TaxId list %r", qL)
                            rD["original_taxonomy_ids"] = list(set(qL))
                            #
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = rD

                        entryD[entryId][resultKey] = eD

                        iCount += 1
                        if iCount % 1000 == 0:
                            logger.info("Completed fetch %d/%d entries", iCount, len(entryD))
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return entryD
Ejemplo n.º 10
0
    def __selectEntries(self, **kwargs):
        """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit).

        Only documents providing both an entry identifier and a non-empty list of polymer
        entity identifiers are included in the result.

        Args:
            **kwargs:
                dbName (str): database name (default "pdbx_core")
                collectionName (str): collection name (default "pdbx_core_entry")
                entrySelectionQuery (dict): query document restricting the selection (default {})

        Returns:
            dict: {entryId: {"polymer_entity_ids": [...]}, ...}
        """

        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("entrySelectionQuery", {})
        #
        entryD = {}
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        # Apply the caller's selection; updating qD with itself was a no-op
                        # that silently dropped the query.
                        qD.update(selectionQueryD)
                    selectL = ["rcsb_entry_container_identifiers"]
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))
                    #
                    for dD in dL:
                        idD = dD.get("rcsb_entry_container_identifiers")
                        if not idD:
                            continue
                        # Keep only entries with an identifier and at least one polymer entity
                        if "entry_id" in idD and idD.get("polymer_entity_ids"):
                            entryD[idD["entry_id"]] = {"polymer_entity_ids": idD["polymer_entity_ids"]}

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
Ejemplo n.º 11
0
    def testSingleIndexSelect(self):
        """Test case - create a collection with a simple single index, insert a document list,
        read back and compare each document, rebuild the index, and check fetch counts
        using a second connection.
        """
        try:
            logger.debug("Starting testSingleIndexSelect")
            nDocs = 100
            keyName = "DOC_ID"
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                self.assertTrue(mg.createCollection(self.__dbName, self.__collectionName))
                self.assertTrue(mg.databaseExists(self.__dbName))
                self.assertTrue(mg.collectionExists(self.__dbName, self.__collectionName))
                #
                # The index is created before any document is inserted
                self.assertTrue(
                    mg.createIndex(
                        self.__dbName,
                        self.__collectionName,
                        keyList=["DOC_ID"],
                        indexName="primary",
                        indexType="DESCENDING",
                        uniqueFlag=True,
                    )
                )

                dList = [self.__makeDataObj(2, 5, 5, ii) for ii in range(nDocs)]
                #
                rIdL = mg.insertList(self.__dbName, self.__collectionName, dList, keyNames=[keyName], salvage=True)
                self.assertEqual(len(dList), len(rIdL))
                #
                # Read each document back by key and compare it with the inserted
                # content, ignoring "_id" on both sides
                for ii, dObj in enumerate(dList):
                    rObj = mg.fetchOne(self.__dbName, self.__collectionName, "DOC_ID", "DOC_%d" % ii)
                    rObj.pop("_id", None)
                    dObj.pop("_id", None)
                    self.assertEqual(len(dObj), len(rObj))
                    self.assertEqual(dObj, rObj)
                #
                # Drop, recreate and rebuild the index
                self.assertTrue(mg.dropIndex(self.__dbName, self.__collectionName, indexName="primary"))
                self.assertTrue(
                    mg.createIndex(
                        self.__dbName,
                        self.__collectionName,
                        keyList=["DOC_ID"],
                        indexName="primary",
                        indexType="DESCENDING",
                        uniqueFlag=True,
                    )
                )
                self.assertTrue(mg.reIndex(self.__dbName, self.__collectionName))
                #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                collectionLength = mg.count(self.__dbName, self.__collectionName)
                logger.debug("collection length %d", collectionLength)
                #
                fetchL = mg.fetch(self.__dbName, self.__collectionName, ["DOC_ID"])
                self.assertEqual(len(fetchL), nDocs)
                logger.debug("Fetch length %d", len(fetchL))
                for num, fetchD in enumerate(fetchL):
                    logger.debug("Fetch num %d: %r", num, fetchD)
                #
                fetchL = mg.fetch(
                    self.__dbName,
                    self.__collectionName,
                    ["category_0.attribute_0"],
                    queryD={"category_0.attribute_0": "val_0_0"},
                )
                self.assertEqual(len(fetchL), nDocs)
                logger.debug("Fetch length %d", len(fetchL))
                for num, fetchD in enumerate(fetchL):
                    logger.debug("Fetch num %d: %r", num, fetchD)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
0
    def getEntityInstances(self, entryD, **kwargs):
        """Add per-instance validation summaries to the selected polymer entities of each entry.

        For each entity in entryD[entryId]['selected_polymer_entities'], the
        'pdbx_vrpt_instance_results' category is fetched for every instance listed
        in the entity's 'asym_ids', reduced to the keys used downstream, and passed
        to self.analEntity().  The analysis result is stored in place as:

            entryD[entryId]['selected_polymer_entities'][entityId]['anal_instances']

        Entries that carry no 'selected_polymer_entities' key are counted but
        otherwise left untouched.

        Args:
            entryD (dict): entry dictionary keyed by entry identifier (updated in place)
            **kwargs: optional settings:
                dbName (str): database name (default "pdbx_core")
                collectionName (str): collection name
                    (default "pdbx_core_polymer_entity_instance")
                savePath (str): export path for entryD (default "entry-data.pic")
                saveKwargs (dict): keyword arguments passed to the exporter
                    (default {"fmt": "pickle"})
                entryLimit (int): stop after this many entries (default None, no limit)

        Returns:
            dict: the input entryD; failures are logged rather than raised.
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName",
                                    "pdbx_core_polymer_entity_instance")
        savePath = kwargs.get("savePath", "entry-data.pic")
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        entryLimit = kwargs.get("entryLimit", None)
        #
        try:
            # Development switch: when enabled the unreduced validation results and
            # a reduced list of unobserved residues are also retained per instance.
            optF = False
            iCount = 0
            # The projection is the same for every instance query.
            selectL = ["pdbx_vrpt_instance_results"]
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s total document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    #
                    for entryId, entryValD in entryD.items():
                        # Entries without selected entities (e.g. not yet processed
                        # upstream) are skipped instead of aborting the whole run.
                        peDD = entryValD.get("selected_polymer_entities", {})
                        if not peDD:
                            logger.debug(
                                "No selected polymer entities for %s",
                                entryId)
                        for entityId, peD in peDD.items():
                            vD = {}
                            for asymId in peD["asym_ids"]:
                                qD = {
                                    "rcsb_polymer_entity_instance_container_identifiers.entry_id":
                                    entryId,
                                    "rcsb_polymer_entity_instance_container_identifiers.asym_id":
                                    asymId,
                                }
                                tL = mg.fetch(dbName,
                                              collectionName,
                                              selectL,
                                              queryD=qD)
                                if not tL:
                                    logger.info(
                                        "No validation data for %s %s %s(%s)",
                                        dbName, collectionName, entryId,
                                        asymId)
                                    continue
                                # Only the first matching document is used.
                                instD = tL[0]
                                logger.debug(
                                    ">>> %s %s (%s) dict key length %d ",
                                    collectionName, entryId, asymId,
                                    len(instD))
                                #
                                # Distinct name: must not clobber the outer loop value.
                                valD = {}
                                if optF:
                                    valD["pdbx_vrpt_instance_results"] = instD.get(
                                        "pdbx_vrpt_instance_results", [])
                                    urdL = instD.get(
                                        "pdbx_unobs_or_zero_occ_residues", [])
                                    valD["pdbx_unobs_or_zero_occ_residues"] = [{
                                        "label_seq_id": urd["label_seq_id"],
                                        "label_comp_id": urd["label_comp_id"]
                                    } for urd in urdL]
                                #
                                # Sequence positions of all residues with validation data.
                                try:
                                    irdL = instD.get(
                                        "pdbx_vrpt_instance_results", [])
                                    valD["pdbx_vrpt_instance_results_seq"] = [{
                                        "label_seq_id": ird["label_seq_id"],
                                        "label_comp_id": ird["label_comp_id"]
                                    } for ird in irdL]
                                except Exception as e:
                                    logger.error(
                                        "Failing with entryId %s entityId %s asymId %s bad validation data %s",
                                        entryId, entityId, asymId, str(e))

                                #
                                # Same residues including OWAB; a record lacking any key
                                # drops the whole list, logged at debug level only.
                                try:
                                    irdL = instD.get(
                                        "pdbx_vrpt_instance_results", [])
                                    valD["pdbx_vrpt_instance_results_occ"] = [{
                                        "OWAB": ird["OWAB"],
                                        "label_seq_id": ird["label_seq_id"],
                                        "label_comp_id": ird["label_comp_id"]
                                    } for ird in irdL]
                                except Exception as e:
                                    logger.debug(
                                        "Failing with entryId %s entityId %s asymId %s bad validation data %s",
                                        entryId, entityId, asymId, str(e))

                                vD[asymId] = valD
                                #
                            analD = self.analEntity(entryId, peD, vD)
                            # peD is the entity dictionary stored in entryD, so this
                            # updates entryD in place.
                            peD["anal_instances"] = copy.copy(analD)
                        iCount += 1
                        if iCount % 500 == 0:
                            logger.info("Completed %d/%d entries", iCount,
                                        len(entryD))
                        # Periodic checkpoint of the accumulated results.
                        if iCount % 2000 == 0:
                            ok = self.__mU.doExport(savePath, entryD,
                                                    **saveKwargs)
                            logger.info(
                                "Saved polymer entity instance results (%d) status %r in %s",
                                iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            break
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info(
                "Saved polymer instance results (%d) entries %d status %r in %s",
                iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD
Ejemplo n.º 13
0
    def getPolymerEntities(self, entryD, **kwargs):
        """Add selected polymer entity details to each entry of the input entry dictionary.

        For every entry lacking the result key, protein polymer entities with a
        single source (entity.rcsb_multiple_source_flag == "N") are fetched and
        summarized as entryD[entryId][resultKey] = {entityId: {...}, ...}.
        Entries that already contain the result key are skipped and not counted.

        Args:
            entryD (dict): entry dictionary keyed by entry identifier (updated in place)
            **kwargs: optional settings:
                dbName (str): database name (default "pdbx_core")
                collectionName (str): collection name (default "pdbx_core_polymer_entity")
                resultKey (str): key used to store the results
                    (default "selected_polymer_entities")
                savePath (str): export path for entryD (default "entry-data.pic")
                saveKwargs (dict): keyword arguments passed to the exporter
                    (default {"fmt": "pickle"})
                entryLimit (int): stop after this many processed entries
                    (default None, no limit)

        Returns:
            dict: the input entryD; failures are logged rather than raised.
        """
        dbName = kwargs.get("dbName", "pdbx_core")
        collectionName = kwargs.get("collectionName",
                                    "pdbx_core_polymer_entity")
        resultKey = kwargs.get("resultKey", "selected_polymer_entities")
        savePath = kwargs.get("savePath", "entry-data.pic")
        entryLimit = kwargs.get("entryLimit", None)
        saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
        #
        try:
            # Bound up front so the final summary log cannot fail with an unbound
            # name when the collection does not exist.
            iCount = 0
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(dbName, collectionName):
                    logger.info("%s %s document count is %d",
                                dbName, collectionName,
                                mg.count(dbName, collectionName))
                    selectL = [
                        "rcsb_polymer_entity_container_identifiers",
                        "entity_poly.type",
                        "entity_poly.pdbx_seq_one_letter_code_can",
                        "rcsb_entity_source_organism.ncbi_taxonomy_id",
                        "rcsb_entity_source_organism.ncbi_scientific_name",
                        "struct_ref.pdbx_seq_one_letter_code",
                        "struct_ref.pdbx_db_accession",
                        "struct_ref.db_name",
                        "struct_ref.entity_id",
                    ]
                    for entryId in entryD:
                        # Already processed (e.g. reloaded from a checkpoint).
                        if resultKey in entryD[entryId]:
                            continue
                        #
                        qD = {
                            "rcsb_polymer_entity_container_identifiers.entry_id":
                            entryId,
                            "entity_poly.rcsb_entity_polymer_type": "Protein",
                            "entity.rcsb_multiple_source_flag": "N",
                        }
                        #
                        dL = mg.fetch(dbName,
                                      collectionName,
                                      selectL,
                                      queryD=qD)
                        logger.debug("%s query %r fetch result count %d",
                                     entryId, qD, len(dL))
                        eD = {}
                        for ii, dV in enumerate(dL, 1):
                            rD = {}
                            logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                            idD = dV.get(
                                "rcsb_polymer_entity_container_identifiers",
                                {})
                            if "asym_ids" in idD:
                                rD["asym_ids"] = idD["asym_ids"]
                                rD["entity_id"] = idD["entity_id"]
                            polyD = dV.get("entity_poly", {})
                            if "type" in polyD:
                                rD["type"] = polyD["type"]
                                # Missing canonical sequence is recorded as None, like
                                # the other optional fields below.
                                rD["seq_one_letter_code_can"] = polyD.get(
                                    "pdbx_seq_one_letter_code_can")

                            if "rcsb_entity_source_organism" in dV:
                                # Only the first source organism is used; an empty
                                # list yields None values instead of an IndexError.
                                orgL = dV["rcsb_entity_source_organism"]
                                orgD = orgL[0] if orgL else {}
                                rD["ncbi_taxonomy_id"] = orgD.get(
                                    "ncbi_taxonomy_id")
                                rD["ncbi_scientific_name"] = orgD.get(
                                    "ncbi_scientific_name")

                            # Reference details are kept only for an unambiguous
                            # (single) sequence reference.
                            if "struct_ref" in dV and len(
                                    dV["struct_ref"]) == 1:
                                refD = dV["struct_ref"][0]
                                rD["seq_one_letter_code_ref"] = refD.get(
                                    "pdbx_seq_one_letter_code")
                                rD["db_accession"] = refD.get(
                                    "pdbx_db_accession")
                                rD["db_name"] = refD.get("db_name")
                                #
                                refDbName = rD["db_name"]
                                dbAccession = rD["db_accession"]
                                dbRefSeq = self.__seqCache[
                                    dbAccession] if dbAccession in self.__seqCache else None

                                # Fetch and cache the reference sequence on a cache miss.
                                if refDbName in ["UNP"] and not dbRefSeq:
                                    dbRefSeq = self.__fetchUniprot(dbAccession)
                                    self.__seqCache[dbAccession] = dbRefSeq
                                    logger.debug("Fetch uniprot %r", dbRefSeq)
                                rD["ref_db_seq"] = dbRefSeq
                            else:
                                rD["seq_one_letter_code_ref"] = rD[
                                    "db_accession"] = rD["db_name"] = None
                            #
                            # Documents without identifiers cannot be keyed and are dropped.
                            if "entity_id" in rD:
                                eD[rD["entity_id"]] = copy.copy(rD)

                        entryD[entryId][resultKey] = copy.copy(eD)

                        iCount += 1
                        if iCount % 10 == 0:
                            logger.info(
                                "Completed polymer entities fetch %d/%d entries",
                                iCount, len(entryD))
                        # Periodic checkpoint of the accumulated results.
                        if iCount % 2000 == 0:
                            ok = self.__mU.doExport(savePath, entryD,
                                                    **saveKwargs)
                            logger.info(
                                "Saved polymer entity results (%d) status %r in %s",
                                iCount, ok, savePath)
                        if entryLimit and iCount >= entryLimit:
                            logger.info("Quitting after %d", iCount)
                            break
            #
            ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
            logger.info(
                "Saved polymer entity results (%d) entries %d status %r in %s",
                iCount, len(entryD), ok, savePath)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return entryD