def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
    """Return a list of object identifiers for the input selection query."""
    dL = []
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(databaseName, collectionName):
                logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                qD = {}
                if selectionQueryD:
                    qD.update(selectionQueryD)
                selectL = ["_id"]
                dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                logger.info("Selection %r fetch result count %d", selectL, len(dL))
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return dL
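# Illustrative sketch (not part of the extraction API): selection queries are plain
# MongoDB query documents, so nested attributes are addressed with dot notation and
# comparisons use Mongo operators. The attribute names below follow the pdbx_core
# schema used throughout this module; the limit values are arbitrary examples.
def _exampleSelectionQueryD(expMethod="X-ray", resLimit=2.0):
    """Build a hypothetical selection query: X-ray entries at 2.0 A resolution or better."""
    return {
        "rcsb_entry_info.experimental_method": expMethod,
        # "refine" is a list category; "refine.0" addresses its first element
        "refine.0.ls_d_res_high": {"$lte": resLimit},
    }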
def __selectEntries(self, **kwargs):
    """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit)"""
    dbName = kwargs.get("dbName", "pdbx_core")
    collectionName = kwargs.get("collectionName", "pdbx_core_entry")
    selectionQueryD = kwargs.get("entrySelectionQuery", {})
    #
    entryD = {}
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(dbName, collectionName):
                logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                qD = {}
                if selectionQueryD:
                    qD.update(selectionQueryD)
                selectL = ["rcsb_entry_container_identifiers"]
                dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                logger.info("Selection %r fetch result count %d", selectL, len(dL))
                #
                for dD in dL:
                    #
                    if (
                        ("rcsb_entry_container_identifiers" in dD)
                        and ("entry_id" in dD["rcsb_entry_container_identifiers"])
                        and ("polymer_entity_ids" in dD["rcsb_entry_container_identifiers"])
                        and dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]
                    ):
                        entryD[dD["rcsb_entry_container_identifiers"]["entry_id"]] = {"polymer_entity_ids": dD["rcsb_entry_container_identifiers"]["polymer_entity_ids"]}
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return entryD
def getEntityIds(self, entryIdList):
    """Return a dictionary of polymer entity container identifiers for the input entry identifier list."""
    dbName = "pdbx_core"
    collectionName = "pdbx_core_polymer_entity"
    docD = {}
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(dbName, collectionName):
                logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                for entryId in entryIdList:
                    qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                    selectL = ["rcsb_polymer_entity_container_identifiers"]
                    tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    #
                    logger.debug("Selection %r fetch result count %d", selectL, len(tL))
                    docD[entryId] = [vv["rcsb_polymer_entity_container_identifiers"] for vv in tL]
        logger.debug("docD is %r", docD)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return docD
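# Illustrative sketch of the mapping returned by getEntityIds(). "extractor" is a
# hypothetical instance of this class and the entry ids are placeholder examples.
def _exampleGetEntityIds(extractor):
    docD = extractor.getEntityIds(["4HHB", "1KIP"])
    for entryId, identifierL in docD.items():
        for cD in identifierL:
            # each cD is one rcsb_polymer_entity_container_identifiers document
            logger.info("%s entity %s asym_ids %r", entryId, cD.get("entity_id"), cD.get("asym_ids", []))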
def getEntryInfo(self, **kwargs):
    """Return a dictionary of PDB entries satisfying the input conditions (e.g. method, resolution limit)"""
    resLimit = kwargs.get("resLimit", 3.5)
    expMethod = kwargs.get("expMethod", "X-ray")
    #
    dbName = kwargs.get("dbName", "pdbx_core")
    collectionName = kwargs.get("collectionName", "pdbx_core_entry")
    #
    entryD = {}
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(dbName, collectionName):
                logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                qD = {"rcsb_entry_info.experimental_method": expMethod, "refine.0.ls_d_res_high": {"$lte": resLimit}}
                selectL = ["rcsb_entry_container_identifiers", "rcsb_entry_info", "refine"]
                dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                logger.info("Selection %r fetch result count %d", selectL, len(dL))
                #
                for dV in dL:
                    if "rcsb_entry_container_identifiers" not in dV:
                        continue
                    entryId = dV["rcsb_entry_container_identifiers"]["entry_id"]
                    entryD[entryId] = {}
                    if "rcsb_entry_info" in dV and "polymer_composition" in dV["rcsb_entry_info"]:
                        entryD[entryId] = {
                            "polymer_composition": dV["rcsb_entry_info"]["polymer_composition"],
                            "experimental_method": dV["rcsb_entry_info"]["experimental_method"],
                        }
                    if "refine" in dV and dV["refine"] and "ls_d_res_high" in dV["refine"][0]:
                        entryD[entryId]["ls_d_res_high"] = dV["refine"][0]["ls_d_res_high"]
                        logger.debug("Got res %r", dV["refine"][0]["ls_d_res_high"])
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return entryD
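# Illustrative sketch: post-filter the getEntryInfo() result in memory. The keys
# ("ls_d_res_high", "polymer_composition") are those populated above; the tighter
# limit here is an arbitrary example value.
def _exampleFilterByResolution(entryD, resLimit=2.0):
    return {
        entryId: eD
        for entryId, eD in entryD.items()
        if eD.get("ls_d_res_high") is not None and eD["ls_d_res_high"] <= resLimit
    }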
def __selectObjects(self, **kwargs):
    """Return a dictionary of objects satisfying the input conditions (e.g. method, resolution limit)"""
    databaseName = kwargs.get("databaseName", "pdbx_core")
    collectionName = kwargs.get("collectionName", "pdbx_core_entry")
    selectionQueryD = kwargs.get("selectionQuery", {})
    #
    uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
    #
    tV = kwargs.get("objectLimit", None)
    objLimit = int(tV) if tV is not None else None
    stripObjectId = kwargs.get("stripObjectId", False)
    logIncrement = kwargs.get("logIncrement", 10000)
    #
    objectD = {}
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(databaseName, collectionName):
                logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                qD = {}
                if selectionQueryD:
                    qD.update(selectionQueryD)
                selectL = ["_id"]
                dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                numDoc = len(dL) if dL else 0
                logger.info("Selection %r fetch result count %d", selectL, numDoc)
                #
                for ii, dD in enumerate(dL, 1):
                    if "_id" not in dD:
                        continue
                    rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                    if not rObj:
                        continue
                    if stripObjectId and "_id" in rObj:
                        rObj.pop("_id")
                    else:
                        rObj["_id"] = str(rObj["_id"])
                    #
                    stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                    objectD[stKey] = copy.copy(rObj)
                    if objLimit and ii >= objLimit:
                        break
                    logger.debug("Saving %d %s", ii, stKey)
                    if ii % logIncrement == 0 or ii == numDoc:
                        logger.info("Extracting object (%d of %d)", ii, numDoc)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return objectD
def __select(self, **kwargs):
    """Return a dictionary of object content satisfying the input conditions (e.g. method, resolution limit) and selection options."""
    databaseName = kwargs.get("databaseName", "pdbx_core")
    collectionName = kwargs.get("collectionName", "pdbx_core_entry")
    selectionQueryD = kwargs.get("selectionQuery", {})
    uniqueAttributes = kwargs.get("uniqueAttributes", ["rcsb_id"])
    selectL = kwargs.get("selectionList", [])
    stripObjectId = kwargs.get("stripObjectId", False)
    #
    tV = kwargs.get("objectLimit", None)
    objLimit = int(tV) if tV is not None else None
    #
    objectD = {}
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(databaseName, collectionName):
                logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                qD = {}
                if selectionQueryD:
                    qD.update(selectionQueryD)
                dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD, suppressId=True)
                logger.info("Selection %r fetch result count %d", selectL, len(dL))
                #
                for ii, rObj in enumerate(dL, 1):
                    stKey = ".".join([rObj[ky] for ky in uniqueAttributes])
                    if stripObjectId and rObj and "_id" in rObj:
                        rObj.pop("_id")
                    objectD[stKey] = copy.copy(rObj)
                    if objLimit and ii >= objLimit:
                        break
                    # logger.debug("Saving %d %s", ii, stKey)
                    # logger.debug("Current objectD keys %r", list(objectD.keys()))
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return objectD
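# Illustrative sketch: both __selectObjects() and __select() key their results by
# joining the values of 'uniqueAttributes' with "."; for the default ["rcsb_id"]
# the key is simply the rcsb_id value (e.g. "4HHB" or "4HHB_1").
def _exampleUniqueKey(rObj, uniqueAttributes=("rcsb_id",)):
    return ".".join([str(rObj[ky]) for ky in uniqueAttributes])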
def __selectPolymerEntities(self, entryD, **kwargs):
    """Skeleton entity selector recovering essential biological sequence mapping features
    for macromolecules (default type = protein).

    "1CP9": {
        "polymer_entity_ids": ["1", "2"],
        "selected_polymer_entities": {
            "1": {
                "rcsb_multiple_source_flag": "N",
                "asym_ids": ["A"],
                "auth_asym_ids": ["A"],
                "entity_id": "1",
                "type": "polypeptide(L)",
                "rcsb_entity_polymer_type": "Protein",
                "rcsb_entity_source_organism": [
                    {"ncbi_taxonomy_id": 587, "beg_seq_num": 1, "end_seq_num": 205, "ncbi_scientific_name": "Providencia rettgeri"}
                ],
                "struct_ref": [
                    {
                        "id": "1",
                        "db_name": "UNP",
                        "pdbx_db_accession": "Q7WZI9",
                        "entity_id": "1",
                        "pdbx_seq_one_letter_code": "QSTQIKIERDNYGVPHIYANDTYSLFYGYGYA...",
                        "alignD": {
                            "A": [
                                {
                                    "align_id": "1",
                                    "ref_id": "1",
                                    "pdbx_PDB_id_code": "1CP9",
                                    "pdbx_strand_id": "A",
                                    "seq_align_beg": 1,
                                    "seq_align_end": 205,
                                    "pdbx_db_accession": "Q7WZI9",
                                    "db_align_beg": 24,
                                    "db_align_end": 228,
                                    "pdbx_auth_seq_align_beg": "1",
                                    "pdbx_auth_seq_align_end": "205",
                                    "rcsb_entity_id": "1",
                                }
                            ]
                        },
                    }
                ],
            },
            "2": {
                "rcsb_multiple_source_flag": "N",
                "asym_ids": ["B"],
                "auth_asym_ids": ["B"],
                "entity_id": "2",
                "type": "polypeptide(L)",
                "rcsb_entity_polymer_type": "Protein",
                "rcsb_entity_source_organism": [
                    {"ncbi_taxonomy_id": 587, "beg_seq_num": 1, "end_seq_num": 553, "ncbi_scientific_name": "Providencia rettgeri"}
                ],
                "struct_ref": [
                    {
                        "id": "2",
                        "db_name": "UNP",
                        "pdbx_db_accession": "Q7WZI9",
                        "entity_id": "2",
                        "pdbx_seq_one_letter_code": "SNVWLVGKTKASGAKAILLNGPQFGWFNPAYTYGIGLHG",
                        "alignD": {
                            "B": [
                                {
                                    "align_id": "2",
                                    "ref_id": "2",
                                    "pdbx_PDB_id_code": "1CP9",
                                    "pdbx_strand_id": "B",
                                    "seq_align_beg": 1,
                                    "seq_align_end": 553,
                                    "pdbx_db_accession": "Q7WZI9",
                                    "db_align_beg": 285,
                                    "db_align_end": 837,
                                    "pdbx_auth_seq_align_beg": "1",
                                    "pdbx_auth_seq_align_end": "553",
                                    "rcsb_entity_id": "2",
                                }
                            ]
                        },
                    }
                ],
            },
        },
    },
    """
    dbName = kwargs.get("dbName", "pdbx_core")
    collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
    resultKey = kwargs.get("resultKey", "selected_polymer_entities")
    entryLimit = kwargs.get("entryLimit", None)
    selectionQueryD = kwargs.get("entitySelectionQuery", {"entity_poly.rcsb_entity_polymer_type": "Protein"})
    #
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(dbName, collectionName):
                logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                selectL = [
                    "rcsb_polymer_entity_container_identifiers",
                    "entity.rcsb_multiple_source_flag",
                    "entity_poly.type",
                    "entity_poly.rcsb_entity_polymer_type",
                    "entity_poly.pdbx_seq_one_letter_code_can",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    "rcsb_entity_source_organism.ncbi_scientific_name",
                    "rcsb_entity_source_organism.beg_seq_num",
                    "rcsb_entity_source_organism.end_seq_num",
                    "struct_ref.id",
                    "struct_ref.pdbx_db_accession",
                    "struct_ref.db_name",
                    "struct_ref.entity_id",
                    "struct_ref.pdbx_seq_one_letter_code",
                    "struct_ref.pdbx_align_begin",
                    "struct_ref_seq",
                    #
                    "entity_src_nat.pdbx_ncbi_taxonomy_id",
                    "entity_src_gen.pdbx_gene_src_ncbi_taxonomy_id",
                    "entity_src_gen.pdbx_host_org_ncbi_taxonomy_id",
                    "pdbx_entity_src_syn.ncbi_taxonomy_id",
                ]
                iCount = 0
                for entryId in entryD:
                    #
                    if resultKey in entryD[entryId]:
                        continue
                    #
                    qD = {"rcsb_polymer_entity_container_identifiers.entry_id": entryId}
                    qD.update(selectionQueryD)
                    #
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                    eD = {}
                    for ii, dD in enumerate(dL, 1):
                        rD = {}
                        logger.debug("%s (%4d) d is %r", entryId, ii, dD)
                        if "entity" in dD:
                            rD["rcsb_multiple_source_flag"] = dD["entity"]["rcsb_multiple_source_flag"] if "rcsb_multiple_source_flag" in dD["entity"] else "N"
                        #
                        if "rcsb_polymer_entity_container_identifiers" in dD:
                            ciD = dD["rcsb_polymer_entity_container_identifiers"]
                            rD["asym_ids"] = ciD["asym_ids"] if "asym_ids" in ciD else []
                            rD["auth_asym_ids"] = ciD["auth_asym_ids"] if "auth_asym_ids" in ciD else []
                            rD["entity_id"] = ciD["entity_id"]
                        #
                        if "entity_poly" in dD:
                            rD["type"] = dD["entity_poly"]["type"] if "type" in dD["entity_poly"] else None
                            rD["rcsb_entity_polymer_type"] = dD["entity_poly"]["rcsb_entity_polymer_type"] if "rcsb_entity_polymer_type" in dD["entity_poly"] else None
                            rD["entity_polymer_length"] = len(dD["entity_poly"]["pdbx_seq_one_letter_code_can"]) if "pdbx_seq_one_letter_code_can" in dD["entity_poly"] else 0
                        #
                        tL = []
                        if "rcsb_entity_source_organism" in dD:
                            for tD in dD["rcsb_entity_source_organism"]:
                                tL.append(tD)
                        rD["rcsb_entity_source_organism"] = copy.copy(tL)
                        #
                        qDL = []
                        if "struct_ref" in dD:
                            for tD in dD["struct_ref"]:
                                if "db_name" in tD:
                                    tD["db_name"] = str(tD["db_name"]).upper().strip()
                                    tD["db_name"] = "UNP" if tD["db_name"] in ["TREMBL"] else tD["db_name"]
                                qDL.append(tD)
                            if "struct_ref_seq" in dD:
                                for srD in qDL:
                                    refId = srD["id"]
                                    alignL = []
                                    for tD in dD["struct_ref_seq"]:
                                        if refId == tD["ref_id"]:
                                            alignL.append(tD)
                                    # srD["align_list"] = copy.copy(alignL)
                                    for align in alignL:
                                        authAsymId = align["pdbx_strand_id"]
                                        srD.setdefault("alignD", {}).setdefault(authAsymId, []).append(align)
                        rD["struct_ref"] = qDL
                        #
                        taxIdL = []
                        if "entity_src_nat" in dD:
                            for tD in dD["entity_src_nat"]:
                                if "pdbx_ncbi_taxonomy_id" in tD:
                                    taxIdL.append(tD["pdbx_ncbi_taxonomy_id"])
                        if "entity_src_gen" in dD:
                            for tD in dD["entity_src_gen"]:
                                if "pdbx_gene_src_ncbi_taxonomy_id" in tD:
                                    taxIdL.append(tD["pdbx_gene_src_ncbi_taxonomy_id"])
                                if "pdbx_host_org_ncbi_taxonomy_id" in tD:
                                    taxIdL.append(tD["pdbx_host_org_ncbi_taxonomy_id"])
                        if "pdbx_entity_src_syn" in dD:
                            for tD in dD["pdbx_entity_src_syn"]:
                                if "ncbi_taxonomy_id" in tD:
                                    taxIdL.append(tD["ncbi_taxonomy_id"])
                        qL = []
                        for taxId in taxIdL:
                            # taxonomy ids may be comma-separated strings; normalize to integers
                            ttL = [int(t.strip()) for t in str(taxId).split(",") if t.strip().isdigit()]
                            qL.extend(ttL)
                        logger.debug("TaxId list %r", qL)
                        rD["original_taxonomy_ids"] = copy.copy(list(set(qL)))
                        #
                        if "entity_id" in rD:
                            eD[rD["entity_id"]] = copy.copy(rD)
                    entryD[entryId][resultKey] = copy.copy(eD)
                    iCount += 1
                    if iCount % 1000 == 0:
                        logger.info("Completed fetch %d/%d entries", iCount, len(entryD))
                    if entryLimit and iCount >= entryLimit:
                        logger.info("Quitting after %d", iCount)
                        break
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return entryD
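# Illustrative sketch: traverse the "selected_polymer_entities" structure shown in
# the docstring above, listing the reference database alignment span for each author
# chain. All keys follow the documented example; nothing new is assumed.
def _exampleWalkAlignments(entryD):
    for entryId, eD in entryD.items():
        for entityId, peD in eD.get("selected_polymer_entities", {}).items():
            for srD in peD.get("struct_ref", []):
                for authAsymId, alignL in srD.get("alignD", {}).items():
                    for align in alignL:
                        logger.info(
                            "%s entity %s chain %s %s:%s %r-%r",
                            entryId, entityId, authAsymId,
                            srD.get("db_name"), srD.get("pdbx_db_accession"),
                            align.get("db_align_beg"), align.get("db_align_end"),
                        )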
def testSingleIndexSelect(self):
    """Test case - create collection, create simple single index, insert document list, read and check documents."""
    try:
        logger.debug("Starting testSingleIndexSelect")
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            nDocs = 100
            mg = MongoDbUtil(client)
            ok = mg.createCollection(self.__dbName, self.__collectionName)
            self.assertTrue(ok)
            ok = mg.databaseExists(self.__dbName)
            self.assertTrue(ok)
            ok = mg.collectionExists(self.__dbName, self.__collectionName)
            self.assertTrue(ok)
            #
            # Create before insert
            ok = mg.createIndex(self.__dbName, self.__collectionName, keyList=["DOC_ID"], indexName="primary", indexType="DESCENDING", uniqueFlag=True)
            self.assertTrue(ok)
            dList = []
            for ii in range(nDocs):
                dObj = self.__makeDataObj(2, 5, 5, ii)
                dList.append(dObj)
            #
            keyName = "DOC_ID"
            rIdL = mg.insertList(self.__dbName, self.__collectionName, dList, keyNames=[keyName], salvage=True)
            self.assertEqual(len(dList), len(rIdL))
            #
            for ii in range(nDocs):
                kVal = "DOC_%d" % ii
                rObj = mg.fetchOne(self.__dbName, self.__collectionName, "DOC_ID", kVal)
                # logger.debug("Return Object %s", pprint.pformat(rObj))
                rObj.pop("_id", None)
                dList[ii].pop("_id", None)
                self.assertEqual(len(dList[ii]), len(rObj))
                self.assertEqual(dList[ii], rObj)
            #
            ok = mg.dropIndex(self.__dbName, self.__collectionName, indexName="primary")
            self.assertTrue(ok)
            ok = mg.createIndex(self.__dbName, self.__collectionName, keyList=["DOC_ID"], indexName="primary", indexType="DESCENDING", uniqueFlag=True)
            self.assertTrue(ok)
            ok = mg.reIndex(self.__dbName, self.__collectionName)
            self.assertTrue(ok)
        #
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            ii = mg.count(self.__dbName, self.__collectionName)
            logger.debug("collection length %d", ii)
            #
            dList = mg.fetch(self.__dbName, self.__collectionName, ["DOC_ID"])
            self.assertEqual(len(dList), nDocs)
            logger.debug("Fetch length %d", len(dList))
            for ii, dD in enumerate(dList):
                logger.debug("Fetch num %d: %r", ii, dD)
            #
            dList = mg.fetch(self.__dbName, self.__collectionName, ["category_0.attribute_0"], queryD={"category_0.attribute_0": "val_0_0"})
            self.assertEqual(len(dList), nDocs)
            logger.debug("Fetch length %d", len(dList))
            for ii, dD in enumerate(dList):
                logger.debug("Fetch num %d: %r", ii, dD)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
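# Illustrative sketch distilled from the test above: the create-index-before-insert
# pattern using only the MongoDbUtil calls exercised in testSingleIndexSelect().
# The database/collection names and the document are placeholder values.
def _exampleUniqueIndexInsert(mg):
    mg.createCollection("example_db", "example_collection")
    mg.createIndex("example_db", "example_collection", keyList=["DOC_ID"], indexName="primary", indexType="DESCENDING", uniqueFlag=True)
    # with a unique index in place, duplicate DOC_ID values are rejected and
    # salvaged rather than silently duplicated
    return mg.insertList("example_db", "example_collection", [{"DOC_ID": "DOC_0"}], keyNames=["DOC_ID"], salvage=True)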
def getEntityInstances(self, entryD, **kwargs):
    """Get the selected validation data for the instances in the input entry dictionary.

    entryD[entryId]['selected_polymer_entities'][entityId]['validation'] = {}

    Adds keys 'pdbx_vrpt_instance_results' and 'pdbx_unobs_or_zero_occ_residues' to the
    validation dictionary above.

    Args:
        entryD (dict): entry dictionary to which per-instance validation results are added
        **kwargs: dbName, collectionName, savePath, saveKwargs, entryLimit

    Returns:
        entryD (dict): updated entry dictionary
    """
    dbName = kwargs.get("dbName", "pdbx_core")
    collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity_instance")
    savePath = kwargs.get("savePath", "entry-data.pic")
    saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
    entryLimit = kwargs.get("entryLimit", None)
    #
    try:
        optF = False
        iCount = 0
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(dbName, collectionName):
                logger.info("%s %s total document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                #
                for entryId, eD in entryD.items():
                    for entityId, peD in eD["selected_polymer_entities"].items():
                        # if "anal_instances" in peD:
                        #     continue
                        vD = {}
                        for asymId in peD["asym_ids"]:
                            qD = {
                                "rcsb_polymer_entity_instance_container_identifiers.entry_id": entryId,
                                "rcsb_polymer_entity_instance_container_identifiers.asym_id": asymId,
                            }
                            # qD = {"rcsb_entity_instance_container_validation_identifiers.entity_type": "polymer"}
                            # selectL = ["pdbx_vrpt_instance_results", "pdbx_unobs_or_zero_occ_residues"]
                            selectL = ["pdbx_vrpt_instance_results"]
                            tL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                            aD = {}
                            if not tL:
                                logger.info("No validation data for %s %s %s(%s)", dbName, collectionName, entryId, asymId)
                                continue
                            #
                            logger.debug(">>> %s %s (%s) dict key length %d ", collectionName, entryId, asymId, len(tL[0]))
                            #
                            if optF:
                                aD["pdbx_vrpt_instance_results"] = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                aD["pdbx_unobs_or_zero_occ_residues"] = tL[0]["pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[0] else []
                            #
                            if optF:
                                urdL = tL[0]["pdbx_unobs_or_zero_occ_residues"] if "pdbx_unobs_or_zero_occ_residues" in tL[0] else []
                                oL = [{"label_seq_id": urd["label_seq_id"], "label_comp_id": urd["label_comp_id"]} for urd in urdL]
                                aD["pdbx_unobs_or_zero_occ_residues"] = oL
                            #
                            try:
                                irdL = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                oL = [{"label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL]
                                aD["pdbx_vrpt_instance_results_seq"] = oL
                            except Exception as e:
                                logger.error("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))
                            #
                            try:
                                irdL = tL[0]["pdbx_vrpt_instance_results"] if "pdbx_vrpt_instance_results" in tL[0] else []
                                oL = [{"OWAB": ird["OWAB"], "label_seq_id": ird["label_seq_id"], "label_comp_id": ird["label_comp_id"]} for ird in irdL]
                                aD["pdbx_vrpt_instance_results_occ"] = oL
                            except Exception as e:
                                logger.debug("Failing with entryId %s entityId %s asymId %s bad validation data %s", entryId, entityId, asymId, str(e))
                            vD[asymId] = copy.copy(aD)
                        #
                        analD = self.analEntity(entryId, peD, vD)
                        entryD[entryId]["selected_polymer_entities"][entityId]["anal_instances"] = copy.copy(analD)
                    iCount += 1
                    if iCount % 500 == 0:
                        logger.info("Completed %d/%d entries", iCount, len(entryD))
                    if iCount % 2000 == 0:
                        ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                        logger.info("Saved polymer entity instance results (%d) status %r in %s", iCount, ok, savePath)
                    if entryLimit and iCount >= entryLimit:
                        break
        ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
        logger.info("Saved polymer instance results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return entryD
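# Illustrative sketch: enumerate the per-entity "anal_instances" results attached by
# getEntityInstances(). The value shape is whatever analEntity() returns and is
# deliberately not assumed here.
def _exampleListAnalyses(entryD):
    for entryId, eD in entryD.items():
        for entityId, peD in eD.get("selected_polymer_entities", {}).items():
            analD = peD.get("anal_instances", {})
            logger.info("%s entity %s analysis keys %r", entryId, entityId, sorted(analD))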
def getPolymerEntities(self, entryD, **kwargs):
    """Add 'selected_polymer_entities' satisfying the input conditions to the input entry dictionary."""
    dbName = kwargs.get("dbName", "pdbx_core")
    collectionName = kwargs.get("collectionName", "pdbx_core_polymer_entity")
    resultKey = kwargs.get("resultKey", "selected_polymer_entities")
    savePath = kwargs.get("savePath", "entry-data.pic")
    entryLimit = kwargs.get("entryLimit", None)
    saveKwargs = kwargs.get("saveKwargs", {"fmt": "pickle"})
    #
    iCount = 0
    try:
        with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
            mg = MongoDbUtil(client)
            if mg.collectionExists(dbName, collectionName):
                logger.info("%s %s document count is %d", dbName, collectionName, mg.count(dbName, collectionName))
                selectL = [
                    "rcsb_polymer_entity_container_identifiers",
                    "entity_poly.type",
                    "entity_poly.pdbx_seq_one_letter_code_can",
                    "rcsb_entity_source_organism.ncbi_taxonomy_id",
                    "rcsb_entity_source_organism.ncbi_scientific_name",
                    "struct_ref.pdbx_seq_one_letter_code",
                    "struct_ref.pdbx_db_accession",
                    "struct_ref.db_name",
                    "struct_ref.entity_id",
                ]
                for entryId in entryD:
                    #
                    if resultKey in entryD[entryId]:
                        continue
                    #
                    qD = {
                        "rcsb_polymer_entity_container_identifiers.entry_id": entryId,
                        "entity_poly.rcsb_entity_polymer_type": "Protein",
                        "entity.rcsb_multiple_source_flag": "N",
                    }
                    #
                    dL = mg.fetch(dbName, collectionName, selectL, queryD=qD)
                    logger.debug("%s query %r fetch result count %d", entryId, qD, len(dL))
                    eD = {}
                    for ii, dV in enumerate(dL, 1):
                        rD = {}
                        logger.debug("%s (%4d) d is %r", entryId, ii, dV)
                        if "rcsb_polymer_entity_container_identifiers" in dV and "asym_ids" in dV["rcsb_polymer_entity_container_identifiers"]:
                            rD["asym_ids"] = dV["rcsb_polymer_entity_container_identifiers"]["asym_ids"]
                            rD["entity_id"] = dV["rcsb_polymer_entity_container_identifiers"]["entity_id"]
                        if "entity_poly" in dV and "type" in dV["entity_poly"]:
                            rD["type"] = dV["entity_poly"]["type"]
                            rD["seq_one_letter_code_can"] = dV["entity_poly"].get("pdbx_seq_one_letter_code_can")
                        if "rcsb_entity_source_organism" in dV:
                            rD["ncbi_taxonomy_id"] = dV["rcsb_entity_source_organism"][0]["ncbi_taxonomy_id"] if "ncbi_taxonomy_id" in dV["rcsb_entity_source_organism"][0] else None
                            rD["ncbi_scientific_name"] = dV["rcsb_entity_source_organism"][0]["ncbi_scientific_name"] if "ncbi_scientific_name" in dV["rcsb_entity_source_organism"][0] else None
                        if "struct_ref" in dV and len(dV["struct_ref"]) == 1:
                            rD["seq_one_letter_code_ref"] = dV["struct_ref"][0]["pdbx_seq_one_letter_code"] if "pdbx_seq_one_letter_code" in dV["struct_ref"][0] else None
                            rD["db_accession"] = dV["struct_ref"][0]["pdbx_db_accession"] if "pdbx_db_accession" in dV["struct_ref"][0] else None
                            rD["db_name"] = dV["struct_ref"][0]["db_name"] if "db_name" in dV["struct_ref"][0] else None
                            #
                            refDbName = rD["db_name"]
                            dbAccession = rD["db_accession"]
                            dbRefSeq = self.__seqCache[dbAccession] if dbAccession in self.__seqCache else None
                            if refDbName in ["UNP"] and not dbRefSeq:
                                dbRefSeq = self.__fetchUniprot(dbAccession)
                                self.__seqCache[dbAccession] = dbRefSeq
                                logger.debug("Fetch uniprot %r", dbRefSeq)
                            rD["ref_db_seq"] = dbRefSeq
                        else:
                            rD["seq_one_letter_code_ref"] = rD["db_accession"] = rD["db_name"] = None
                        #
                        if "entity_id" in rD:
                            eD[rD["entity_id"]] = copy.copy(rD)
                    entryD[entryId][resultKey] = copy.copy(eD)
                    iCount += 1
                    if iCount % 10 == 0:
                        logger.info("Completed polymer entities fetch %d/%d entries", iCount, len(entryD))
                    if iCount % 2000 == 0:
                        ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
                        logger.info("Saved polymer entity results (%d) status %r in %s", iCount, ok, savePath)
                    if entryLimit and iCount >= entryLimit:
                        logger.info("Quitting after %d", iCount)
                        break
        #
        # for entryId in entryD:
        #     logger.debug(">> %s docD %r" % (entryId, entryD[entryId]))
        ok = self.__mU.doExport(savePath, entryD, **saveKwargs)
        logger.info("Saved polymer entity results (%d) entries %d status %r in %s", iCount, len(entryD), ok, savePath)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    return entryD