def testExtractEntities(self):
    """Test case - extract polymer entity objects and exercise path/value generation.

    Creates an ObjectExtractor for the "pdbx_core_polymer_entity" collection of
    the "pdbx_core" database (``useCache=False``), asserts that the reported
    object count is at least the configured test object limit, generates the
    path list from every returned object, installs the filtered path list on
    the extractor and, when the verbose flag is set, logs the extracted values
    for each object.

    Any exception raised along the way (a failed assertion included) is logged
    with its traceback and reported as a test failure.
    """
    try:
        cachePath = os.path.join(self.__workPath, "entity-data-test-cache.json")
        extractor = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_polymer_entity",
            cacheFilePath=cachePath,
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=self.__testEntryCacheKwargs,
            objectLimit=self.__objectLimitTest,
        )

        entityCount = extractor.getCount()
        logger.info("Entity count is %d", entityCount)
        self.assertGreaterEqual(entityCount, self.__objectLimitTest)

        # Feed every extracted object to the path generator.
        entityD = extractor.getObjects()
        for _, entityObj in entityD.items():
            extractor.genPathList(entityObj, path=None)

        # The unfiltered list is only logged; the default list is installed.
        unfilteredPathL = extractor.getPathList(filterList=False)
        logger.debug("Path list (unfiltered) %r", unfilteredPathL)

        pathL = extractor.getPathList()
        logger.debug("Path list %r", pathL)
        extractor.setPathList(pathL)

        if self.__verbose:
            # Dump the extracted values object by object.
            for entityKey, entityObj in entityD.items():
                extractor.genValueList(entityObj, path=None)
                valueD = extractor.getValues()
                logger.info("Index object %r %s", entityKey, pprint.pformat(valueD, indent=3, width=120))
    except Exception as err:
        logger.exception("Failing with %s", str(err))
        self.fail()
def testExtractSelectedEntityContent(self):
    """Test case - extract selected entity content

    Creates an ObjectExtractor for the "pdbx_core_polymer_entity" collection
    with ``objectLimit=None``, a selection query on
    "entity_poly.rcsb_entity_polymer_type" == "Protein" and a selection list of
    "rcsb_id" plus the reference sequence identifiers. The returned objects are
    then scanned to tally UniProt accessions by provenance source ("SIFTS" vs
    "PDB"), counting the entities that carry both kinds of assignment and how
    many of those have differing accession sets.

    The tallies are only logged; nothing is asserted about them. Any exception
    escaping the outer ``try`` is logged and reported as a test failure.

    Example of the identifier records read by the comparison loop:

    "reference_sequence_identifiers": [
        {
            "database_name": "UniProt",
            "database_accession": "Q5SHN1",
            "provenance_source": "SIFTS"
        },
        {
            "database_name": "UniProt",
            "database_accession": "Q5SHN1",
            "provenance_source": "PDB"
        }
    ]
    """
    try:
        obEx = ObjectExtractor(
            self.__cfgOb,
            databaseName="pdbx_core",
            collectionName="pdbx_core_polymer_entity",
            cacheFilePath=os.path.join(
                self.__workPath, "entity-selected-content-test-cache.json"),
            useCache=False,
            keyAttribute="entity",
            uniqueAttributes=["rcsb_id"],
            cacheKwargs=self.__testEntryCacheKwargs,
            # The configured test limit is deliberately not applied here:
            # objectLimit=self.__objectLimitTest,
            objectLimit=None,
            selectionQuery={
                "entity_poly.rcsb_entity_polymer_type": "Protein"
            },
            selectionList=[
                "rcsb_id",
                "rcsb_polymer_entity_container_identifiers.reference_sequence_identifiers"
            ],
        )
        eCount = obEx.getCount()
        logger.info("Entity count is %d", eCount)
        #
        # NOTE(review): the extent of this conditional is reconstructed - it is
        # assumed to cover the whole path-list section below, since objD is
        # fetched again right after it. Confirm against the upstream file.
        if self.__objectLimitTest is not None:
            self.assertGreaterEqual(eCount, self.__objectLimitTest)
            objD = obEx.getObjects()
            for _, obj in objD.items():
                obEx.genPathList(obj, path=None)
            # The unfiltered path list is only logged.
            pL = obEx.getPathList(filterList=False)
            logger.debug("Path list (unfiltered) %r", pL)
            # The default path list is the one installed on the extractor.
            pL = obEx.getPathList()
            logger.debug("Path list %r", pL)
            obEx.setPathList(pL)
            if self.__verbose:
                for ky, obj in objD.items():
                    obEx.genValueList(obj, path=None)
                    tD = obEx.getValues()
                    logger.info("Index object %r %s", ky, pprint.pformat(tD, indent=3, width=120))
        # Fetch the objects (again) for the provenance comparison below.
        objD = obEx.getObjects()
        # logger.info("objD.keys() %r", list(objD.keys()))
        totCount = 0  # entities with both a PDB and a SIFTS UniProt assignment
        difCount = 0  # of those, entities whose two accession sets differ
        pdbUnpIdD = defaultdict(int)  # accession -> occurrences with provenance "PDB"
        siftsUnpIdD = defaultdict(int)  # accession -> occurrences with provenance "SIFTS"
        pdbDifUnpIdD = defaultdict(int)  # PDB accession -> occurrences in differing entities
        for entityKey, eD in objD.items():
            # Per-entity best effort: a failure on one entity (e.g. a missing
            # identifiers section) is logged as a warning and the scan continues.
            try:
                siftsS = set()
                pdbS = set()
                for tD in eD["rcsb_polymer_entity_container_identifiers"][
                        "reference_sequence_identifiers"]:
                    if tD["database_name"] == "UniProt":
                        if tD["provenance_source"] == "SIFTS":
                            siftsS.add(tD["database_accession"])
                            siftsUnpIdD[tD["database_accession"]] += 1
                        elif tD["provenance_source"] == "PDB":
                            pdbS.add(tD["database_accession"])
                            pdbUnpIdD[tD["database_accession"]] += 1
                    # NOTE(review): this else is assumed to pair with the
                    # database_name check (its message says "No UniProt").
                    else:
                        logger.debug(
                            "No UniProt for %r", eD["rcsb_polymer_entity_container_identifiers"]
                        )
                logger.debug("PDB assigned sequence length %d", len(pdbS))
                logger.debug("SIFTS assigned sequence length %d", len(siftsS))
                # Only entities with assignments from both sources are compared.
                if pdbS and siftsS:
                    totCount += 1
                    if pdbS != siftsS:
                        difCount += 1
                        for idV in pdbS:
                            pdbDifUnpIdD[idV] += 1
            except Exception as e:
                logger.warning("No identifiers for %s with %s", entityKey, str(e))
        logger.info("Total %d differences %d", totCount, difCount)
        logger.info("Unique UniProt ids PDB %d SIFTS %d", len(pdbUnpIdD), len(siftsUnpIdD))
        logger.info("Unique UniProt differences %d ", len(pdbDifUnpIdD))
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()