def setUp(self):
    configPath = os.path.join(HERE, "test-data", "drugbank-config-example.yml")
    configName = "site_info_configuration"
    cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
    self.__user = cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
    self.__pw = cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
    self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
    self.__fastaPath = os.path.join(HERE, "test-output", "drugbank-targets.fa")
    self.__taxonPath = os.path.join(HERE, "test-output", "drugbank-targets-taxon.tdd")
    # self.__seqMatchResultsPath = os.path.join(HERE, "test-data", "drugbank-vs-pdbprent-filtered-results.json.gz")
    self.__startTime = time.time()
    logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
class CitationUtilsTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(CitationUtilsTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbDirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__mU = MarshalUtil()
        self.__entryLimitTest = 20
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 10**6, unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testEntryCitationAccess(self):
        """Test case - extract entry citations"""
        try:
            ce = CitationUtils(self.__cfgOb, exdbDirPath=self.__exdbDirPath, useCache=True, cacheKwargs=self.__cacheKwargs, entryLimit=self.__entryLimitTest)
            eCount = ce.getCitationEntryCount()
            self.assertGreaterEqual(eCount, self.__entryLimitTest)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
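# A minimal runner sketch (an assumption, not from the original module) showing
# how this test case might be exercised directly with the standard unittest API.
def citationUtilsSuite():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(CitationUtilsTests("testEntryCitationAccess"))
    return suiteSelect


if __name__ == "__main__":
    unittest.TextTestRunner(verbosity=2).run(citationUtilsSuite())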
def testExportToYaml(self):
    cfgOb = ConfigUtil(configFormat="yaml", mockTopPath=self.__mockTopPath, roundTrip=True)
    #
    cD = self.__createDataSet()
    cfgOb.importConfig(cD)
    #
    ok = cfgOb.writeConfig(self.__outPathConfigYamlExport, configFormat="yaml")
    self.assertTrue(ok)
    cfgOb = ConfigUtil(configPath=self.__outPathConfigYamlExport, configFormat="yaml", mockTopPath=self.__mockTopPath)
    rD = cfgOb.exportConfig()
    self.assertGreaterEqual(len(rD), 1)
    v = cfgOb.get("SubA.Name", sectionName="Section1")
    self.assertEqual(v, "THE_NAME")
    v = cfgOb.get("SubA.Counts", sectionName="Section3")
    self.assertEqual(len(v), 3)
class DictionaryProviderTests(unittest.TestCase):
    def setUp(self):
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dirPath = os.path.join(self.__cachePath, "dictionaries")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        dictLocatorMap = self.__cfgOb.get("DICT_LOCATOR_CONFIG_MAP", sectionName=self.__contentInfoConfigName)
        schemaName = "pdbx_core"
        self.__dictLocators = [self.__cfgOb.getPath(configLocator, sectionName=self.__configName) for configLocator in dictLocatorMap[schemaName]]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testResourceCache(self):
        """Test case - generate and check dictionary artifact and api caches"""
        try:
            logger.debug("Dictionary locators %r", self.__dictLocators)
            dp = DictionaryApiProvider(dirPath=self.__dirPath, useCache=False)
            dApi = dp.getApi(self.__dictLocators)
            ok = dApi.testCache()
            self.assertTrue(ok)
            title = dApi.getDictionaryTitle()
            logger.debug("Title %r", title)
            self.assertEqual(title, "mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic")
            #
            # revL = dApi.getDictionaryHistory()
            numRev = dApi.getDictionaryRevisionCount()
            logger.debug("Number of dictionary revisions (numRev) %r", numRev)
            self.assertGreater(numRev, 220)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class EntityPolymerExtractorFixture(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(EntityPolymerExtractorFixture, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__cacheKwargs = {"fmt": "pickle"}
        self.__exdbCacheDirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__entryLimitTest = None
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testExtractEntityPolymers(self):
        """Fixture - extract and save entity polymer info"""
        try:
            epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs, entryLimit=self.__entryLimitTest)
            eCount = epe.getEntryCount()
            self.assertGreaterEqual(eCount, 10)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
def testReadIniConfig(self):
    try:
        cfgOb = ConfigUtil(configPath=self.__inpPathConfigIni, mockTopPath=self.__dataPath)
        sName = "DEFAULT"
        pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
        pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
        #
        self.assertEqual(pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
        self.assertEqual(pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX"))
        pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName)
        pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName)
        self.assertEqual(pathBird, "MOCK_BIRD_REPO")
        self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX")
        sName = "Section1"
        #
        helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE", sectionName=sName)
        tv = helperMethod.echo("test_value")
        self.assertEqual(tv, "test_value")
        #
        tEnv = "TEST_ENV_VAR"
        tVal = "TEST_ENV_VAR_VALUE"
        os.environ[tEnv] = tVal
        eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName)
        self.assertEqual(tVal, eVal)
        #
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
def testReadIniConfigWithEnv(self):
    try:
        os.environ["TEST_MOCKPATH_ENV"] = self.__mockTopPath
        cfgOb = ConfigUtil(configPath=self.__inpPathConfigWithEnvIni, mockTopPath=self.__mockTopPath, importEnvironment=True)
        testEnv = cfgOb.get("test_mockpath_env")
        self.assertEqual(testEnv, self.__mockTopPath)
        logger.debug("Environmental variable test_mock_path is %r", testEnv)
        # Verify that imported environment keys are all lowercased -
        testEnv = cfgOb.get("TEST_MOCKPATH_ENV")
        self.assertEqual(testEnv, None)
        logger.debug("Environmental variable TEST_MOCK_PATH is %r", testEnv)
        #
        testEnv = cfgOb.get("TOP_PROJECT_PATH")
        self.assertEqual(testEnv, self.__mockTopPath)
        logger.debug("Derived path is %r", testEnv)
        #
        sName = "Section1"
        testEnv = cfgOb.get("PROJ_DIR_PATH", sectionName=sName)
        self.assertEqual(testEnv, os.path.join(self.__mockTopPath, "da_top"))
        testEnv = cfgOb.get("PROJ_ARCHIVE_PATH", sectionName=sName)
        self.assertEqual(testEnv, os.path.join(self.__mockTopPath, "da_top", "archive"))
        testEnv = cfgOb.get("proj_deposit_path", sectionName=sName)
        self.assertEqual(testEnv, os.path.join(self.__mockTopPath, "da_top", "deposit"))
        #
        ok = cfgOb.writeConfig(self.__outPathConfigWithEnvIni, configFormat="ini")
        self.assertTrue(ok)
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
class PharosTargetProviderTests(unittest.TestCase):
    skipFull = True

    def setUp(self):
        configPath = os.path.join(HERE, "test-data", "pharos-config-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName)
        self.__user = self.__cfgOb.get("_MYSQL_DB_USER_NAME", sectionName=self.__configName)
        self.__pw = self.__cfgOb.get("_MYSQL_DB_PASSWORD", sectionName=self.__configName)
        self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
        self.__dirPath = os.path.join(self.__cachePath, "Pharos-targets")
        self.__dataPath = os.path.join(HERE, "test-data")
        #
        self.__pharosFixture()

    def tearDown(self):
        pass

    #
    def __pharosFixture(self):
        try:
            ok = False
            fU = FileUtil()
            srcPath = os.path.join(self.__dataPath, "Pharos")
            dstPath = self.__dirPath
            for fn in ["drug_activity", "cmpd_activity", "target", "protein", "t2tc"]:
                inpPath = os.path.join(srcPath, fn + ".tdd.gz")
                outPath = os.path.join(dstPath, fn + ".tdd.gz")
                fU.get(inpPath, outPath)
                fU.uncompress(outPath, outputDir=dstPath)
                fU.remove(outPath)
            fU.put(os.path.join(srcPath, "pharos-readme.txt"), os.path.join(dstPath, "pharos-readme.txt"))
            ok = True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            ok = False
        return ok

    @unittest.skip("Bootstrap test")
    def testBootstrap(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath, useCache=False, reloadDb=False)
            configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml")
            configName = "site_info_remote_configuration"
            cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
            ok = ptP.backup(cfgOb, configName)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Database dependency")
    def testFetchAndLoadPharosTargets(self):
        try:
            # Now about 630s on macos
            ptP = PharosTargetProvider(cachePath=self.__cachePath, useCache=False, reloadDb=True, fromDb=True, mysqlUser=self.__user, mysqlPassword=self.__pw)
            ok = ptP.testCache()
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Very long test")
    def testExportPharosTargets(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath, useCache=True, reloadDb=False, mysqlUser=self.__user, mysqlPassword=self.__pw)
            ok = ptP.testCache()
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testExportPharosTargetFasta(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath, useCache=True, reloadDb=False)
            ok = ptP.testCache()
            self.assertTrue(ok)
            fastaPath = os.path.join(HERE, "test-output", "pharos-targets.fa")
            taxonPath = os.path.join(HERE, "test-output", "pharos-targets-taxon.tdd")
            ok = ptP.exportProteinFasta(fastaPath, taxonPath, addTaxonomy=False)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Internal test")
    def testStashDependencies(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath, useCache=True, reloadDb=False, fromDb=False)
            ok = ptP.testCache()
            self.assertTrue(ok)
            #
            ok = ptP.backup(self.__cfgOb, self.__configName)
            self.assertTrue(ok)
            #
            ok = ptP.restore(self.__cfgOb, self.__configName)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFull, "Very long test")
    def testExportPharosTargetFastaTax(self):
        try:
            ptP = PharosTargetProvider(cachePath=self.__cachePath, useCache=True, reloadDb=False)
            ok = ptP.testCache()
            self.assertTrue(ok)
            #
            fastaPath = os.path.join(HERE, "test-output", "pharos-targets.fa")
            taxonPath = os.path.join(HERE, "test-output", "pharos-targets-taxon.tdd")
            ok = ptP.exportProteinFasta(fastaPath, taxonPath, addTaxonomy=True)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
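# Illustrative standalone sketch of the FASTA export path exercised by the tests
# above, assuming a previously populated Pharos cache; the paths here are
# placeholders rather than values from the original module.
if __name__ == "__main__":
    provider = PharosTargetProvider(cachePath="./CACHE", useCache=True, reloadDb=False)
    if provider.testCache():
        provider.exportProteinFasta("./pharos-targets.fa", "./pharos-targets-taxon.tdd", addTaxonomy=False)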
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--full", default=False, action="store_true", help="Fresh full load in a new tables/collections")
    parser.add_argument("--replace", default=False, action="store_true", help="Load with replacement in an existing table/collection (default)")
    #
    parser.add_argument("--load_chem_comp_ref", default=False, action="store_true", help="Load Chemical Component reference definitions (public subset)")
    parser.add_argument("--load_chem_comp_core_ref", default=False, action="store_true", help="Load Chemical Component Core reference definitions (public subset)")
    parser.add_argument("--load_bird_chem_comp_ref", default=False, action="store_true", help="Load Bird Chemical Component reference definitions (public subset)")
    parser.add_argument("--load_bird_chem_comp_core_ref", default=False, action="store_true", help="Load Bird Chemical Component Core reference definitions (public subset)")
    parser.add_argument("--load_bird_ref", default=False, action="store_true", help="Load Bird reference definitions (public subset)")
    parser.add_argument("--load_bird_family_ref", default=False, action="store_true", help="Load Bird Family reference definitions (public subset)")
    parser.add_argument("--load_entry_data", default=False, action="store_true", help="Load PDBx entry data (current released subset)")
    parser.add_argument("--load_pdbx_core", default=False, action="store_true", help="Load all PDBx core collections (current released subset)")
    parser.add_argument("--load_pdbx_core_merge", default=False, action="store_true", help="Load all PDBx core collections with merged content (current released subset)")
    #
    parser.add_argument("--load_pdbx_core_entry", default=False, action="store_true", help="Load PDBx core entry (current released subset)")
    parser.add_argument("--load_pdbx_core_entity", default=False, action="store_true", help="Load PDBx core entity (current released subset)")
    parser.add_argument("--load_pdbx_core_entity_monomer", default=False, action="store_true", help="Load PDBx core entity monomer (current released subset)")
    parser.add_argument("--load_pdbx_core_assembly", default=False, action="store_true", help="Load PDBx core assembly (current released subset)")
    parser.add_argument("--load_ihm_dev", default=False, action="store_true", help="Load I/HM DEV model data (current released subset)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    parser.add_argument("--db_type", default="mongo", help="Database server type (default=mongo)")
    parser.add_argument(
        "--document_style",
        default="rowwise_by_name_with_cardinality",
        help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name)",
    )
    parser.add_argument("--read_back_check", default=False, action="store_true", help="Perform read back check on all documents")
    parser.add_argument("--schema_level", default=None, help="Schema validation level (full|min default=None)")
    #
    parser.add_argument("--load_file_list_path", default=None, help="Input file containing load file path list (override automatic repository scan)")
    parser.add_argument("--fail_file_list_path", default=None, help="Output file containing file paths that fail to load")
    parser.add_argument("--save_file_list_path", default=None, help="Save repo file paths from automatic file system scan in this path")
    parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
    parser.add_argument("--file_limit", default=None, help="Load file limit for testing")
    parser.add_argument("--prune_document_size", default=None, help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path", default=None, help="Cache path for resource files")
    parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
    parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
    parser.add_argument("--vrpt_repo_path", default=None, help="Path to validation report repository")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    # Configuration details
    configPath = args.config_path
    configName = args.config_name
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
        #
        if args.vrpt_repo_path:
            vrptPath = args.vrpt_repo_path
            if not os.access(vrptPath, os.R_OK):
                logger.error("Unreadable validation report repository path %r", vrptPath)
            envName = cfgOb.get("VRPT_REPO_PATH_ENV", sectionName=configName)
            os.environ[envName] = vrptPath
            logger.info("Using alternate validation report path %s", os.getenv(envName))
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)
    #
    try:
        readBackCheck = args.read_back_check
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        fileLimit = int(args.file_limit) if args.file_limit else None
        failedFilePath = args.fail_file_list_path
        fPath = args.load_file_list_path
        schemaLevel = args.schema_level if args.schema_level in ["min", "full", "minimum"] else None
        # --replace overrides the default full load
        loadType = "replace" if args.replace else "full"
        saveInputFileListPath = args.save_file_list_path
        pruneDocumentSize = float(args.prune_document_size) if args.prune_document_size else None
        cachePath = args.cache_path if args.cache_path else "."
        cachePath = os.path.abspath(cachePath)
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        if args.document_style not in ["rowwise_by_name", "rowwise_by_name_with_cardinality", "columnwise_by_name", "rowwise_by_id", "rowwise_no_name"]:
            logger.error("Unsupported document style %s", args.document_style)
        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    # Rebuild or check resource cache
    okS = True
    ok = buildResourceCache(cfgOb, configName, cachePath, rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r", rebuildCache, cachePath)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    # Read any input path lists -
    inputPathList = None
    if fPath:
        mu = MarshalUtil(workPath=cachePath)
        inputPathList = mu.doImport(fPath, fmt="list")
        if not inputPathList:
            logger.error("Missing or empty input file path list %s", fPath)
            exit(1)
    #
    if args.db_type == "mongo":
        mw = PdbxLoader(
            cfgOb,
            cachePath,
            resourceName="MONGO_DB",
            numProc=numProc,
            chunkSize=chunkSize,
            fileLimit=fileLimit,
            verbose=debugFlag,
            readBackCheck=readBackCheck,
            rebuildSchemaFlag=rebuildSchemaFlag,
        )
        # Common keyword arguments shared by every load operation below
        loadKwD = {
            "loadType": loadType,
            "inputPathList": inputPathList,
            "styleType": args.document_style,
            "dataSelectors": ["PUBLIC_RELEASE"],
            "failedFilePath": failedFilePath,
            "saveInputFileListPath": saveInputFileListPath,
            "pruneDocumentSize": pruneDocumentSize,
            "validationLevel": schemaLevel,
        }
        if args.load_chem_comp_ref:
            ok = mw.load("chem_comp", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_chem_comp_core_ref:
            ok = mw.load("chem_comp_core", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_chem_comp_ref:
            ok = mw.load("bird_chem_comp", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_chem_comp_core_ref:
            ok = mw.load("bird_chem_comp_core", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_ref:
            ok = mw.load("bird", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_bird_family_ref:
            # Bird family definitions use a distinct data selector
            ok = mw.load("bird_family", **dict(loadKwD, dataSelectors=["BIRD_FAMILY_PUBLIC_RELEASE"]))
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_entry_data:
            ok = mw.load("pdbx", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_pdbx_core:
            ok = mw.load("pdbx_core", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_merge:
            # Merge validation report (vrpt) content with the core load
            ok = mw.load("pdbx_core", mergeContentTypes=["vrpt"], **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_entity"], **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entity_monomer:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_entity_monomer"], **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        #
        if args.load_pdbx_core_entry:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_entry"], **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_pdbx_core_assembly:
            ok = mw.load("pdbx_core", collectionLoadList=["pdbx_core_assembly"], **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.load_ihm_dev:
            ok = mw.load("ihm_dev", **loadKwD)
            okS = loadStatus(mw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
    #
    logger.info("Operation completed with status %r", ok and okS)
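# A typical invocation sketch for this loader script; the script name and the
# paths below are placeholders (assumptions), not part of the original module.
#
#   python exdb_repo_load_cli.py --load_pdbx_core --config_path ./exdb-config-example.yml \
#       --config_name site_info_configuration --cache_path ./CACHE --num_proc 4 --schema_level min
#
if __name__ == "__main__":
    main()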
class ExDbWorkflow(object):
    def __init__(self, **kwargs):
        # Configuration details
        configPath = kwargs.get("configPath", "exdb-config-example.yml")
        self.__configName = kwargs.get("configName", "site_info_configuration")
        mockTopPath = kwargs.get("mockTopPath", None)
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath)
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__cachePath = os.path.abspath(self.__cachePath)
        self.__debugFlag = kwargs.get("debugFlag", False)
        if self.__debugFlag:
            logger.setLevel(logging.DEBUG)
        #
        # Rebuild or check resource cache
        rebuildCache = kwargs.get("rebuildCache", False)
        self.__useCache = not rebuildCache
        restoreUseGit = kwargs.get("restoreUseGit", True)
        restoreUseStash = kwargs.get("restoreUseStash", True)
        providerTypeExclude = kwargs.get("providerTypeExclude", None)
        #
        self.__cacheStatus = True
        if rebuildCache:
            self.__cacheStatus = self.buildResourceCache(
                rebuildCache=rebuildCache,
                providerTypeExclude=providerTypeExclude,
                restoreUseGit=restoreUseGit,
                restoreUseStash=restoreUseStash,
            )
        logger.debug("Cache status is %r", self.__cacheStatus)

    def load(self, op, **kwargs):
        logger.info("Starting operation %r\n", op)
        if not self.__cacheStatus:
            logger.error("Resource cache test or rebuild has failed - exiting")
            return False
        # Argument processing
        if op not in ["etl_tree_node_lists", "etl_chemref", "etl_uniprot_core", "upd_ref_seq", "upd_ref_seq_comp_models", "refresh_pubchem"]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            # Test mode and UniProt accession primary match minimum count for doReferenceSequenceUpdate()
            testMode = kwargs.get("testMode", False)
            minMatchPrimaryPercent = kwargs.get("minMatchPrimaryPercent", None)
            minMissing = kwargs.get("minMissing", 0)
            #
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            refChunkSize = int(kwargs.get("refChunkSize", 100))
            documentLimit = int(kwargs.get("documentLimit")) if "documentLimit" in kwargs else None
            loadType = kwargs.get("loadType", "full")  # or replace
            dbType = kwargs.get("dbType", "mongo")
            tU = TimeUtil()
            dataSetId = kwargs.get("dataSetId") if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            # Rebuild or reuse the reference sequence cache
            rebuildSequenceCache = kwargs.get("rebuildSequenceCache", False)
            useSequenceCache = not rebuildSequenceCache
            #
        except Exception as e:
            logger.exception("Argument or configuration processing failing with %s", str(e))
            return False
        #
        okS = ok = False
        if dbType == "mongo":
            if op == "etl_tree_node_lists":
                rhw = TreeNodeListWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = rhw.load(dataSetId, loadType=loadType)
                okS = self.loadStatus(rhw.getLoadStatus(), readBackCheck=readBackCheck)
            elif op == "etl_chemref":
                crw = ChemRefEtlWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = crw.load(dataSetId, extResource="DrugBank", loadType=loadType)
                okS = self.loadStatus(crw.getLoadStatus(), readBackCheck=readBackCheck)
            elif op == "etl_uniprot_core":
                crw = UniProtCoreEtlWorker(
                    self.__cfgOb,
                    self.__cachePath,
                    numProc=numProc,
                    chunkSize=chunkSize,
                    documentLimit=documentLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                    useCache=self.__useCache,
                )
                ok = crw.load(dataSetId, extResource="UniProt", loadType=loadType)
                okS = self.loadStatus(crw.getLoadStatus(), readBackCheck=readBackCheck)
            elif op == "upd_ref_seq":
                databaseName = "pdbx_core"
                collectionName = "pdbx_core_polymer_entity"
                polymerType = "Protein"
                ok = self.doReferenceSequenceUpdate(
                    databaseName,
                    collectionName,
                    polymerType,
                    fetchLimit=documentLimit,
                    useSequenceCache=useSequenceCache,
                    testMode=testMode,
                    minMatchPrimaryPercent=minMatchPrimaryPercent,
                    minMissing=minMissing,
                    refChunkSize=refChunkSize,
                )
                okS = ok
            elif op == "upd_ref_seq_comp_models":
                databaseName = "pdbx_comp_model_core"
                collectionName = "pdbx_comp_model_core_polymer_entity"
                polymerType = "Protein"
                ok = self.doReferenceSequenceUpdate(
                    databaseName,
                    collectionName,
                    polymerType,
                    fetchLimit=documentLimit,
                    useSequenceCache=useSequenceCache,
                    testMode=testMode,
                    minMatchPrimaryPercent=minMatchPrimaryPercent,
                    minMissing=minMissing,
                    refChunkSize=refChunkSize,
                )
                okS = ok
        #
        logger.info("Completed operation %r with status %r\n", op, ok and okS)
        return ok and okS

    def loadStatus(self, statusList, readBackCheck=True):
        ret = False
        try:
            dl = DocumentLoader(self.__cfgOb, self.__cachePath, "MONGO_DB", numProc=1, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
            #
            sectionName = "data_exchange_configuration"
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
            ret = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def buildResourceCache(self, rebuildCache=False, providerTypeExclude=None, restoreUseGit=True, restoreUseStash=True):
        """Generate and cache resource dependencies."""
        ret = False
        try:
            rp = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                providerTypeExclude=providerTypeExclude,
                restoreUseGit=restoreUseGit,
                restoreUseStash=restoreUseStash,
            )
            ret = rp.cacheResources(useCache=not rebuildCache, doBackup=False)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def doReferenceSequenceUpdate(
        self, databaseName, collectionName, polymerType, fetchLimit=None, useSequenceCache=False, testMode=False, minMatchPrimaryPercent=None, minMissing=0, refChunkSize=50, **kwargs
    ):
        try:
            _ = kwargs
            _ = testMode
            # -------
            rsaP = ReferenceSequenceAnnotationProvider(
                self.__cfgOb, databaseName, collectionName, polymerType, useCache=useSequenceCache, cachePath=self.__cachePath, maxChunkSize=refChunkSize
            )
            ok = rsaP.testCache(minMatchPrimaryPercent=minMatchPrimaryPercent, minMissing=minMissing)
            if ok:
                rsa = ReferenceSequenceAnnotationAdapter(rsaP)
                obTr = ObjectTransformer(self.__cfgOb, objectAdapter=rsa)
                ok = obTr.doTransform(
                    databaseName=databaseName, collectionName=collectionName, fetchLimit=fetchLimit, selectionQuery={"entity_poly.rcsb_entity_polymer_type": polymerType}
                )
            else:
                logger.error("Reference sequence data cache build failing")
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
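# Illustrative usage sketch (an assumption, not part of the original module):
# construct the workflow and run one supported ETL operation end-to-end.
if __name__ == "__main__":
    exWf = ExDbWorkflow(configPath="exdb-config-example.yml", configName="site_info_configuration", cachePath="./CACHE")
    status = exWf.load("etl_chemref", numProc=2, chunkSize=10, loadType="full")
    logger.info("etl_chemref completed with status %r", status)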
class RepoLoadWorkflow(object):
    def __init__(self, **kwargs):
        # Configuration details
        configPath = kwargs.get("configPath", "exdb-config-example.yml")
        self.__configName = kwargs.get("configName", "site_info_configuration")
        mockTopPath = kwargs.get("mockTopPath", None)
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath)
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__cachePath = os.path.abspath(self.__cachePath)
        self.__debugFlag = kwargs.get("debugFlag", False)
        if self.__debugFlag:
            logger.setLevel(logging.DEBUG)
        #
        # Rebuild or check resource cache
        # rebuildCache = kwargs.get("rebuildCache", False)
        # self.__cacheStatus = self.buildResourceCache(rebuildCache=rebuildCache)
        # logger.debug("Cache status is %r", self.__cacheStatus)

    def load(self, op, **kwargs):
        # if not self.__cacheStatus:
        #     logger.error("Resource cache test or rebuild has failed - exiting")
        #     return False
        # Argument processing
        if op not in ["pdbx-loader", "etl-repository-holdings", "etl-entity-sequence-clusters"]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            fileLimit = int(kwargs.get("fileLimit")) if "fileLimit" in kwargs else None
            documentLimit = int(kwargs.get("documentLimit")) if "documentLimit" in kwargs else None
            failedFilePath = kwargs.get("failFileListPath", None)
            loadFileListPath = kwargs.get("loadFileListPath", None)
            saveInputFileListPath = kwargs.get("saveFileListPath", None)
            schemaLevel = kwargs.get("schemaLevel", "min") if kwargs.get("schemaLevel") in ["min", "full"] else "min"
            loadType = kwargs.get("loadType", "full")  # or replace
            updateSchemaOnReplace = kwargs.get("updateSchemaOnReplace", True)
            pruneDocumentSize = float(kwargs.get("pruneDocumentSize")) if "pruneDocumentSize" in kwargs else None
            # Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name)
            documentStyle = kwargs.get("documentStyle", "rowwise_by_name_with_cardinality")
            dbType = kwargs.get("dbType", "mongo")
            #
            databaseName = kwargs.get("databaseName", None)
            databaseNameList = self.__cfgOb.get("DATABASE_NAMES_ALL", sectionName="database_catalog_configuration").split(",")
            collectionNameList = kwargs.get("collectionNameList", None)
            mergeValidationReports = kwargs.get("mergeValidationReports", True)
            #
            tU = TimeUtil()
            dataSetId = kwargs.get("dataSetId") if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            seqDataLocator = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=self.__configName)
            sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=self.__configName)
        except Exception as e:
            logger.exception("Argument and configuration processing failing with %s", str(e))
            return False
        #
        ok = okS = False
        if op == "pdbx-loader" and dbType == "mongo" and databaseName in databaseNameList:
            okS = True
            try:
                inputPathList = None
                if loadFileListPath:
                    mu = MarshalUtil(workPath=self.__cachePath)
                    inputPathList = mu.doImport(loadFileListPath, fmt="list")
                    if not inputPathList:
                        logger.error("Operation %r missing or empty input file path list %s - exiting", op, loadFileListPath)
                        return False
            except Exception as e:
                logger.exception("Operation %r processing input path list failing with %s", op, str(e))
                return False
            #
            try:
                mw = PdbxLoader(
                    self.__cfgOb,
                    self.__cachePath,
                    resourceName="MONGO_DB",
                    numProc=numProc,
                    chunkSize=chunkSize,
                    fileLimit=fileLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                )
                ok = mw.load(
                    databaseName,
                    collectionLoadList=collectionNameList,
                    loadType=loadType,
                    inputPathList=inputPathList,
                    styleType=documentStyle,
                    dataSelectors=["PUBLIC_RELEASE"],
                    failedFilePath=failedFilePath,
                    saveInputFileListPath=saveInputFileListPath,
                    pruneDocumentSize=pruneDocumentSize,
                    validationLevel=schemaLevel,
                    mergeContentTypes=["vrpt"] if mergeValidationReports else None,
                    updateSchemaOnReplace=updateSchemaOnReplace,
                )
                okS = self.loadStatus(mw.getLoadStatus(), readBackCheck=readBackCheck)
            except Exception as e:
                logger.exception("Operation %r database %r failing with %s", op, databaseName, str(e))
        elif op == "etl-entity-sequence-clusters" and dbType == "mongo":
            cw = SequenceClustersEtlWorker(
                self.__cfgOb, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=self.__debugFlag, readBackCheck=readBackCheck, workPath=self.__cachePath
            )
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = self.loadStatus(cw.getLoadStatus(), readBackCheck=readBackCheck)
        elif op == "etl-repository-holdings" and dbType == "mongo":
            rhw = RepoHoldingsEtlWorker(
                self.__cfgOb,
                sandboxPath,
                self.__cachePath,
                numProc=numProc,
                chunkSize=chunkSize,
                documentLimit=documentLimit,
                verbose=self.__debugFlag,
                readBackCheck=readBackCheck,
            )
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = self.loadStatus(rhw.getLoadStatus(), readBackCheck=readBackCheck)

        logger.info("Completed operation %r with status %r", op, ok and okS)
        return ok and okS

    def loadStatus(self, statusList, readBackCheck=True):
        ret = False
        try:
            dl = DocumentLoader(self.__cfgOb, self.__cachePath, "MONGO_DB", numProc=1, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
            #
            sectionName = "data_exchange_configuration"
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
            ret = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def buildResourceCache(self, rebuildCache=False):
        """Generate and cache resource dependencies."""
        ret = False
        try:
            useCache = not rebuildCache
            logger.info("Cache setting useCache is %r", useCache)
            rp = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath)
            ret = rp.cacheResources(useCache=useCache)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret
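# Illustrative usage sketch (an assumption, not part of the original module):
# a full pdbx_core load followed by a repository-holdings ETL pass.
if __name__ == "__main__":
    rlWf = RepoLoadWorkflow(configPath="exdb-config-example.yml", cachePath="./CACHE")
    okLoad = rlWf.load("pdbx-loader", databaseName="pdbx_core", numProc=4, chunkSize=10, loadType="full")
    okEtl = rlWf.load("etl-repository-holdings", numProc=2, chunkSize=10)
    logger.info("Workflow status: load %r etl %r", okLoad, okEtl)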
class PdbxLoaderRemoteTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(PdbxLoaderRemoteTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        #
        self.__resourceName = "MONGO_DB"
        self.__failedFilePath = os.path.join(HERE, "test-output", "failed-list.txt")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 5
        self.__fileLimit = 5
        self.__documentStyle = "rowwise_by_name_with_cardinality"
        self.__ldList = [
            # {"databaseName": "chem_comp_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "min"},
            {
                "databaseName": "bird_chem_comp_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": None,
                "validationLevel": "full",
                "updateSchemaOnReplace": False,
                "status": True,
            },
            {
                "databaseName": "bird_chem_comp_core",
                "collectionNameList": None,
                "loadType": "replace",
                "mergeContentTypes": None,
                "validationLevel": "full",
                "updateSchemaOnReplace": True,
                "status": True,
            },
            {
                "databaseName": "pdbx_core",
                "collectionNameList": None,
                "loadType": "full",
                "mergeContentTypes": ["vrpt"],
                "validationLevel": "full",
                "updateSchemaOnReplace": False,
                "status": True,
            },
            # {
            #     "databaseName": "pdbx_core",
            #     "collectionNameList": None,
            #     "loadType": "replace",
            #     "mergeContentTypes": ["vrpt"],
            #     "validationLevel": "full",
            #     "updateSchemaOnReplace": True,
            #     "status": True,
            # },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6, unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testPdbxLoader(self):
        for ld in self.__ldList:
            self.__pdbxLoaderWrapper(**ld)

    def __pdbxLoaderWrapper(self, **kwargs):
        """Wrapper for PDBx loader module"""
        try:
            logger.info("Loading %s", kwargs["databaseName"])
            mw = PdbxLoader(
                self.__cfgOb,
                cachePath=self.__cachePath,
                resourceName=self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                fileLimit=self.__fileLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
                maxStepLength=2000,
                useSchemaCache=True,
                rebuildSchemaFlag=False,
            )
            ok = mw.load(
                kwargs["databaseName"],
                collectionLoadList=kwargs["collectionNameList"],
                loadType=kwargs["loadType"],
                inputPathList=None,
                styleType=self.__documentStyle,
                dataSelectors=["PUBLIC_RELEASE"],
                failedFilePath=self.__failedFilePath,
                saveInputFileListPath=None,
                pruneDocumentSize=None,
                logSize=False,
                validationLevel=kwargs["validationLevel"],
                mergeContentTypes=kwargs["mergeContentTypes"],
                useNameFlag=False,
                updateSchemaOnReplace=kwargs["updateSchemaOnReplace"],
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            self.assertEqual(ok, kwargs["status"])
            ok = self.__loadStatus(mw.getLoadStatus())
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __loadStatus(self, statusList):
        sectionName = "data_exchange_configuration"
        dl = DocumentLoader(
            self.__cfgOb,
            self.__cachePath,
            resourceName=self.__resourceName,
            numProc=self.__numProc,
            chunkSize=self.__chunkSize,
            documentLimit=None,
            verbose=self.__verbose,
            readBackCheck=self.__readBackCheck,
        )
        #
        databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
        collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
        ok = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
        return ok
class RepoHoldingsRemoteLoaderTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(RepoHoldingsRemoteLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__resourceName = "MONGO_DB"
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = None
        self.__filterType = "assign-dates"
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        # Sample data set
        self.__updateId = "2021_36"
        #
        eiP = EntryInfoProvider(cachePath=self.__cachePath, useCache=True)
        ok = eiP.testCache(minCount=0)
        self.assertTrue(ok)
        ok = eiP.restore(self.__cfgOb, configName, useStash=False, useGit=True)
        self.assertTrue(ok)
        ok = eiP.reload()
        self.assertTrue(ok)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testLoadHoldingsRemote(self):
        """Test case - load legacy repository holdings and status data -

        [repository_holdings]
        DATABASE_NAME=repository_holdings
        DATABASE_VERSION_STRING=v5
        COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
        COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
        COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
        COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
        COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry
        """
        try:
            sectionName = "repository_holdings_configuration"
            rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath, filterType=self.__filterType)
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            logger.info("databaseName %r", databaseName)
            addValues = None
            #
            maxDoc = 5
            dList = rhdp.getHoldingsRemovedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_REMOVED", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUnreleasedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UNRELEASED", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUpdateEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UPDATE", sectionName=sectionName)
            logger.info("collectionName %r", collectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCurrentEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_CURRENT", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCombinedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_COMBINED", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class SchemaDefDataPrepTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefDataPrepTests, self).__init__(methodName)
        self.__loadPathList = []
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE", sectionName=configName, default="local")
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # Removes timestamped data items to allow diffs.
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): unitS = "MB" if platform.system() == "Darwin" else "GB" rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6, unitS) endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def __timeStep(self, msg): endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)", msg, time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testSimpleSchemaDefDataPrep(self): for tcD in self.__simpleTestCaseList: rejectLength = 0 if self.__discoveryMode == "remote" else tcD[ "rejectLength"] mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[ "mockLength"] if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote": logger.info("Skipping %r in discovery mode %r", tcD["contentType"], self.__discoveryMode) continue self.__simpleSchemaDataPrep( tcD["contentType"], tcD["filterType"], tcD["styleType"], mockLength, rejectLength=rejectLength, mergeContentTypes=tcD["mergeContentTypes"]) def testFullSchemaDefDataPrep(self): for tcD in self.__fullTestCaseList: rejectLength = 0 if self.__discoveryMode == "remote" else tcD[ "rejectLength"] mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[ "mockLength"] if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote": logger.info("Skipping %r in discovery mode %r", tcD["contentType"], self.__discoveryMode) continue self.__fullSchemaDataPrep( tcD["contentType"], tcD["filterType"], tcD["styleType"], mockLength, rejectLength=rejectLength, mergeContentTypes=tcD["mergeContentTypes"], excludeExtras=tcD["excludeExtras"], ) def __simpleSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None): """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection. Args: contentType (str): Content type name filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...) styleType (str): organization of output document (e.g. rowise-by-name) mockLength (int): Expected length of the test data for the input content type rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0. dataSelectors (list of str, optional): data selection criteria. Defaults to None. mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. 
['vrpt']) """ try: dataSelectors = dataSelectors if dataSelectors else [ "PUBLIC_RELEASE" ] dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True) _ = SchemaDefAccess(dD) inputPathList = self.__rpP.getLocatorObjList( contentType=contentType, mergeContentTypes=mergeContentTypes) sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType, dataTyping="ANY") dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType) sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose) # logger.debug("For %s mock length %d length of path list %d\n", contentType, mockLength, len(inputPathList)) self.assertGreaterEqual(len(inputPathList), mockLength) tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments( inputPathList, styleType=styleType, filterType=filterType, dataSelectors=dataSelectors) logger.debug( "For %s mock length %d reject length %d length of tddl list %d\n", contentType, mockLength, rejectLength, len(tableDataDictList)) self.assertGreaterEqual(len(tableDataDictList), mockLength - rejectLength) self.assertGreaterEqual(len(containerNameList), mockLength - rejectLength) if rejectList: logger.debug("For %s rejecting components %r", contentType, rejectList) # self.assertEqual(len(rejectList), rejectLength) fName = "simple-prep-%s-%s.json" % (contentType, styleType) if self.__exportFlag: fPath = os.path.join(self.__outputPath, fName) self.__mU.doExport(fPath, tableDataDictList, fmt="json", indent=3) if self.__diffFlag: fPath = os.path.join(self.__savedOutputPath, fName) refDocList = self.__mU.doImport(fPath, fmt="json") self.assertEqual(len(refDocList), len(tableDataDictList)) # jD = diff(refDocList, tableDataDictList, syntax="explicit", marshal=True) if jD: _, fn = os.path.split(fPath) bn, _ = os.path.splitext(fn) fPath = os.path.join(self.__outputPath, bn + "-diff.json") logger.debug("jsondiff for %s %s = \n%s", contentType, styleType, pprint.pformat(jD, indent=3, width=100)) self.__mU.doExport(fPath, jD, fmt="json", indent=3) self.assertEqual(len(jD), 0) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def __logDocumentOrder(self, docList): for doc in docList: logger.debug("keys %r", list(doc.keys())) def __filterDocuments(self, docList, excludeList=None): excludeList = excludeList if excludeList else [] for doc in docList: for excl in excludeList: if excl in doc: del doc[excl] def __fullSchemaDataPrep(self, contentType, filterType, styleType, mockLength, rejectLength=0, dataSelectors=None, mergeContentTypes=None, excludeExtras=None): """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection. Args: contentType (str): Content type name filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...) styleType (str): organization of output document (e.g. rowise-by-name) mockLength (int): Expected length of the test data for the input content type rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0. dataSelectors (list of str, optional): data selection criteria. Defaults to None. mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. 
['vrpt']) """ try: excludeExtras = excludeExtras if excludeExtras else [] _ = mockLength _ = rejectLength dD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True) _ = SchemaDefAccess(dD) inputPathList = self.__rpP.getLocatorObjList( contentType=contentType, mergeContentTypes=mergeContentTypes) sd, _, collectionNameList, _ = self.__schP.getSchemaInfo( databaseName=contentType, dataTyping="ANY") # dP = DictionaryApiProviderWrapper(self.__cachePath, cfgOb=self.__cfgOb, configName=self.__configName, useCache=True) dictApi = dP.getApiByName(contentType) # rP = DictMethodResourceProvider( self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, restoreUseStash=False, restoreUseGit=True, providerTypeExclude=self.__excludeType, ) dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) # dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType) sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose) containerList = self.__rpP.getContainerList(inputPathList) for container in containerList: cName = container.getName() logger.debug("Processing container %s", cName) dmh.apply(container) # for collectionName in collectionNameList: tableIdExcludeList = sd.getCollectionExcluded(collectionName) tableIdIncludeList = sd.getCollectionSelected(collectionName) sliceFilter = sd.getCollectionSliceFilter(collectionName) sdp.setSchemaIdExcludeList(tableIdExcludeList) sdp.setSchemaIdIncludeList(tableIdIncludeList) # docList, _, _ = sdp.processDocuments( containerList, styleType=styleType, sliceFilter=sliceFilter, filterType=filterType, dataSelectors=dataSelectors, collectionName=collectionName) docList = sdp.addDocumentPrivateAttributes( docList, collectionName) docList = sdp.addDocumentSubCategoryAggregates( docList, collectionName) # Special exclusions for the test harness. (removes timestamped data items to allow diffs.) self.__filterDocuments(docList, excludeExtras) mergeS = "-".join( mergeContentTypes) if mergeContentTypes else "" fName = "full-prep-%s-%s-%s-%s.json" % ( contentType, collectionName, mergeS, styleType) if self.__exportFlag: self.__logDocumentOrder(docList) fPath = os.path.join(self.__outputPath, fName) self.__mU.doExport(fPath, docList, fmt="json", indent=3) logger.debug("Exported %r", fPath) # if self.__diffFlag: fPath = os.path.join(self.__savedOutputPath, fName) refDocList = self.__mU.doImport(fPath, fmt="json") self.assertEqual(len(refDocList), len(docList)) logger.debug("For %s %s len refDocList %d", contentType, collectionName, len(refDocList)) logger.debug("For %s %s len docList %d", contentType, collectionName, len(docList)) jD = diff(refDocList, docList, syntax="explicit", marshal=True) if jD: _, fn = os.path.split(fPath) bn, _ = os.path.splitext(fn) fPath = os.path.join(self.__outputPath, bn + "-diff.json") logger.debug("jsondiff for %s %s = \n%s", contentType, collectionName, pprint.pformat(jD, indent=3, width=100)) self.__mU.doExport(fPath, jD, fmt="json", indent=3) self.assertEqual(len(jD), 0) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
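# Illustrative aside (added commentary, not part of the original test module): the
# diff-based regression check in the two prep methods above uses jsondiff; with
# syntax="explicit" and marshal=True it returns a plain JSON-serializable dict that
# is empty when the prepared documents match the saved reference copies. A minimal,
# self-contained sketch with hypothetical documents:
def _exampleJsonDiffCheck():
    from jsondiff import diff
    refDocList = [{"entry_id": "1ABC", "count": 3}]
    docList = [{"entry_id": "1ABC", "count": 3}]
    jD = diff(refDocList, docList, syntax="explicit", marshal=True)
    assert jD == {}  # an empty diff means no drift against the reference export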
def main(): parser = argparse.ArgumentParser() #
defaultConfigName = "site_info_configuration" #
parser.add_argument("--update_chem_comp_ref", default=False, action="store_true", help="Update schema for Chemical Component reference definitions")
parser.add_argument("--update_chem_comp_core_ref", default=False, action="store_true", help="Update core schema for Chemical Component reference definitions")
parser.add_argument("--update_bird_chem_comp_ref", default=False, action="store_true", help="Update schema for Bird Chemical Component reference definitions")
parser.add_argument("--update_bird_chem_comp_core_ref", default=False, action="store_true", help="Update core schema for Bird Chemical Component reference definitions")
parser.add_argument("--update_bird_ref", default=False, action="store_true", help="Update schema for Bird reference definitions")
parser.add_argument("--update_bird_family_ref", default=False, action="store_true", help="Update schema for Bird Family reference definitions")
parser.add_argument("--update_pdbx", default=False, action="store_true", help="Update schema for PDBx entry data")
parser.add_argument("--update_pdbx_core", default=False, action="store_true", help="Update schema for PDBx core entry/entity data")
parser.add_argument("--update_pdbx_comp_model_core", default=False, action="store_true", help="Update schema for PDBx computational model core entry/entity data") #
parser.add_argument("--update_repository_holdings", default=False, action="store_true", help="Update schema for repository holdings")
parser.add_argument("--update_entity_sequence_clusters", default=False, action="store_true", help="Update schema for entity sequence clusters")
parser.add_argument("--update_data_exchange", default=False, action="store_true", help="Update schema for data exchange status")
parser.add_argument("--update_ihm_dev", default=False, action="store_true", help="Update schema for I/HM dev entry data")
parser.add_argument("--update_drugbank_core", default=False, action="store_true", help="Update DrugBank schema") #
parser.add_argument("--update_config_all", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_ALL)")
parser.add_argument("--update_config_deployed", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_DEPLOYED)")
parser.add_argument("--update_config_test", default=False, action="store_true", help="Update using configuration settings (e.g. DATABASE_NAMES_TEST)") #
parser.add_argument("--config_path", default=None, help="Path to configuration options file")
parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name") #
parser.add_argument("--cache_path", default=None, help="Schema cache directory path")
parser.add_argument("--encoding_types", default=None, help="Schema encoding (rcsb|json|bson) (comma separated)")
parser.add_argument("--validation_levels", default=None, help="Schema validation level (full|min) (comma separated)")
parser.add_argument("--compare_only", default=False, action="store_true", help="Perform comparison with cached schema") #
parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for dependencies and testing") #
parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
args = parser.parse_args() #
debugFlag = args.debug
if debugFlag: logger.setLevel(logging.DEBUG)
# ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
# Configuration Details
configPath = args.config_path
configName = args.config_name
cachePath = args.cache_path
compareOnly = args.compare_only #
encodingTypes = args.encoding_types.split(",") if args.encoding_types else []
validationLevels = args.validation_levels.split(",") if args.validation_levels else []
dataTypingList = ["ANY", "SQL"]
if not configPath: configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
try: if os.access(configPath, os.R_OK): os.environ["DBLOAD_CONFIG_PATH"] = configPath logger.info("Using configuration path %s (%s)", configPath, configName) else: logger.error("Missing or access issue with config file %r", configPath) exit(1) mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath) if configName != defaultConfigName: cfgOb.replaceSectionName(defaultConfigName, configName)
except Exception as e: logger.error("Missing or access issue with config file %r with %s", configPath, str(e)) exit(1) #
databaseNameList = []
if args.update_chem_comp_ref: databaseNameList.append("chem_comp")
if args.update_bird_chem_comp_ref: databaseNameList.append("bird_chem_comp")
if args.update_chem_comp_core_ref: databaseNameList.append("chem_comp_core")
if args.update_bird_chem_comp_core_ref: databaseNameList.append("bird_chem_comp_core")
if args.update_bird_ref: databaseNameList.append("bird")
if args.update_bird_family_ref: databaseNameList.append("bird_family")
if args.update_pdbx: databaseNameList.append("pdbx")
if args.update_pdbx_core: databaseNameList.append("pdbx_core")
if args.update_pdbx_comp_model_core: databaseNameList.append("pdbx_comp_model_core")
if args.update_repository_holdings: databaseNameList.append("repository_holdings")
if args.update_entity_sequence_clusters: databaseNameList.append("sequence_clusters")
if args.update_data_exchange: databaseNameList.append("data_exchange")
if args.update_ihm_dev: databaseNameList.append("ihm_dev")
if args.update_drugbank_core: databaseNameList.append("drugbank_core")
if args.update_config_deployed: databaseNameList = cfgOb.getList( "DATABASE_NAMES_DEPLOYED", sectionName="database_catalog_configuration") dataTypingList = cfgOb.getList( "DATATYPING_DEPLOYED",
sectionName="database_catalog_configuration") validationLevels = cfgOb.getList( "VALIDATION_LEVELS_DEPLOYED", sectionName="database_catalog_configuration") encodingTypes = cfgOb.getList( "ENCODING_TYPES_DEPLOYED", sectionName="database_catalog_configuration") if args.update_config_all: databaseNameList = cfgOb.getList( "DATABASE_NAMES_ALL", sectionName="database_catalog_configuration") dataTypingList = cfgOb.getList( "DATATYPING_ALL", sectionName="database_catalog_configuration") validationLevels = cfgOb.getList( "VALIDATION_LEVELS_ALL", sectionName="database_catalog_configuration") encodingTypes = cfgOb.getList( "ENCODING_TYPES_ALL", sectionName="database_catalog_configuration") if args.update_config_test: databaseNameList = cfgOb.getList( "DATABASE_NAMES_TEST", sectionName="database_catalog_configuration") dataTypingList = cfgOb.getList( "DATATYPING_TEST", sectionName="database_catalog_configuration") validationLevels = cfgOb.getList( "VALIDATION_LEVELS_TEST", sectionName="database_catalog_configuration") encodingTypes = cfgOb.getList( "ENCODING_TYPES_TEST", sectionName="database_catalog_configuration") # scnD = cfgOb.get("document_collection_names", sectionName="document_helper_configuration") # databaseNameList = list(set(databaseNameList)) logger.debug("Collections %s", list(scnD.items())) logger.debug("databaseNameList %s", databaseNameList) if compareOnly: schP = SchemaProvider(cfgOb, cachePath, useCache=True) difPathList = [] for databaseName in databaseNameList: for dataTyping in dataTypingList: logger.debug("Building schema %s with types %s", databaseName, dataTyping) pth = schP.schemaDefCompare(databaseName, dataTyping) if pth: difPathList.append(pth) if difPathList: logger.info("Schema definition difference path list %r", difPathList) difPathList = [] for databaseName in databaseNameList: dD = schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=False) sD = SchemaDefAccess(dD) for cd in sD.getCollectionInfo(): collectionName = cd["NAME"] for encodingType in encodingTypes: if encodingType.lower() != "json": continue for level in validationLevels: pth = schP.jsonSchemaCompare(databaseName, collectionName, encodingType, level) if pth: difPathList.append(pth) if difPathList: logger.info("JSON schema difference path list %r", difPathList) else: schP = SchemaProvider(cfgOb, cachePath, useCache=False) for databaseName in databaseNameList: for encodingType in encodingTypes: if encodingType == "rcsb": for dataTyping in dataTypingList: logger.info( "Creating schema definition for content type %s data typing %s", databaseName, dataTyping) schP.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True) else: if databaseName in scnD: for dD in scnD[databaseName]: collectionName = dD["NAME"] for validationLevel in validationLevels: logger.info( "Creating %r schema for content type %s collection %s", encodingType, databaseName, collectionName) schP.makeSchema(databaseName, collectionName, encodingType=encodingType, level=validationLevel, saveSchema=True)
class DictMethodRunnerTests(unittest.TestCase): def setUp(self): self.__export = True self.__numProc = 2 self.__fileLimit = 200 mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__cachePath = os.path.join(TOPDIR, "CACHE") configPath = os.path.join(mockTopPath, "config", "dbload-setup-example.yml") configName = "site_info_configuration" self.__configName = configName self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) self.__mU = MarshalUtil(workPath=self.__cachePath) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath) # self.__testCaseList = [ { "contentType": "pdbx_core", "mockLength": 50, "mergeContent": ["vrpt"] }, { "contentType": "bird_chem_comp_core", "mockLength": 17, "mergeContent": None }, ] # self.__modulePathMap = self.__cfgOb.get( "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName) # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def __runContentType(self, contentType, mockLength, mergeContent): """Read and process test fixture data files from the input content type.""" try: dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True) dictApi = dP.getApiByName(contentType) rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST") dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) locatorObjList = self.__rpP.getLocatorObjList( contentType=contentType, mergeContentTypes=mergeContent) containerList = self.__rpP.getContainerList(locatorObjList) # logger.debug("Length of locator list %d\n", len(locatorObjList)) self.assertGreaterEqual(len(locatorObjList), mockLength) for container in containerList: cName = container.getName() # # if cName not in ["1B5F"]: # continue logger.debug("Processing container %s", cName) dmh.apply(container) if self.__export: savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif") self.__mU.doExport(savePath, [container], fmt="mmcif") except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() def testMethodRunner(self): """Test method runner for multiple content types.""" for tD in self.__testCaseList: self.__runContentType(tD["contentType"], tD["mockLength"], tD["mergeContent"]) def testMethodRunnerSetup(self): """Test the setup methods for method runner class""" try: dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True) dictApi = dP.getApiByName("pdbx") rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST") dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) ok = dmh is not None self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
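# Pipeline note (added commentary): the pattern exercised above is the core of the
# dictionary method machinery. DictionaryApiProviderWrapper supplies the dictionary
# API, DictMethodResourceProvider supplies shared lookup resources, and
# DictMethodRunner.apply(container) runs the dictionary-registered helper methods
# to inject computed categories into each data container before export.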
class ChemRefDataPrepValidateTests(unittest.TestCase): def setUp(self): self.__verbose = True # self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") self.__cachePath = os.path.join(TOPDIR, "CACHE") # self.__configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=self.__configName, mockTopPath=self.__mockTopPath) self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testValidateFull(self): self.__validateChemRef("DrugBank", schemaLevel="full") def __validateChemRef(self, extResource, schemaLevel="full"): eCount = 0 if extResource == "DrugBank": schemaName = "drugbank_core" collectionNames = ["drugbank_core"] user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=self.__configName) pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=self.__configName) # cacheDir = self.__cfgOb.get("DRUGBANK_CACHE_DIR", sectionName=self.__configName) dbP = DrugBankProvider(cachePath=self.__cachePath, useCache=True, username=user, password=pw) # idD = dbP.getMapping() # crExt = ChemRefExtractor(self.__cfgOb) # idD = crExt.getChemCompAccesionMapping(extResource) dList = dbP.getDocuments() logger.info("Validating %d Drugbank documents", len(dList)) eCount = self.__validate(schemaName, collectionNames, dList, schemaLevel=schemaLevel) return eCount def __validate(self, databaseName, collectionNames, dList, schemaLevel="full"): eCount = 0 for collectionName in collectionNames: _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True) cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True) # Raises exceptions for schema compliance. Draft4Validator.check_schema(cD) # valInfo = Draft4Validator(cD, format_checker=FormatChecker()) for ii, dD in enumerate(dList): logger.debug("Database %s collection %s document %d", databaseName, collectionName, ii) try: cCount = 0 for error in sorted(valInfo.iter_errors(dD), key=str): logger.info( "database %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message) logger.info(">>> failing object is %r", dD) eCount += 1 cCount += 1 # logger.debug("database %s collection %s count %d", databaseName, collectionName, cCount) except Exception as e: logger.exception("Validation error %s", str(e)) return eCount
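# Illustrative aside: the jsonschema pattern used by __validate() above, shown with
# a tiny hypothetical schema. Draft4Validator.check_schema() raises SchemaError for
# a malformed schema, while iter_errors() yields one ValidationError per violation
# without raising, which is what lets the test count errors across many documents.
def _exampleJsonSchemaValidation():
    from jsonschema import Draft4Validator, FormatChecker
    cD = {"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]}
    Draft4Validator.check_schema(cD)
    valInfo = Draft4Validator(cD, format_checker=FormatChecker())
    errorL = sorted(valInfo.iter_errors({"name": 42}), key=str)
    assert len(errorL) == 1  # 42 is not of type 'string'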
class DataExchangeStatusLoaderTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(DataExchangeStatusLoaderTests, self).__init__(methodName) self.__verbose = True def setUp(self): # # mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") self.__cachePath = os.path.join(TOPDIR, "CACHE") configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath) # self.__cfgOb.dump() self.__resourceName = "MONGO_DB" self.__readBackCheck = True self.__numProc = 2 self.__chunkSize = 10 self.__documentLimit = 1000 # # sample data set self.__updateId = "2018_23" # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testLoadExchangeStatus(self): """ Test case - load data exchange status objects. [data_exchange] DATABASE_NAME=data_exchange DATABASE_VERSION_STRING=v5 COLLECTION_UPDATE_STATUS=rcsb_data_exchange_status COLLECTION_VERSION_STRING=v0_1 """ try: for ii in range(1, 100): collectionName = "my_collection_" + str(ii) dList = [] desp = DataExchangeStatus() tS = desp.setStartTime() self.assertGreaterEqual(len(tS), 15) ok = desp.setObject("my_database", collectionName) self.assertTrue(ok) ok = desp.setStatus(updateId=None, successFlag="Y") self.assertTrue(ok) # tS = desp.setEndTime() self.assertGreaterEqual(len(tS), 15) dList.append(desp.getStatus()) # self.assertEqual(len(dList), 1) logger.debug("Status record %r", dList[0]) sectionName = "data_exchange_configuration" dl = DocumentLoader( self.__cfgOb, self.__cachePath, self.__resourceName, numProc=self.__numProc, chunkSize=self.__chunkSize, documentLimit=self.__documentLimit, verbose=self.__verbose, readBackCheck=self.__readBackCheck, ) # databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName) # collectionVersion = self.__cfgOb.get('COLLECTION_VERSION_STRING', sectionName=sectionName) collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName) if ii == 1: loadType = "full" else: loadType = "append" ok = dl.load(databaseName, collectionName, loadType=loadType, documentList=dList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None) self.assertTrue(ok) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
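# For orientation (added commentary; field names inferred from the index attributes
# used in the load call above, values illustrative): each status document identifies
# one (database, collection) update, roughly
#   {"update_id": ..., "database_name": "my_database", "object_name": "my_collection_1",
#    plus the begin/end timestamps and success flag set by DataExchangeStatus}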
def testReadYamlConfig(self): try: cfgOb = ConfigUtil(configPath=self.__inpPathConfigYaml, configFormat="yaml", mockTopPath=self.__mockTopPath) ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml, configFormat="yaml") self.assertTrue(ok) # sName = "DEFAULT" pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName) pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName) # self.assertEqual( pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO")) self.assertEqual( pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX")) pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName) pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName) self.assertEqual(pathBird, "MOCK_BIRD_REPO") self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX") sName = "Section1" # helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE", sectionName=sName) tv = helperMethod.echo("test_value") self.assertEqual(tv, "test_value") # tEnv = "TEST_ENV_VAR" tVal = "TEST_ENV_VAR_VALUE" os.environ[tEnv] = tVal eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName) self.assertEqual(tVal, eVal) ky = "42d13dfc9eb689e48c774aa5af8a7e15dbabcd5041939bef213eb37aed882fd6" os.environ["CONFIG_SUPPORT_TOKEN_ENV"] = ky # un = cfgOb.getSecret("SECRET_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") pw = cfgOb.getSecret("SECRET_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") self.assertEqual(un, "testuser") self.assertEqual(pw, "testuserpassword") # un = cfgOb.get("_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") pw = cfgOb.get("_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") self.assertEqual(un, "testuser") self.assertEqual(pw, "testuserpassword") # un = cfgOb.getSecret("_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") pw = cfgOb.getSecret("_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") self.assertEqual(un, "testuser") self.assertEqual(pw, "testuserpassword") # sName = "Section2" un = cfgOb.getSecret("_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") pw = cfgOb.getSecret("_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") self.assertEqual(un, "testuser") self.assertEqual(pw, "testuserpassword") # test fallback # CLEAR_TEXT_USERNAME: testuser2 # CLEAR_TEXT_PASSWORD: changeme2 un = cfgOb.get("_CLEAR_TEXT_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") pw = cfgOb.get("_CLEAR_TEXT_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN") self.assertEqual(un, "testuser2") self.assertEqual(pw, "changeme2") except Exception as e: logger.exception("Failing with %s", str(e)) self.fail() return {}
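# Sketch of the kind of YAML fixture this test reads (contents reconstructed from
# the assertions above, not copied from the actual test-data file):
#
#   DEFAULT:
#     BIRD_REPO_PATH: MOCK_BIRD_REPO
#     PDBX_REPO_PATH: MOCK_PDBX_SANDBOX
#   Section1:
#     ENV_OPTION_A: TEST_ENV_VAR
#   Section2:
#     CLEAR_TEXT_USERNAME: testuser2
#     CLEAR_TEXT_PASSWORD: changeme2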
class PdbxLoaderFixture(unittest.TestCase): def __init__(self, methodName="runTest"): super(PdbxLoaderFixture, self).__init__(methodName) self.__verbose = True
def setUp(self): # # self.__isMac = platform.system() == "Darwin" self.__excludeType = None if self.__isMac else "optional" self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml") # configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example-local.yml")
# To Do: Investigate why GitUtil sometimes gives divergence error when using 'DISCOVERY_MODE: remote', but not with 'local':
# stderr: 'fatal: Need to specify how to reconcile divergent branches.'
# Behavior isn't entirely predictable, since it happens sometimes but not all the time.
# To fully debug, will need to add more logging statements to GitUtil, StashableBase, & StashUtil (in rcsb.utils.io)
# Or, can try to resolve the error directly by specifying how to reconcile divergent branches in the git.Repo class.
configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath) # self.__resourceName = "MONGO_DB" self.__failedFilePath = os.path.join(HERE, "test-output", "failed-list.txt") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__readBackCheck = True self.__numProc = 2 self.__chunkSize = 10 self.__fileLimit = 38 self.__documentStyle = "rowwise_by_name_with_cardinality" #
self.__birdChemCompCoreIdList = [ "PRD_000010", "PRD_000060", "PRD_000220", "PRD_000882", "PRD_000154", "PRD_000877", "PRD_000198", "PRD_000009", "PRD_000979", "PRDCC_000010", "PRDCC_000220", "PRDCC_000882", "PRDCC_000154", "PRDCC_000198", "PRDCC_000009", "FAM_000010", "FAM_000210", "FAM_000220", "FAM_000001", "FAM_000391", "FAM_000093", "FAM_000084", "FAM_000016", "FAM_000336", "1G1", "2RT", "2XL", "2XN", "ATP", "BJA", "BM3", "CNC", "DAL", "DDZ", "DHA", "DSN", "GTP", "HKL", "NAC", "NAG", "NND", "PTR", "SEP", "SMJ", "STL", "UNK", "UNX", "UVL", ] #
self.__pdbIdList = [ "1ah1", "1b5f", "1bmv", "1c58", "1dsr", "1dul", "1kqe", "1o3q", "1sfo", "2hw3", "2hyv", "2osl", "2voo", "2wmg", "3ad7", "3hya", "3iyd", "3mbg", "3rer", "3vd8", "3vfj", "3x11", "3ztj", "4e2o", "4en8", "4mey", "5eu8", "5kds", "5tm0", "5vh4", "5vp2", "6fsz", "6lu7", "6nn7", "6q20", "6rfk", "6rku", "6yrq", ]
self.__ldList = [ { "databaseName": "bird_chem_comp_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "min", "inputIdCodeList": self.__birdChemCompCoreIdList }, { "databaseName": "pdbx_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": ["vrpt"], "validationLevel": "min", "inputIdCodeList": self.__pdbIdList }, { "databaseName": "pdbx_comp_model_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "min", "inputIdCodeList": None }, ] #
self.__modelFixture()
self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def tearDown(self): unitS = "MB" if platform.system() == "Darwin" else "GB" rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss logger.info("Maximum resident memory size %.4f %s", rusageMax / 10 ** 6, unitS) endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
def __modelFixture(self): fU = FileUtil() modelSourcePath = os.path.join(self.__mockTopPath, "AF") for iPath in glob.iglob(os.path.join(modelSourcePath, "*.cif.gz")): fn = os.path.basename(iPath) uId = fn.split("-")[1] h3 = uId[-2:] h2 = uId[-4:-2] h1 = uId[-6:-4] oPath = os.path.join(self.__cachePath, "computed-models", h1, h2, h3, fn) fU.put(iPath, oPath)
def testPdbxLoader(self): # for ld in self.__ldList: self.__pdbxLoaderWrapper(**ld)
def __pdbxLoaderWrapper(self, **kwargs): """Wrapper for the PDBx loader module""" try: logger.info("Loading %s", kwargs["databaseName"]) mw = PdbxLoader( self.__cfgOb, cachePath=self.__cachePath, resourceName=self.__resourceName, numProc=self.__numProc, chunkSize=self.__chunkSize, fileLimit=kwargs.get("fileLimit", self.__fileLimit), verbose=self.__verbose, readBackCheck=self.__readBackCheck, maxStepLength=2000, useSchemaCache=True, rebuildSchemaFlag=False, ) ok = mw.load( kwargs["databaseName"], collectionLoadList=kwargs["collectionNameList"], loadType=kwargs["loadType"], inputPathList=None, inputIdCodeList=kwargs["inputIdCodeList"], styleType=self.__documentStyle, dataSelectors=["PUBLIC_RELEASE"], failedFilePath=self.__failedFilePath, saveInputFileListPath=None, pruneDocumentSize=None, logSize=False, validationLevel=kwargs["validationLevel"], mergeContentTypes=kwargs["mergeContentTypes"], useNameFlag=False, providerTypeExclude=self.__excludeType, restoreUseGit=True, restoreUseStash=False, ) self.assertTrue(ok) ok = self.__loadStatus(mw.getLoadStatus()) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
def __loadStatus(self, statusList): sectionName = "data_exchange_configuration" dl = DocumentLoader( self.__cfgOb, self.__cachePath, resourceName=self.__resourceName, numProc=self.__numProc, chunkSize=self.__chunkSize, documentLimit=None, verbose=self.__verbose, readBackCheck=self.__readBackCheck, ) # databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName) collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName) ok = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None) return ok
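# Worked example for __modelFixture() above (filename hypothetical): for
# "AF-Q9XYZ1-F1-model_v2.cif.gz", fn.split("-")[1] gives uId = "Q9XYZ1", and the
# slices uId[-6:-4], uId[-4:-2], uId[-2:] give h1 = "Q9", h2 = "XY", h3 = "Z1",
# so the file is staged under CACHE/computed-models/Q9/XY/Z1/AF-Q9XYZ1-F1-model_v2.cif.gz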
class EntityPolymerExtractorTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(EntityPolymerExtractorTests, self).__init__(methodName) self.__verbose = True
def setUp(self): # # self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml") # configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath) # self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__workPath = os.path.join(HERE, "test-output") self.__taxonomyDataPath = os.path.join( self.__cachePath, self.__cfgOb.get("NCBI_TAXONOMY_CACHE_DIR", sectionName=configName)) # self.__cacheKwargs = {"fmt": "json", "indent": 3} self.__exdbCacheDirPath = os.path.join( self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName)) # self.__mU = MarshalUtil() self.__entryLimitTest = 18 # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def tearDown(self): endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
def testExtractEntityPolymers(self): """Test case - extract entity polymer info""" try: epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs, entryLimit=self.__entryLimitTest) eCount = epe.getEntryCount() self.assertGreaterEqual(eCount, self.__entryLimitTest) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
def testAccessEntityPolymerFeatures(self): """Test case - access cached entity polymer info from test cache""" try: epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs) eCount = epe.getEntryCount() logger.info("Entry count %d", eCount) self.assertGreaterEqual(eCount, self.__entryLimitTest) # unpL = epe.getRefSeqAccessions("UNP") logger.info("Ref seq count %d", len(unpL)) self.assertGreaterEqual(len(unpL), 1) # for entryId in ["3RER"]: for entityId in ["1"]: uL = epe.getEntityRefSeqAccessions("UNP", entryId, entityId) logger.info("UNP for %s %s %r", entryId, entityId, uL) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
def testTaxonomyReadCache(self): """Test case - access cached entity polymer info from test cache""" try: epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs) logger.info("Cache entry count %d", epe.getEntryCount()) # obsL = [] tD = epe.getOrigTaxons() logger.info("Taxons %d", len(tD)) tU = TaxonomyProvider(taxDirPath=self.__taxonomyDataPath, useCache=True) # for entryId, taxIdL in tD.items(): for entityId, iTaxId in taxIdL: # logger.info("entryId %r entityId %r taxId %r" % (entryId, entityId, taxId)) mTaxId = tU.getMergedTaxId(iTaxId) if iTaxId != mTaxId: obsL.append({ "entryId": entryId, "entityId": entityId, "taxId": iTaxId, "replaceTaxId": mTaxId }) logger.info("Obsolete list length %d", len(obsL)) self.__mU.doExport(os.path.join(self.__workPath, "obsolete-taxons.json"), obsL, fmt="json", indent=3) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
def testAccessEntityPolymerReadCache(self): """Test case - access cached entity polymer info from test cache""" try: epe = EntityPolymerExtractor(self.__cfgOb, exdbDirPath=self.__exdbCacheDirPath, useCache=False, cacheKwargs=self.__cacheKwargs) logger.info("Cache entry count %d", epe.getEntryCount()) cD = epe.countRefSeqAccessions("UNP") self.assertGreaterEqual(len(cD), 2) logger.info("UNP reference sequences per entity %r", dict(sorted(cD.items()))) logger.info("Reference sequences per entity %r", dict(sorted(epe.countRefSeqAccessionAny().items()))) logger.info("Reference sequences per ref db %r", dict(sorted(epe.countRefSeqAccessionDbType().items()))) # ok = epe.checkRefSeqAlignRange("UNP") self.assertTrue(ok) unpL = epe.getRefSeqAccessions("UNP") logger.info("Unique UNP reference sequences %d", len(unpL)) self.assertTrue(ok) tD = epe.getUniqueTaxons() logger.info("Unique taxons %d", len(tD)) tD = epe.countRefSeqAccessionByTaxon("UNP") logger.info("Unique taxons %d", len(tD)) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
class ReferenceSequenceUtilsTests(unittest.TestCase): def __init__(self, methodName="runTest"): super(ReferenceSequenceUtilsTests, self).__init__(methodName) self.__verbose = True def setUp(self): # self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config", "dbload-setup-example.yml") # configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath) # self.__cachePath = os.path.join(TOPDIR, "CACHE") # self.__cacheKwargs = {"fmt": "json", "indent": 3} self.__exdbCacheDirPath = os.path.join( self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName)) # # Reference sequence test data cache - # self.__refDbCachePath = os.path.join(HERE, "test-output", "unp-data-test-cache.json") self.__cacheKwargs = {"fmt": "json", "indent": 3} self.__useCache = False self.__fetchLimit = None # # Entity polymer extracted data ... # self.__entryLimit = 500 # self.__mU = MarshalUtil() # self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime())) def tearDown(self): endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime) def testFetchUnp(self): """Test case - extract entity polymer info -""" try: refDbName = "UNP" rsu = ReferenceSequenceUtils( self.__cfgOb, refDbName, exdbDirPath=self.__exdbCacheDirPath, cacheKwargs=self.__cacheKwargs, useCache=self.__useCache, entryLimit=self.__entryLimit, fetchLimit=self.__fetchLimit, ) numPrimary, numSecondary, numNone = rsu.getReferenceAccessionAlignSummary( ) self.assertGreaterEqual(numPrimary, 70) logger.info("For %r matched primary: %d secondary: %d none %d", refDbName, numPrimary, numSecondary, numNone) # except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
class SchemaDataPrepValidateTests(unittest.TestCase): def setUp(self): self.__numProc = 2 # self.__fileLimit = 200 self.__fileLimit = None self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml") configName = "site_info_configuration" self.__configName = configName self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath) self.__mU = MarshalUtil(workPath=self.__cachePath)
#self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True) self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath) # self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName) # self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs" self.__verbose = True # self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName) self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files") self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files") self.__export = True #
#self.__extraOpts = None
# The following for extended parent/child info -
self.__extraOpts = 'addParentRefs|addPrimaryKey' #
self.__alldatabaseNameD = { "ihm_dev": ["ihm_dev"], "pdbx": ["pdbx", "pdbx_ext"], "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"], "bird": ["bird"], "bird_family": ["family"], "chem_comp": ["chem_comp"], "bird_chem_comp": ["bird_chem_comp"], "bird_chem_comp_core": ["bird_chem_comp_core"], }
self.__databaseNameD = { "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"], "bird_chem_comp_core": ["bird_chem_comp_core"], } self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
# self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
# self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
# self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
# self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def tearDown(self): endTime = time.time() logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
def testValidateOptsRepo(self):
# schemaLevel = "min"
schemaLevel = "full"
inputPathList = None eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount) self.assertLessEqual(eCount, 1)
@unittest.skip("Disable troubleshooting test")
def testValidateOptsList(self): schemaLevel = "min" inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list") # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
if not inputPathList: return True
databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]} for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)): if ii < 5: continue eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
# self.assertGreaterEqual(eCount, 20)
#@unittest.skip("Disable IHM troubleshooting test")
def testValidateOptsIhmRepo(self): schemaLevel = "min" inputPathList = None self.__export = True
databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
databaseNameD = {"ihm_dev": ["ihm_dev"]}
eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
# self.assertGreaterEqual(eCount, 20) #
#@unittest.skip("Disable IHM troubleshooting test")
def testValidateOptsIhmList(self):
#schemaLevel = "full"
schemaLevel = "min"
inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif") if not inputPathList: return True
#databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
databaseNameD = {"ihm_dev": ["ihm_dev"]}
eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD) logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
# self.assertGreaterEqual(eCount, 20) #
def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None): #
eCount = 0
for databaseName in databaseNameD: mergeContentTypes = mergeContentTypeD[databaseName] if databaseName in mergeContentTypeD else None _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True) pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes) for collectionName in databaseNameD[databaseName]: cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts) # dL, cnL = self.__testPrepDocumentsFromContainers( pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes ) # Raises exceptions for schema compliance.
try: Draft4Validator.check_schema(cD) except Exception as e: logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e)) # valInfo = Draft4Validator(cD, format_checker=FormatChecker()) logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName) for ii, dD in enumerate(dL): logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii) try: cCount = 0 #for error in sorted(valInfo.iter_errors(dD), key=str): # logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message) # logger.debug("Failing document %d : %r", ii, list(dD.items())) # eCount += 1 # cCount += 1 #if cCount > 0: # logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount) except Exception as e: logger.exception("Validation processing error %s", str(e)) return eCount def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None): """Test case - create loadable PDBx data from repository files """ try: sd, _, _, _ = self.__schP.getSchemaInfo(databaseName) # dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False) dictApi = dP.getApiByName(databaseName) rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST") dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP) # dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow) sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose) containerList = self.__rpP.getContainerList(inputPathList) for container in containerList: cName = container.getName() logger.debug("Processing container %s", cName) dmh.apply(container) if self.__export: savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif") #self.__mU.doExport(savePath, [container], fmt="mmcif") # tableIdExcludeList = sd.getCollectionExcluded(collectionName) tableIdIncludeList = sd.getCollectionSelected(collectionName) sliceFilter = sd.getCollectionSliceFilter(collectionName) sdp.setSchemaIdExcludeList(tableIdExcludeList) sdp.setSchemaIdIncludeList(tableIdIncludeList) # docList, containerNameList, _ = sdp.processDocuments( containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName ) docList = sdp.addDocumentPrivateAttributes(docList, collectionName) docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName) # mergeS = "-".join(mergeContentTypes) if mergeContentTypes else "" if self.__export and docList: # for ii, doc in enumerate(docList[:1]): for ii, doc in enumerate(docList): cn = containerNameList[ii] fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS)) self.__mU.doExport(fp, [doc], fmt="json", indent=3) logger.debug("Exported %r", fp) # return docList, containerNameList except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
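# Illustrative aside: testValidateOptsList() above walks the failed-path list in
# fixed-size chunks via chunkList(). Assuming the conventional semantics of that
# helper (split a sequence into consecutive chunks of at most n items), a minimal
# equivalent is:
def _exampleChunkList(seqList, chunkSize):
    return [seqList[ii:ii + chunkSize] for ii in range(0, len(seqList), chunkSize)]
# _exampleChunkList(list(range(5)), 2) -> [[0, 1], [2, 3], [4]]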
class PdbxLoaderTests(unittest.TestCase): loadLocal = False loadModels = True
def __init__(self, methodName="runTest"): super(PdbxLoaderTests, self).__init__(methodName) self.__verbose = True
def setUp(self): # # self.__isMac = platform.system() == "Darwin" self.__excludeType = None if self.__isMac else "optional" self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml") configName = "site_info_configuration" self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath) # self.__resourceName = "MONGO_DB" self.__failedFilePath = os.path.join(HERE, "test-output", "failed-list.txt") self.__cachePath = os.path.join(TOPDIR, "CACHE") self.__readBackCheck = True self.__numProc = 2 self.__chunkSize = 10 self.__fileLimit = None self.__documentStyle = "rowwise_by_name_with_cardinality"
self.__ldList = [
# {"databaseName": "chem_comp_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "min"},
{ "databaseName": "bird_chem_comp_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "full", "updateSchemaOnReplace": False, "status": True, },
{ "databaseName": "bird_chem_comp_core", "collectionNameList": None, "loadType": "replace", "mergeContentTypes": None, "validationLevel": "full", "updateSchemaOnReplace": True, "status": True, },
{ "databaseName": "pdbx_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": ["vrpt"], "validationLevel": "full", "updateSchemaOnReplace": False, "status": True, },
]
self.__ldModelList = [ { "databaseName": "pdbx_comp_model_core", "collectionNameList": None, "loadType": "full", "mergeContentTypes": None, "validationLevel": "full", "updateSchemaOnReplace": False, "status": True, }, ] #
self.__startTime = time.time() logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
def tearDown(self): unitS = "MB" if platform.system() == "Darwin" else "GB" rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6, unitS) endTime = time.time() logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)
def __modelFixture(self): fU = FileUtil() modelSourcePath = os.path.join(self.__mockTopPath, "AF") for iPath in glob.iglob(os.path.join(modelSourcePath, "*.cif.gz")): fn = os.path.basename(iPath) uId = fn.split("-")[1] h3 = uId[-2:] h2 = uId[-4:-2] h1 = uId[-6:-4] oPath = os.path.join(self.__cachePath, "computed-models", h1, h2, h3, fn) fU.put(iPath, oPath)
@unittest.skipUnless(loadLocal, "Skip local load test")
def testPdbxLoader(self): for ld in self.__ldList: self.__pdbxLoaderWrapper(**ld)
@unittest.skipUnless(loadModels, "Skip model load test")
def testPdbxCompModelLoader(self): self.__modelFixture() # Comment out for manual testing for ld in self.__ldModelList: self.__pdbxLoaderWrapper(**ld)
def __pdbxLoaderWrapper(self, **kwargs): """Wrapper for PDBx loader module""" try: logger.info("Loading %s", kwargs["databaseName"]) mw = PdbxLoader( self.__cfgOb, cachePath=self.__cachePath, resourceName=self.__resourceName, numProc=self.__numProc, chunkSize=self.__chunkSize, fileLimit=None, verbose=self.__verbose, readBackCheck=self.__readBackCheck, maxStepLength=2000, useSchemaCache=True,
# rebuildSchemaFlag=False,  # This doesn't work for testing, likely because it copies old schema files from the remote repo and uses those
rebuildSchemaFlag=True,
) ok = mw.load( kwargs["databaseName"], collectionLoadList=kwargs["collectionNameList"], loadType=kwargs["loadType"], inputPathList=None, inputIdCodeList=None, styleType=self.__documentStyle, dataSelectors=["PUBLIC_RELEASE"], failedFilePath=self.__failedFilePath, saveInputFileListPath=None, pruneDocumentSize=None, logSize=False, validationLevel=kwargs["validationLevel"], mergeContentTypes=kwargs["mergeContentTypes"], useNameFlag=False, updateSchemaOnReplace=kwargs["updateSchemaOnReplace"], restoreUseStash=False, restoreUseGit=True, providerTypeExclude=self.__excludeType, ) self.assertEqual(ok, kwargs["status"]) ok = self.__loadStatus(mw.getLoadStatus()) self.assertTrue(ok) except Exception as e: logger.exception("Failing with %s", str(e)) self.fail()
def __loadStatus(self, statusList): sectionName = "data_exchange_configuration" dl = DocumentLoader( self.__cfgOb, self.__cachePath, resourceName=self.__resourceName, numProc=self.__numProc, chunkSize=self.__chunkSize, documentLimit=None, verbose=self.__verbose, readBackCheck=self.__readBackCheck, ) # databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName) collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName) ok = dl.load( databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None) return ok
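# Conventional direct-execution hook (sketch; the original modules typically build
# an explicit unittest.TestSuite, and plain unittest.main() is a minimal stand-in
# for running the tests in this file directly):
if __name__ == "__main__":
    unittest.main()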