def testReadYamlConfigWithAutoAppend(self):
    try:
        cfgOb = ConfigUtil(
            configPath=self.__inpPathConfigAutoYaml,
            configFormat="yaml",
            mockTopPath=self.__mockTopPath,
            defaultSectionName="site_info_1",
            cachePath=None,
            useCache=False,
        )
        ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml, configFormat="yaml")
        self.assertTrue(ok)
        #
        for sName in ["section_appended_1", "section_appended_2", "Section1", "Section2"]:
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX"))
        #
        #
        cfgOb = ConfigUtil(
            configPath=self.__inpPathConfigAutoYaml,
            configFormat="yaml",
            mockTopPath=self.__mockTopPath,
            defaultSectionName="site_info_1",
            cachePath=None,
            useCache=True,
        )
        ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml, configFormat="yaml")
        self.assertTrue(ok)
        #
        for sName in ["section_appended_1", "section_appended_2", "Section1", "Section2"]:
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX"))
        #
    except Exception as e:
        logger.error("Failing with %s", str(e))
        self.fail()
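# Illustrative sketch (not part of the test suite): the append pattern exercised
# above reduced to its essentials. File names are placeholders and the import
# path assumes the usual rcsb.utils.config layout; the calls themselves
# (ConfigUtil, appendConfig, getPath) are the same ones used in the test.
from rcsb.utils.config import ConfigUtil

cfgOb = ConfigUtil(configPath="base-config.yml", configFormat="yaml", defaultSectionName="site_info_1")
ok = cfgOb.appendConfig("extra-config.yml", configFormat="yaml")
# After the append, options from both files resolve through the same accessors:
pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName="section_appended_1")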
def setUp(self):
    self.__verbose = True
    #
    mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
    pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
    #
    configName = "site_info_configuration"
    cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath)
    self.__pathClusterData = cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
    # sample data set
    self.__dataSetId = "2018_23"
    # self.__levels = ["100", "95", "90", "70", "50", "30"]
    self.__levels = ["95"]
    #
    self.__workPath = os.path.join(HERE, "test-output")
    self.__pathSaveStyleCif = os.path.join(HERE, "test-output", "cluster-data-cif.json")
    self.__pathSaveStyleDocSequence = os.path.join(HERE, "test-output", "cluster-data-doc-sequence.json")
    self.__pathSaveStyleDocCluster = os.path.join(HERE, "test-output", "cluster-data-doc-cluster.json")
    #
    self.__startTime = time.time()
    logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
class RepoHoldingsDataPrepTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2019_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__startTime = time.time()
        logger.info("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testProcessLegacyFiles(self):
        """Test loading and processing operations for repository holdings and status exchange data."""
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb, sandboxPath=self.__sandboxPath, cachePath=self.__cachePath)
            rL = rhdp.getHoldingsUpdateEntry(updateId=self.__updateId)
            logger.info("update data length %r", len(rL))
            self.assertGreaterEqual(len(rL), 10)
            #
            rL = rhdp.getHoldingsCurrentEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("holdings data length %r", len(rL))
            #
            rL = rhdp.getHoldingsUnreleasedEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("unreleased data length %r", len(rL))
            #
            rL = rhdp.getHoldingsRemovedEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("removed data length %r", len(rL))
            rL = rhdp.getHoldingsCombinedEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("combined data length %r", len(rL))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class DictionaryProviderTests(unittest.TestCase):
    def setUp(self):
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dirPath = os.path.join(self.__cachePath, "dictionaries")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        dictLocatorMap = self.__cfgOb.get("DICT_LOCATOR_CONFIG_MAP", sectionName=self.__contentInfoConfigName)
        schemaName = "pdbx_core"
        self.__dictLocators = [self.__cfgOb.getPath(configLocator, sectionName=self.__configName) for configLocator in dictLocatorMap[schemaName]]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testResourceCache(self):
        """Test case - generate and check dictionary artifact and API caches."""
        try:
            logger.debug("Dictionary locators %r", self.__dictLocators)
            dp = DictionaryApiProvider(dirPath=self.__dirPath, useCache=False)
            dApi = dp.getApi(self.__dictLocators)
            ok = dApi.testCache()
            self.assertTrue(ok)
            title = dApi.getDictionaryTitle()
            logger.debug("Title %r", title)
            self.assertEqual(title, "mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic")
            # revL = dApi.getDictionaryHistory()
            numRev = dApi.getDictionaryRevisionCount()
            logger.debug("Number of dictionary revisions (numRev) %r", numRev)
            self.assertGreater(numRev, 220)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
def testReadIniConfig(self):
    try:
        cfgOb = ConfigUtil(configPath=self.__inpPathConfigIni, mockTopPath=self.__dataPath)
        sName = "DEFAULT"
        pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
        pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
        #
        self.assertEqual(pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
        self.assertEqual(pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX"))
        pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName)
        pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName)
        self.assertEqual(pathBird, "MOCK_BIRD_REPO")
        self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX")
        sName = "Section1"
        #
        helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE", sectionName=sName)
        tv = helperMethod.echo("test_value")
        self.assertEqual(tv, "test_value")
        #
        tEnv = "TEST_ENV_VAR"
        tVal = "TEST_ENV_VAR_VALUE"
        os.environ[tEnv] = tVal
        eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName)
        self.assertEqual(tVal, eVal)
        #
    except Exception as e:
        logger.exception("Failing with %s", str(e))
        self.fail()
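# Illustrative sketch (hypothetical fixture, not part of the test suite): the
# INI layout implied by the assertions above. DEFAULT holds repository path
# fragments that getPath() resolves against mockTopPath, and Section1 names an
# environment variable that getEnvValue() dereferences.
import os
import tempfile

from rcsb.utils.config import ConfigUtil

iniText = (
    "[DEFAULT]\n"
    "BIRD_REPO_PATH = MOCK_BIRD_REPO\n"
    "PDBX_REPO_PATH = MOCK_PDBX_SANDBOX\n"
    "[Section1]\n"
    "ENV_OPTION_A = TEST_ENV_VAR\n"
)
with tempfile.NamedTemporaryFile("w", suffix=".cfg", delete=False) as fh:
    fh.write(iniText)
os.environ["TEST_ENV_VAR"] = "TEST_ENV_VAR_VALUE"
cfgOb = ConfigUtil(configPath=fh.name, mockTopPath="./mock-data")
print(cfgOb.get("BIRD_REPO_PATH", sectionName="DEFAULT"))         # MOCK_BIRD_REPO
print(cfgOb.getEnvValue("ENV_OPTION_A", sectionName="Section1"))  # TEST_ENV_VAR_VALUE
os.unlink(fh.name)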
class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        # self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        # self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = "addParentRefs|addPrimaryKey"
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        # schemaLevel = "min"
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = True
        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        # schemaLevel = "full"
        schemaLevel = "min"
        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return True
        # databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        #
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD[databaseName] if databaseName in mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # Raises exceptions for schema compliance.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        # for error in sorted(valInfo.iter_errors(dD), key=str):
                        #     logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                        #     logger.debug("Failing document %d : %r", ii, list(dD.items()))
                        #     eCount += 1
                        #     cCount += 1
                        # if cCount > 0:
                        #     logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s", str(e))
        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Test case - create loadable PDBx data from repository files"""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    # self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )
            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                # for ii, doc in enumerate(docList[:1]):
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
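# Illustrative sketch of the jsonschema pattern used by __testValidateOpts()
# above, with a toy schema and document: check_schema() raises if the schema
# itself is malformed, while iter_errors() yields each document-level violation
# without raising, so errors can be counted and logged as in the test.
from jsonschema import Draft4Validator, FormatChecker

schema = {"type": "object", "properties": {"id": {"type": "string"}}, "required": ["id"]}
Draft4Validator.check_schema(schema)
valInfo = Draft4Validator(schema, format_checker=FormatChecker())
eCount = 0
for error in sorted(valInfo.iter_errors({"id": 1}), key=str):
    print(error.path, error.message)  # deque(['id']) 1 is not of type 'string'
    eCount += 1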
class ContentDefinitionTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=self.__configName, mockTopPath=self.__mockTopPath)
        #
        self.__pathPdbxDictionaryFile = self.__cfgOb.getPath("PDBX_DICT_LOCATOR", sectionName=self.__configName)
        self.__pathRcsbDictionaryFile = self.__cfgOb.getPath("RCSB_DICT_LOCATOR", sectionName=self.__configName)
        self.__pathVrptDictionaryFile = self.__cfgOb.getPath("VRPT_DICT_LOCATOR", sectionName=self.__configName)
        self.__mU = MarshalUtil()
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testDefaults(self):
        """Test the default case of using only dictionary content."""
        try:
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile])
            ok = dictApi.testCache()
            self.assertTrue(ok)
            sdi = ContentDefinition(dictApi)
            nS = sdi.getSchemaNames()
            logger.debug("schema name length %d", len(nS))
            self.assertGreaterEqual(len(nS), 600)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testHelper(self):
        """Test the dictionary content supplemented by helper function"""
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile])
            sdi = ContentDefinition(dictApi, databaseName="chem_comp", contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)
            #
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 600)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testExtensionWithHelper(self):
        """Test the dictionary content supplemented by helper function"""
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile])
            sdi = ContentDefinition(dictApi, databaseName="pdbx_core", contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)
            #
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 650)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRepoWithHelper(self):
        """Test the dictionary content supplemented by helper function for auxiliary schema"""
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile, self.__pathVrptDictionaryFile])
            sdi = ContentDefinition(dictApi, databaseName="repository_holdings", contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)
            #
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 680)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class SchemaSearchContextsTests(unittest.TestCase):
    skipFlag = True

    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        #
        self.__pathPdbxDictionaryFile = self.__cfgOb.getPath("PDBX_DICT_LOCATOR", sectionName=configName)
        self.__pathRcsbDictionaryFile = self.__cfgOb.getPath("RCSB_DICT_LOCATOR", sectionName=configName)
        self.__pathVrptDictionaryFile = self.__cfgOb.getPath("VRPT_DICT_LOCATOR", sectionName=configName)
        #
        self.__mU = MarshalUtil()
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testSearchGroups(self):
        ok = self.__docHelper.checkSearchGroups()
        self.assertTrue(ok)

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testUnUsedIndexedItems(self):
        """Enumerate items that are indexed but have no search group assignments.

        collection_attribute_search_contexts
        """
        groupNameList = self.__docHelper.getSearchGroups()
        logger.info("Search groups (%d)", len(groupNameList))
        #
        nestedSearchableD = self.__assembleNestedCategorySearchables()
        nestedSearchableD.update(self.__assembleNestedSubCategorySearchables())
        #
        attribContextD = {}
        tD = self.__docHelper.getAllAttributeSearchContexts()
        for (catName, atName), contextL in tD.items():
            attribContextD.setdefault((catName, atName), []).extend([t[0] for t in contextL])
        logger.info("search context attribContextD %d", len(attribContextD))
        lookupD = {}
        # if (catName, atName) in nestedSearchableD:
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(groupName)
            # logger.info("")
            # logger.info("%s (%2d):", groupName, len(attributeTupList))
            for catName, atName in attributeTupList:
                lookupD.setdefault((catName, atName), []).append(groupName)
        #
        logger.info("Search group lookup len %d", len(lookupD))
        for (catName, atName), contextL in attribContextD.items():
            # logger.info("%s.%s contextL %r", catName, atName, contextL)
            if "full-text" in contextL:
                if (catName, atName) in lookupD or (catName, atName) in nestedSearchableD:
                    continue
                logger.info("%s.%s contextL %r", catName, atName, contextL)
        #
        return True

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testExpandSearchGroups(self):
        """Expand search groups and metadata content as these would be displayed in the RCSB search menu."""
        _, afD = self.__getContentFeatures()
        groupNameList = self.__docHelper.getSearchGroups()
        logger.info("Search groups (%d)", len(groupNameList))
        #
        nestedSearchableD = self.__assembleNestedCategorySearchables()
        nestedSearchableD.update(self.__assembleNestedSubCategorySearchables())
        #
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(groupName)
            logger.info("")
            logger.info("%s (%2d):", groupName, len(attributeTupList))
            # Get search context and brief descriptions -
            for catName, atName in attributeTupList:
                searchContextTupL = self.__docHelper.getSearchContexts(catName, atName)
                if not searchContextTupL:
                    logger.warning("Missing search context for %s.%s", catName, atName)
                descriptionText = self.__docHelper.getAttributeDescription(catName, atName, contextType="brief")
                if not descriptionText:
                    logger.warning("Missing brief description %s.%s", catName, atName)
                #
                fD = afD[catName][atName] if catName in afD and atName in afD[catName] else {}
                logger.debug("%s %s fD %r", catName, atName, fD)
                units = fD["UNITS"] if "UNITS" in fD else None
                #
                uS = ""
                if units:
                    uS = "(units=%s)" % units
                #
                nS = "(%s.%s)" % (catName, atName)
                if (catName, atName) in nestedSearchableD:
                    for dS in nestedSearchableD[(catName, atName)]:
                        logger.info("- %-55s: %s %s (%s)", dS, nS, uS, ",".join([tup[0] for tup in searchContextTupL]))
                else:
                    logger.info("- %-55s: %s %s (%s)", descriptionText, nS, uS, ",".join([tup[0] for tup in searchContextTupL]))
        return True

    def __assembleNestedCategorySearchables(self):
        """Assemble dictionary of searchable items in nested categories.

        Returns:
            (dict): {(category, atName): ["Materialized brief description", ...]}
        """
        # cfD, afD = self.__getContentFeatures()
        _, afD = self.__getContentFeatures()
        logger.info("")
        searchableCategoryD = {}
        groupNameList = self.__docHelper.getSearchGroups()
        logger.debug("Search group count (%d)", len(groupNameList))
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(groupName)
            for catName, atName in attributeTupList:
                searchableCategoryD.setdefault(catName, []).append(atName)
        logger.debug("Searchable category count (%d)", len(searchableCategoryD))
        #
        retD = {}
        for catName in searchableCategoryD:
            nestedContextDL = self.__docHelper.getNestedContexts(catName)
            if not nestedContextDL:
                # not nested skip
                continue
            elif len(nestedContextDL) > 1:
                logger.warning("Multiple nested contexts for category %s", catName)
            #
            for nestedContextD in nestedContextDL:
                contextPath = nestedContextD["FIRST_CONTEXT_PATH"] if "FIRST_CONTEXT_PATH" in nestedContextD else None
                if not contextPath:
                    logger.warning("Missing context path for nested category %s", catName)
                    continue
                #
                contextName = nestedContextD["CONTEXT_NAME"]
                #
                cpCatName = contextPath.split(".")[0]
                cpAtName = contextPath.split(".")[1]
                nestedPathSearchContext = self.__docHelper.getSearchContexts(cpCatName, cpAtName)
                logger.debug("Nested (%r) context path for %r %r", contextName, cpCatName, cpAtName)
                if not nestedPathSearchContext:
                    logger.warning("Missing nested (%r) search context for %r %r", contextName, cpCatName, cpAtName)
                #
                nfD = afD[cpCatName][cpAtName] if cpCatName in afD and cpAtName in afD[cpCatName] else {}
                logger.debug("FeatureD %r", nfD)
                # --
                enumMapD = {}
                enumDL = nfD["ENUMS_ANNOTATED"]
                if not enumDL:
                    logger.warning("Missing nested enums %s.%s", cpCatName, cpAtName)
                else:
                    logger.debug("All context enums count %d", len(enumDL))
                    for enumD in enumDL:
                        logger.info("%s.%s enumD %r", cpCatName, cpAtName, enumD)
                        if "name" not in enumD:
                            logger.warning("Missing nested enum (name) for %s.%s", cpCatName, cpAtName)
                    #
                    enumMapD = {enumD["value"]: enumD["name"] if "name" in enumD else enumD["detail"] for enumD in enumDL}
                # --
                nestedDescriptionText = self.__docHelper.getAttributeDescription(cpCatName, cpAtName, contextType="brief")
                if not nestedDescriptionText:
                    logger.warning("Missing brief nested description %s.%s", cpCatName, cpAtName)
                else:
                    logger.debug("Nested context description: %r", nestedDescriptionText)
                # --
                cvDL = nestedContextD["CONTEXT_ATTRIBUTE_VALUES"] if "CONTEXT_ATTRIBUTE_VALUES" in nestedContextD else []
                if not cvDL:
                    logger.warning("Missing context attribute values for %s", catName)
                    # if no context values defined then use: all enums x searchable attributes in this category
                    #
                    # Template: enum detail + search attribute brief description text
                    for enumD in enumDL:
                        for atName in searchableCategoryD[catName]:
                            briefDescr = self.__docHelper.getAttributeDescription(catName, atName, contextType="brief")
                            # subCategories = nfD["SUB_CATEGORIES"] if "SUB_CATEGORIES" in nfD else None
                            tS = enumD["detail"] + " " + briefDescr
                            retD.setdefault((catName, atName), []).append(tS)
                else:
                    # Only use context values from the full enum list with specified search paths.
                    #
                    # Template: context value (enum detail) + search path attribute (brief description text)
                    # cVDL.append({"CONTEXT_VALUE": tD["CONTEXT_VALUE"], "SEARCH_PATHS": tD["SEARCH_PATHS"]})
                    #
                    for cvD in cvDL:
                        enumV = cvD["CONTEXT_VALUE"]
                        enumDetail = enumMapD[enumV] if enumV in enumMapD else None
                        if not enumDetail:
                            logger.warning("%s %s missing detail for enum value %s", catName, cpAtName, enumV)
                        for sp in cvD["SEARCH_PATHS"]:
                            if sp.count(".") > 1:
                                k = sp.rfind(".")
                                sp = sp[:k] + "_" + sp[k + 1:]
                            cnS = sp.split(".")[0]
                            anS = sp.split(".")[1]
                            briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                            tS = enumDetail + " " + briefDescr
                            logger.debug("%s,%s tS %r", cnS, anS, tS)
                            retD.setdefault((cnS, anS), []).append(tS)
                        for aD in cvD["ATTRIBUTES"]:
                            sp = aD["PATH"]
                            if sp.count(".") > 1:
                                k = sp.rfind(".")
                                sp = sp[:k] + "_" + sp[k + 1:]
                            cnS = sp.split(".")[0]
                            anS = sp.split(".")[1]
                            briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                            tS = enumDetail + " " + briefDescr
                            logger.debug("%s,%s tS %r", cnS, anS, tS)
                            retD.setdefault((cnS, anS), []).append(tS)
                            exL = aD["EXAMPLES"]
                            logger.info("%s,%s sp %r examplesL %r", cnS, anS, sp, exL)
        #
        for k, vL in retD.items():
            for v in vL:
                logger.debug("%s : %r", k, v)
        #
        return retD

    def __assembleNestedSubCategorySearchables(self):
        """Assemble dictionary of searchable items in nested subcategories.

        Returns:
            (dict): {(category, atName): ["Materialized brief description", ...]}
        """
        _, afD = self.__getContentFeatures()
        #
        logger.info("")
        searchableCategoryD = {}
        groupNameList = self.__docHelper.getSearchGroups()
        logger.debug("Search group count (%d)", len(groupNameList))
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(groupName)
            for catName, atName in attributeTupList:
                searchableCategoryD.setdefault(catName, []).append(atName)
        logger.debug("Searchable category count (%d)", len(searchableCategoryD))
        #
        subcatNestedD = {}
        tD = self.__docHelper.getAllSubCategoryNestedContexts()
        for k, v in tD.items():
            for kk, vv in v.items():
                if kk in subcatNestedD:
                    logger.warning("Duplicate nested subcategory specifications in %r %r", k, kk)
                # only take cases with a context path ...
                if "FIRST_CONTEXT_PATH" in vv:
                    subcatNestedD[kk[0]] = (kk[1], vv)  # cat = (subcat, {nested context dict})
        #
        retD = {}
        for catName in searchableCategoryD:
            if catName not in subcatNestedD:
                continue
            subCatName, nestedContextD = subcatNestedD[catName]
            #
            contextPath = nestedContextD["FIRST_CONTEXT_PATH"] if "FIRST_CONTEXT_PATH" in nestedContextD else None
            if not contextPath:
                logger.warning("Missing context path for nested category %s", catName)
                continue
            #
            if contextPath.count(".") > 1:
                k = contextPath.rfind(".")
                contextPath = contextPath[:k] + "_" + contextPath[k + 1:]
            logger.debug("%s subcategory %s context path %r", catName, subCatName, contextPath)
            contextName = nestedContextD["CONTEXT_NAME"]
            cpCatName = contextPath.split(".")[0]
            cpAtName = contextPath.split(".")[1]
            nestedPathSearchContext = self.__docHelper.getSearchContexts(cpCatName, cpAtName)
            logger.debug("Nested (%r) context path for %r %r", contextName, cpCatName, cpAtName)
            if not nestedPathSearchContext:
                logger.warning("Missing nested (%r) search context for %r %r", contextName, cpCatName, cpAtName)
            #
            nfD = afD[cpCatName][cpAtName] if cpCatName in afD and cpAtName in afD[cpCatName] else {}
            logger.debug("FeatureD %r", nfD)
            # --
            enumMapD = {}
            enumDL = nfD["ENUMS_ANNOTATED"]
            if not enumDL:
                logger.warning("Missing nested enums %s.%s", cpCatName, cpAtName)
            else:
                logger.debug("All context enums count %d", len(enumDL))
                for enumD in enumDL:
                    if "name" not in enumD:
                        logger.warning("Missing nested enum (name) for %s.%s", cpCatName, cpAtName)
                #
                enumMapD = {enumD["value"]: enumD["name"] if "name" in enumD else enumD["detail"] for enumD in enumDL}
            # --
            nestedDescriptionText = self.__docHelper.getAttributeDescription(cpCatName, cpAtName, contextType="brief")
            if not nestedDescriptionText:
                logger.warning("Missing brief nested description %s.%s", cpCatName, cpAtName)
            else:
                logger.debug("Nested context description: %r", nestedDescriptionText)
            # --
            cvDL = nestedContextD["CONTEXT_ATTRIBUTE_VALUES"] if "CONTEXT_ATTRIBUTE_VALUES" in nestedContextD else []
            #
            if not cvDL:
                logger.warning("Missing context attribute values for %s", catName)
                # if no context values defined then use: all enums x searchable attributes in this category
                #
                # Template: enum detail + search attribute brief description text
                for enumD in enumDL:
                    for atName in searchableCategoryD[catName]:
                        nnfD = afD[catName][atName]
                        subCatL = [d["id"] for d in nnfD["SUB_CATEGORIES"]] if "SUB_CATEGORIES" in nnfD else None
                        logger.debug("%s.%s %s subCatL %r", catName, atName, subCatName, subCatL)
                        if subCatL and subCatName in subCatL:
                            briefDescr = self.__docHelper.getAttributeDescription(catName, atName, contextType="brief")
                            tS = enumD["detail"] + " " + briefDescr
                            retD.setdefault((catName, atName), []).append(tS)
            else:
                # Only use context values from the full enum list with specified search paths.
                #
                # Template: context value (enum detail) + search path attribute (brief description text)
                # cVDL.append({"CONTEXT_VALUE": tD["CONTEXT_VALUE"], "SEARCH_PATHS": tD["SEARCH_PATHS"]})
                #
                for cvD in cvDL:
                    enumV = cvD["CONTEXT_VALUE"]
                    enumDetail = enumMapD[enumV] if enumV in enumMapD else None
                    if not enumDetail:
                        logger.warning("%s %s missing detail for enum value %s", catName, cpAtName, enumV)
                    for sp in cvD["SEARCH_PATHS"]:
                        if sp.count(".") > 1:
                            k = sp.rfind(".")
                            sp = sp[:k] + "_" + sp[k + 1:]
                        cnS = sp.split(".")[0]
                        anS = sp.split(".")[1]
                        briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                        tS = enumDetail + " " + briefDescr
                        retD.setdefault((cnS, anS), []).append(tS)
                    for aD in cvD["ATTRIBUTES"]:
                        sp = aD["PATH"]
                        if sp.count(".") > 1:
                            k = sp.rfind(".")
                            sp = sp[:k] + "_" + sp[k + 1:]
                        cnS = sp.split(".")[0]
                        anS = sp.split(".")[1]
                        briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                        tS = enumDetail + " " + briefDescr
                        retD.setdefault((cnS, anS), []).append(tS)
                        exL = aD["EXAMPLES"]
                        logger.debug("%s,%s sp %r exL %r", cnS, anS, sp, exL)
        #
        for k, vL in retD.items():
            for v in vL:
                logger.debug("%s : %r", k, v)
        #
        return retD

    def __getContentFeatures(self):
        """Get category and attribute features"""
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile])
            # logger.info("units = %r", dictApi.getUnits("pdbx_nmr_spectrometer", "manufacturer"))
            sdi = ContentDefinition(dictApi, databaseName="pdbx_core", contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)
            #
            return cfD, afD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None, None
class RepoHoldingsDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsStrict(self):
        updateId = self.__updateId
        schemaLevel = "full"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Troubleshooting test")
    def testValidateOptsMin(self):
        updateId = self.__updateId
        schemaLevel = "min"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, schemaLevel="full"):
        schemaNames = ["repository_holdings"]
        collectionNames = {
            "repository_holdings": [
                "repository_holdings_update_entry",
                "repository_holdings_current_entry",
                "repository_holdings_unreleased_entry",
                "repository_holdings_removed_entry",
                "repository_holdings_combined_entry",
            ],
            "entity_sequence_clusters": ["cluster_members", "cluster_provenance", "entity_members"],
        }
        #
        eCount = 0
        for schemaName in schemaNames:
            for collectionName in collectionNames[schemaName]:
                _ = self.__schP.makeSchemaDef(schemaName, dataTyping="ANY", saveSchema=True)
                cD = self.__schP.makeSchema(schemaName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
                dL = self.__getRepositoryHoldingsDocuments(schemaName, collectionName, updateId)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", collectionName + ".json")
                    self.__mU.doExport(savePath, dL, fmt="json", indent=3)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", schemaName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s path %s error: %s", schemaName, collectionName, error.path, error.message)
                            logger.info(">>>")
                            logger.info(">>> failing object is %r", dD)
                            logger.info(">>>")
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d", schemaName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))
        return eCount

    def __getRepositoryHoldingsDocuments(self, schemaName, collectionName, updateId):
        """Test loading and processing operations for legacy holdings and status data."""
        rL = []
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb, sandboxPath=self.__sandboxPath, workPath=self.__cachePath)
            if collectionName == "repository_holdings_update_entry":
                rL = rhdp.getHoldingsUpdateEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("update data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_current_entry":
                rL = rhdp.getHoldingsCurrentEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_unreleased_entry":
                rL = rhdp.getHoldingsUnreleasedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("unreleased data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_removed_entry":
                rL = rhdp.getHoldingsRemovedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("removed data length %r", len(rL))
            elif collectionName == "repository_holdings_combined_entry":
                rL = rhdp.getHoldingsCombinedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            #
        except Exception as e:
            logger.exception("%s %s failing with %s", schemaName, collectionName, str(e))
            self.fail()
        return rL
class ClusterDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        # self.__levels = ["100", "95", "90", "70", "50", "30"]
        self.__levels = ["100"]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsStrict(self):
        updateId = self.__updateId
        validationLevel = "full"
        eCount = self.__testValidateOpts(updateId, validationLevel=validationLevel)
        logger.info("Total validation errors validation level %s : %d", validationLevel, eCount)
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, validationLevel="full"):
        _ = updateId
        databaseNames = ["sequence_clusters"]
        collectionNames = {"sequence_clusters": ["cluster_provenance", "cluster_members", "entity_members"]}
        #
        eCount = 0
        for databaseName in databaseNames:
            for collectionName in collectionNames[databaseName]:
                _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=validationLevel, saveSchema=True)
                #
                dL = self.__getSequenceClusterData(collectionName, levels=self.__levels, dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for _, dD in enumerate(dL):
                    # logger.debug("Schema %s collection %s document %d" % (schemaName, collectionName, ii))
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message)
                            logger.info(">>> failing object is %r", dD)
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d", databaseName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))
        return eCount

    def __fetchProvenance(self):
        """Test case for fetching provenance dictionary content."""
        try:
            provKeyName = "rcsb_entity_sequence_cluster_prov"
            provU = ProvenanceProvider(self.__cfgOb, self.__cachePath, useCache=True)
            pD = provU.fetch()
            return pD[provKeyName] if provKeyName in pD else {}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __getSequenceClusterData(self, collectionName, dataSetId=None, dataLocator=None, levels=None):
        """Test extraction on an example sequence cluster data set."""
        try:
            #
            if collectionName == "cluster_provenance":
                return [self.__fetchProvenance()]
            #
            entitySchemaName = "rcsb_entity_sequence_cluster_list"
            clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list"
            cdp = ClusterDataPrep(workPath=self.__cachePath, entitySchemaName=entitySchemaName, clusterSchemaName=clusterSchemaName)
            cifD, docBySequenceD, docByClusterD = cdp.extract(dataSetId, clusterSetLocator=dataLocator, levels=levels, clusterType="entity")
            self.assertEqual(len(cifD), 1)
            self.assertEqual(len(docBySequenceD), 1)
            self.assertEqual(len(docByClusterD), 1)
            if collectionName == "entity_members":
                return docBySequenceD[entitySchemaName]
            elif collectionName == "cluster_members":
                return docByClusterD[clusterSchemaName]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return None
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument("--full", default=True, action="store_true", help="Fresh full load in a new tables/collections (Default)")
    #
    parser.add_argument("--etl_entity_sequence_clusters", default=False, action="store_true", help="ETL entity sequence clusters")
    parser.add_argument("--etl_repository_holdings", default=False, action="store_true", help="ETL repository holdings")
    # parser.add_argument("--etl_chemref", default=False, action="store_true", help="ETL integrated chemical reference data")
    # parser.add_argument("--etl_tree_node_lists", default=False, action="store_true", help="ETL tree node lists")
    parser.add_argument("--data_set_id", default=None, help="Data set identifier (default=2018_14 for current week)")
    #
    parser.add_argument("--sequence_cluster_data_path", default=None, help="Sequence cluster data path (default set by configuration)")
    parser.add_argument("--sandbox_data_path", default=None, help="Data exchange sandbox data path (default set by configuration)")
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    parser.add_argument("--db_type", default="mongo", help="Database server type (default=mongo)")
    # parser.add_argument("--document_style", default="rowwise_by_name_with_cardinality",
    #                     help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name)")
    parser.add_argument("--read_back_check", default=False, action="store_true", help="Perform read back check on all documents")
    #
    parser.add_argument("--num_proc", default=2, help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size", default=10, help="Number of files loaded per process")
    parser.add_argument("--document_limit", default=None, help="Load document limit for testing")
    parser.add_argument("--prune_document_size", default=None, help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path", default=None, help="Path containing cache directories")
    # parser.add_argument("--use_cache", default=False, action="store_true", help="Use cache files from remote resources")
    parser.add_argument("--rebuild_cache", default=False, action="store_true", help="Rebuild cached resource files")
    # parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
    #
    #
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    # useCache = args.use_cache
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuration path %s (%s)", configPath, configName)
        else:
            logger.error("Missing or access issue with config file %r", configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
        #
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        exit(1)
    #
    try:
        readBackCheck = args.read_back_check
        tU = TimeUtil()
        dataSetId = args.data_set_id if args.data_set_id else tU.getCurrentWeekSignature()
        seqDataLocator = args.sequence_cluster_data_path if args.sequence_cluster_data_path else cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        sandboxPath = args.sandbox_data_path if args.sandbox_data_path else cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        documentLimit = int(args.document_limit) if args.document_limit else None
        loadType = "full" if args.full else "replace"
        # loadType = "replace" if args.replace else "full"
        cachePath = args.cache_path if args.cache_path else "."
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        # rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        #
        # if args.document_style not in ["rowwise_by_name", "rowwise_by_name_with_cardinality", "columnwise_by_name", "rowwise_by_id", "rowwise_no_name"]:
        #     logger.error("Unsupported document style %s" % args.document_style)
        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #  Rebuild or check resource cache
    ok = buildResourceCache(cfgOb, configName, cachePath, rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r", rebuildCache, cachePath)
        exit(1)
    ##
    if args.db_type == "mongo":
        if args.etl_entity_sequence_clusters:
            cw = SequenceClustersEtlWorker(cfgOb, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck, workPath=cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = loadStatus(cw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
        if args.etl_repository_holdings:
            rhw = RepoHoldingsEtlWorker(cfgOb, sandboxPath, cachePath, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=debugFlag, readBackCheck=readBackCheck)
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = loadStatus(rhw.getLoadStatus(), cfgOb, cachePath, readBackCheck=readBackCheck)
    logger.info("Operation completed with status %r", ok and okS)
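# Illustrative sketch (the entry-point name is an assumption): main() is driven
# by the argparse flags defined above, so a repository-holdings ETL against the
# mock configuration could be exercised programmatically like this.
import sys

sys.argv = [
    "etl_exec.py",  # hypothetical script name
    "--etl_repository_holdings",
    "--mock",
    "--config_path", "./exdb-config-example.yml",
    "--cache_path", "./CACHE",
]
main()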
class RepoLoadWorkflow(object):
    def __init__(self, **kwargs):
        #  Configuration Details
        configPath = kwargs.get("configPath", "exdb-config-example.yml")
        self.__configName = kwargs.get("configName", "site_info_configuration")
        mockTopPath = kwargs.get("mockTopPath", None)
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=self.__configName, mockTopPath=mockTopPath)
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__cachePath = os.path.abspath(self.__cachePath)
        self.__debugFlag = kwargs.get("debugFlag", False)
        if self.__debugFlag:
            logger.setLevel(logging.DEBUG)
        #
        #  Rebuild or check resource cache
        # rebuildCache = kwargs.get("rebuildCache", False)
        # self.__cacheStatus = self.buildResourceCache(rebuildCache=rebuildCache)
        # logger.debug("Cache status if %r", self.__cacheStatus)
        #

    def load(self, op, **kwargs):
        # if not self.__cacheStatus:
        #     logger.error("Resource cache test or rebuild has failed - exiting")
        #     return False
        # argument processing
        if op not in ["pdbx-loader", "etl-repository-holdings", "etl-entity-sequence-clusters"]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            fileLimit = int(kwargs.get("fileLimit")) if "fileLimit" in kwargs else None
            documentLimit = int(kwargs.get("documentLimit")) if "documentLimit" in kwargs else None
            failedFilePath = kwargs.get("failFileListPath", None)
            loadFileListPath = kwargs.get("loadFileListPath", None)
            saveInputFileListPath = kwargs.get("saveFileListPath", None)
            schemaLevel = kwargs.get("schemaLevel", "min") if kwargs.get("schemaLevel") in ["min", "full"] else "min"
            loadType = kwargs.get("loadType", "full")  # or replace
            updateSchemaOnReplace = kwargs.get("updateSchemaOnReplace", True)
            pruneDocumentSize = float(kwargs.get("pruneDocumentSize")) if "pruneDocumentSize" in kwargs else None
            # "Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name",
            documentStyle = kwargs.get("documentStyle", "rowwise_by_name_with_cardinality")
            dbType = kwargs.get("dbType", "mongo")
            #
            databaseName = kwargs.get("databaseName", None)
            databaseNameList = self.__cfgOb.get("DATABASE_NAMES_ALL", sectionName="database_catalog_configuration").split(",")
            collectionNameList = kwargs.get("collectionNameList", None)
            mergeValidationReports = kwargs.get("mergeValidationReports", True)
            #
            tU = TimeUtil()
            dataSetId = kwargs.get("dataSetId") if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            seqDataLocator = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=self.__configName)
            sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=self.__configName)
        except Exception as e:
            logger.exception("Argument and configuration processing failing with %s", str(e))
            return False
        #
        if op == "pdbx-loader" and dbType == "mongo" and databaseName in databaseNameList:
            okS = True
            try:
                inputPathList = None
                if loadFileListPath:
                    mu = MarshalUtil(workPath=self.__cachePath)
                    inputPathList = mu.doImport(loadFileListPath, fmt="list")
                    if not inputPathList:
                        logger.error("Operation %r missing or empty input file path list %s - exiting", op, loadFileListPath)
                        return False
            except Exception as e:
                logger.exception("Operation %r processing input path list failing with %s", op, str(e))
                return False
            #
            try:
                mw = PdbxLoader(
                    self.__cfgOb,
                    self.__cachePath,
                    resourceName="MONGO_DB",
                    numProc=numProc,
                    chunkSize=chunkSize,
                    fileLimit=fileLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                )
                ok = mw.load(
                    databaseName,
                    collectionLoadList=collectionNameList,
                    loadType=loadType,
                    inputPathList=inputPathList,
                    styleType=documentStyle,
                    dataSelectors=["PUBLIC_RELEASE"],
                    failedFilePath=failedFilePath,
                    saveInputFileListPath=saveInputFileListPath,
                    pruneDocumentSize=pruneDocumentSize,
                    validationLevel=schemaLevel,
                    mergeContentTypes=["vrpt"] if mergeValidationReports else None,
                    updateSchemaOnReplace=updateSchemaOnReplace,
                )
                okS = self.loadStatus(mw.getLoadStatus(), readBackCheck=readBackCheck)
            except Exception as e:
                logger.exception("Operation %r database %r failing with %s", op, databaseName, str(e))
        elif op == "etl-entity-sequence-clusters" and dbType == "mongo":
            cw = SequenceClustersEtlWorker(self.__cfgOb, numProc=numProc, chunkSize=chunkSize, documentLimit=documentLimit, verbose=self.__debugFlag, readBackCheck=readBackCheck, workPath=self.__cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = self.loadStatus(cw.getLoadStatus(), readBackCheck=readBackCheck)
        elif op == "etl-repository-holdings" and dbType == "mongo":
            rhw = RepoHoldingsEtlWorker(
                self.__cfgOb,
                sandboxPath,
                self.__cachePath,
                numProc=numProc,
                chunkSize=chunkSize,
                documentLimit=documentLimit,
                verbose=self.__debugFlag,
                readBackCheck=readBackCheck,
            )
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = self.loadStatus(rhw.getLoadStatus(), readBackCheck=readBackCheck)
        logger.info("Completed operation %r with status %r", op, ok and okS)
        return ok and okS

    def loadStatus(self, statusList, readBackCheck=True):
        ret = False
        try:
            dl = DocumentLoader(self.__cfgOb, self.__cachePath, "MONGO_DB", numProc=1, chunkSize=2, documentLimit=None, verbose=False, readBackCheck=readBackCheck)
            #
            sectionName = "data_exchange_configuration"
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS", sectionName=sectionName)
            ret = dl.load(databaseName, collectionName, loadType="append", documentList=statusList, indexAttributeList=["update_id", "database_name", "object_name"], keyNames=None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def buildResourceCache(self, rebuildCache=False):
        """Generate and cache resource dependencies."""
        ret = False
        try:
            useCache = not rebuildCache
            logger.info("Cache setting useCache is %r", useCache)
            rp = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath)
            ret = rp.cacheResources(useCache=useCache)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret
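# Illustrative usage sketch (argument values are assumptions): construct the
# workflow with a configuration file and cache directory, then run one of the
# supported operations ("pdbx-loader", "etl-repository-holdings", or
# "etl-entity-sequence-clusters") with keyword options consumed by load().
rlWf = RepoLoadWorkflow(configPath="exdb-config-example.yml", configName="site_info_configuration", cachePath="./CACHE")
ok = rlWf.load("pdbx-loader", databaseName="pdbx_core", loadType="full", numProc=4, chunkSize=10, schemaLevel="min", readBackCheck=True)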
class RepoHoldingsRemoteLoaderTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(RepoHoldingsRemoteLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        self.__resourceName = "MONGO_DB"
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = None
        self.__filterType = "assign-dates"
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        # sample data set
        self.__updateId = "2021_36"
        #
        eiP = EntryInfoProvider(cachePath=self.__cachePath, useCache=True)
        ok = eiP.testCache(minCount=0)
        self.assertTrue(ok)
        ok = eiP.restore(self.__cfgOb, configName, useStash=False, useGit=True)
        self.assertTrue(ok)
        ok = eiP.reload()
        self.assertTrue(ok)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testLoadHoldingsRemote(self):
        """Test case - load legacy repository holdings and status data -

        [repository_holdings]
        DATABASE_NAME=repository_holdings
        DATABASE_VERSION_STRING=v5
        COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
        COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
        COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
        COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
        COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry
        """
        try:
            sectionName = "repository_holdings_configuration"
            rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath, filterType=self.__filterType)
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = self.__cfgOb.get("DATABASE_NAME", sectionName=sectionName)
            logger.info("databaseName %r", databaseName)
            addValues = None
            #
            maxDoc = 5
            dList = rhdp.getHoldingsRemovedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_REMOVED", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUnreleasedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UNRELEASED", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUpdateEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UPDATE", sectionName=sectionName)
            logger.info("collectionName %r", collectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCurrentEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_CURRENT", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCombinedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_COMBINED", sectionName=sectionName)
            ok = dl.load(databaseName, collectionName, loadType="full", documentList=dList, indexAttributeList=["update_id", "entry_id"], keyNames=None, addValues=addValues)
            logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
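
# Illustrative refactoring sketch only (not part of the source module): the five
# holdings loads in testLoadHoldingsRemote() above share one fetch/trim/load
# pattern, which could be factored as below.  The helper name and the maxDoc
# default are hypothetical; the dl.load() call mirrors the one used in the test,
# and logger is assumed from the surrounding module context.
def _exampleHoldingsCollectionLoad(dl, cfgOb, databaseName, collectionKey, dList, maxDoc=5):
    """Trim a holdings document list and load it into the configured collection."""
    sectionName = "repository_holdings_configuration"
    dList = dList[:maxDoc] if maxDoc else dList
    collectionName = cfgOb.get(collectionKey, sectionName=sectionName)
    ok = dl.load(
        databaseName,
        collectionName,
        loadType="full",
        documentList=dList,
        indexAttributeList=["update_id", "entry_id"],
        keyNames=None,
        addValues=None,
    )
    logger.info("Collection %r length %d load status %r", collectionName, len(dList), ok)
    return ok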
    def testReadYamlConfig(self):
        try:
            cfgOb = ConfigUtil(configPath=self.__inpPathConfigYaml, configFormat="yaml", mockTopPath=self.__mockTopPath)
            ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml, configFormat="yaml")
            self.assertTrue(ok)
            #
            sName = "DEFAULT"
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX"))
            pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName)
            self.assertEqual(pathBird, "MOCK_BIRD_REPO")
            self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX")
            sName = "Section1"
            #
            helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE", sectionName=sName)
            tv = helperMethod.echo("test_value")
            self.assertEqual(tv, "test_value")
            #
            tEnv = "TEST_ENV_VAR"
            tVal = "TEST_ENV_VAR_VALUE"
            os.environ[tEnv] = tVal
            eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName)
            self.assertEqual(tVal, eVal)

            ky = "42d13dfc9eb689e48c774aa5af8a7e15dbabcd5041939bef213eb37aed882fd6"
            os.environ["CONFIG_SUPPORT_TOKEN_ENV"] = ky
            #
            un = cfgOb.getSecret("SECRET_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("SECRET_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            un = cfgOb.get("_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.get("_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            un = cfgOb.getSecret("_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            sName = "Section2"
            un = cfgOb.getSecret("_TEST_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("_TEST_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            # test clear-text fallback --
            #   CLEAR_TEXT_USERNAME: testuser2
            #   CLEAR_TEXT_PASSWORD: changeme2
            un = cfgOb.get("_CLEAR_TEXT_USERNAME", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.get("_CLEAR_TEXT_PASSWORD", default=None, sectionName=sName, tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser2")
            self.assertEqual(pw, "changeme2")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
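
# Illustrative sketch only (not part of the source module): testReadYamlConfig()
# above exercises the "_"-prefixed option convention -- get()/getSecret() resolve
# an encrypted option using the token named by tokenName, and the test's fallback
# block shows the corresponding CLEAR_TEXT_* option being returned when no
# encrypted value is stored.  The token value below is hypothetical; the option,
# section, and token names mirror the test.
def _exampleSecretFallback(cfgOb):
    """Resolve a secret with clear-text fallback (hypothetical token value)."""
    os.environ["CONFIG_SUPPORT_TOKEN_ENV"] = "0" * 64  # hypothetical hex token
    # With no decryptable value, this resolves to Section2 CLEAR_TEXT_USERNAME.
    return cfgOb.get("_CLEAR_TEXT_USERNAME", default=None, sectionName="Section2", tokenName="CONFIG_SUPPORT_TOKEN")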
class SequenceClusterLoaderTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SequenceClusterLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName, mockTopPath=mockTopPath)
        # self.__cfgOb.dump()
        self.__resourceName = "MONGO_DB"
        self.__failedFilePath = os.path.join(HERE, "test-output", "failed-list.txt")
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = 1000
        #
        # sample data set
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        self.__levels = ["100", "95", "90", "70", "50", "30"]
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__pathSaveStyleCif = os.path.join(HERE, "test-output", "cluster-data-cif.json")
        self.__pathSaveStyleDocSequence = os.path.join(HERE, "test-output", "cluster-data-doc-sequence.json")
        self.__pathSaveStyleDocCluster = os.path.join(HERE, "test-output", "cluster-data-doc-cluster.json")
        #
        self.__entitySchemaName = "rcsb_entity_sequence_cluster_list"
        self.__clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list"
        self.__provKeyName = "rcsb_entity_sequence_cluster_prov"
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def __fetchProvenance(self):
        """Fetch provenance dictionary content for the configured provenance key."""
        try:
            provU = ProvenanceProvider(self.__cfgOb, self.__cachePath)
            pD = provU.fetch()
            return pD[self.__provKeyName] if self.__provKeyName in pD else {}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testExtract(self, dataSetId, dataLocator, levels):
        """Test extraction on an example sequence cluster data set."""
        try:
            cdp = ClusterDataPrep(workPath=self.__workPath, entitySchemaName=self.__entitySchemaName, clusterSchemaName=self.__clusterSchemaName)
            cifD, docBySequenceD, docByClusterD = cdp.extract(dataSetId, clusterSetLocator=dataLocator, levels=levels, clusterType="entity")
            self.assertEqual(len(cifD), 1)
            self.assertEqual(len(docBySequenceD), 1)
            self.assertEqual(len(docByClusterD), 1)
            return docBySequenceD, docByClusterD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadCluster(self):
        """Test case - load example sequence cluster document data."""
        try:
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            docBySequenceD, docByClusterD = self.__testExtract(dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData, levels=self.__levels)
            #
            dList = docBySequenceD[self.__entitySchemaName]
            ok = dl.load("sequence_clusters", "entity_members", loadType="full", documentList=dList, indexAttributeList=["data_set_id", "entry_id", "entity_id"], keyNames=None)
            self.assertTrue(ok)
            dList = docByClusterD[self.__clusterSchemaName]
            ok = dl.load("sequence_clusters", "cluster_members", loadType="full", documentList=dList, indexAttributeList=["data_set_id", "identity", "cluster_id"], keyNames=None)
            self.assertTrue(ok)
            pD = self.__fetchProvenance()
            ok = dl.load("sequence_clusters", "cluster_provenance", loadType="full", documentList=[pD], indexAttributeList=None, keyNames=None)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()