def testReadYamlConfigWithAutoAppend(self):
    try:
        cfgOb = ConfigUtil(
            configPath=self.__inpPathConfigAutoYaml,
            configFormat="yaml",
            mockTopPath=self.__mockTopPath,
            defaultSectionName="site_info_1",
            cachePath=None,
            useCache=False,
        )
        ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml, configFormat="yaml")
        self.assertTrue(ok)
        #
        for sName in ["section_appended_1", "section_appended_2", "Section1", "Section2"]:
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX"))
            #
        #
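        # Repeat the same checks with useCache=True to exercise the cached configuration path.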
        cfgOb = ConfigUtil(
            configPath=self.__inpPathConfigAutoYaml,
            configFormat="yaml",
            mockTopPath=self.__mockTopPath,
            defaultSectionName="site_info_1",
            cachePath=None,
            useCache=True,
        )
        ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml, configFormat="yaml")
        self.assertTrue(ok)
        #
        for sName in ["section_appended_1", "section_appended_2", "Section1", "Section2"]:
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(pathPdbx, os.path.join(self.__mockTopPath, "MOCK_PDBX_SANDBOX"))
            #
    except Exception as e:
        logger.error("Failing with %s", str(e))
        self.fail()

# Example 2

    def setUp(self):
        self.__verbose = True
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        #
        configName = "site_info_configuration"
        cfgOb = ConfigUtil(configPath=pathConfig,
                           defaultSectionName=configName,
                           mockTopPath=mockTopPath)
        self.__pathClusterData = cfgOb.getPath(
            "RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        # sample data set
        self.__dataSetId = "2018_23"

        # self.__levels = ["100", "95", "90", "70", "50", "30"]
        self.__levels = ["95"]
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__pathSaveStyleCif = os.path.join(HERE, "test-output",
                                               "cluster-data-cif.json")
        self.__pathSaveStyleDocSequence = os.path.join(
            HERE, "test-output", "cluster-data-doc-sequence.json")
        self.__pathSaveStyleDocCluster = os.path.join(
            HERE, "test-output", "cluster-data-doc-cluster.json")
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

# Example 3

class RepoHoldingsDataPrepTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2019_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        #
        self.__startTime = time.time()
        logger.info("Starting %s at %s", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testProcessLegacyFiles(self):
        """Test loading and processing operations for repository holdings and status echange data."""
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb,
                                        sandboxPath=self.__sandboxPath,
                                        cachePath=self.__cachePath)
            rL = rhdp.getHoldingsUpdateEntry(updateId=self.__updateId)
            logger.info("update data length %r", len(rL))
            self.assertGreaterEqual(len(rL), 10)
            #
            rL = rhdp.getHoldingsCurrentEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("holdings data length %r", len(rL))
            #
            rL = rhdp.getHoldingsUnreleasedEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("unreleased data length %r", len(rL))
            #
            rL = rhdp.getHoldingsRemovedEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("removed data length %r", len(rL))

            rL = rhdp.getHoldingsCombinedEntry(updateId=self.__updateId)
            self.assertGreaterEqual(len(rL), 10)
            logger.info("combined data length %r", len(rL))
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

# Example 4

class DictionaryProviderTests(unittest.TestCase):
    def setUp(self):
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dirPath = os.path.join(self.__cachePath, "dictionaries")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        dictLocatorMap = self.__cfgOb.get(
            "DICT_LOCATOR_CONFIG_MAP",
            sectionName=self.__contentInfoConfigName)
        schemaName = "pdbx_core"
        self.__dictLocators = [
            self.__cfgOb.getPath(configLocator, sectionName=self.__configName)
            for configLocator in dictLocatorMap[schemaName]
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testResourceCache(self):
        """Test case - generate and check dictonary artifact and api caches
        """
        try:
            logger.debug("Dictionary locators %r", self.__dictLocators)
            dp = DictionaryApiProvider(dirPath=self.__dirPath, useCache=False)
            dApi = dp.getApi(self.__dictLocators)
            ok = dApi.testCache()
            self.assertTrue(ok)
            title = dApi.getDictionaryTitle()
            logger.debug("Title %r", title)
            self.assertEqual(
                title, "mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic")
            # revL = dApi.getDictionaryHistory()
            numRev = dApi.getDictionaryRevisionCount()
            logger.debug("Number of dictionary revisions (numRev) %r", numRev)
            self.assertGreater(numRev, 220)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReadIniConfig(self):
        try:
            cfgOb = ConfigUtil(configPath=self.__inpPathConfigIni,
                               mockTopPath=self.__dataPath)
            sName = "DEFAULT"
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(
                pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(
                pathPdbx, os.path.join(self.__mockTopPath,
                                       "MOCK_PDBX_SANDBOX"))

            pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName)

            self.assertEqual(pathBird, "MOCK_BIRD_REPO")
            self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX")
            sName = "Section1"
            #
            helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE",
                                           sectionName=sName)

            tv = helperMethod.echo("test_value")
            self.assertEqual(tv, "test_value")
            #
            tEnv = "TEST_ENV_VAR"
            tVal = "TEST_ENV_VAR_VALUE"
            os.environ[tEnv] = tVal
            eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName)
            self.assertEqual(tVal, eVal)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()


class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        # self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        # self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = "addParentRefs|addPrimaryKey"
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        # schemaLevel = "min"

        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = True

        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        # schemaLevel = "full"
        schemaLevel = "min"

        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return True
        # databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        #
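        # For each database: rebuild the schema definition, resolve repository locators, then generate and check each collection's JSON schema against the prepared documents.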
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD[databaseName] if mergeContentTypeD and databaseName in mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # Raises exceptions for schema compliance.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        # for error in sorted(valInfo.iter_errors(dD), key=str):
                        #     logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                        #     logger.debug("Failing document %d : %r", ii, list(dD.items()))
                        #     eCount += 1
                        #     cCount += 1
                        # if cCount > 0:
                        #     logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s", str(e))

        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Test case -  create loadable PDBx data from repository files
        """
        try:

            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
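            # Build the transform factory and schema-driven data prep used to produce loadable documents.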
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    # self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
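            # Apply collection-specific category exclusions, inclusions, and slice filters.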
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )

            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                # for ii, doc in enumerate(docList[:1]):
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()


class ContentDefinitionTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=self.__configName, mockTopPath=self.__mockTopPath)
        #
        #
        self.__pathPdbxDictionaryFile = self.__cfgOb.getPath("PDBX_DICT_LOCATOR", sectionName=self.__configName)
        self.__pathRcsbDictionaryFile = self.__cfgOb.getPath("RCSB_DICT_LOCATOR", sectionName=self.__configName)
        self.__pathVrptDictionaryFile = self.__cfgOb.getPath("VRPT_DICT_LOCATOR", sectionName=self.__configName)

        self.__mU = MarshalUtil()
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=True)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testDefaults(self):
        """ Test the default case of using only dictionary content.
        """
        try:
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile])
            ok = dictApi.testCache()
            self.assertTrue(ok)
            sdi = ContentDefinition(dictApi)
            nS = sdi.getSchemaNames()
            logger.debug("schema name length %d", len(nS))
            self.assertGreaterEqual(len(nS), 600)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testHelper(self):
        """ Test the dictionary content supplemented by helper function

        """
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile])
            sdi = ContentDefinition(dictApi, databaseName="chem_comp", contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)

            #
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 600)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testExtensionWithHelper(self):
        """ Test the dictionary content supplemented by helper function

        """
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile])
            sdi = ContentDefinition(dictApi, databaseName="pdbx_core", contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)

            #
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 650)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRepoWithHelper(self):
        """ Test the dictionary content supplemented by helper function for auxiliary schema

        """
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile, self.__pathVrptDictionaryFile])
            sdi = ContentDefinition(dictApi, databaseName="repository_holdings", contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)

            #
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 680)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()


class SchemaSearchContextsTests(unittest.TestCase):
    skipFlag = True

    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        #
        self.__pathPdbxDictionaryFile = self.__cfgOb.getPath(
            "PDBX_DICT_LOCATOR", sectionName=configName)
        self.__pathRcsbDictionaryFile = self.__cfgOb.getPath(
            "RCSB_DICT_LOCATOR", sectionName=configName)
        self.__pathVrptDictionaryFile = self.__cfgOb.getPath(
            "VRPT_DICT_LOCATOR", sectionName=configName)

        # self.__mU = MarshalUtil()
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                                 self.__cachePath,
                                                 useCache=True)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testSearchGroups(self):
        ok = self.__docHelper.checkSearchGroups()
        self.assertTrue(ok)

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testUnUsedIndexedItems(self):
        """Enumerate items that are indexed by have no search group assignments.

        collection_attribute_search_contexts
        """

        groupNameList = self.__docHelper.getSearchGroups()
        logger.info("Search groups (%d)", len(groupNameList))
        #
        nestedSearchableD = self.__assembleNestedCategorySearchables()
        nestedSearchableD.update(self.__assembleNestedSubCategorySearchables())
        #
        attribContextD = {}
        tD = self.__docHelper.getAllAttributeSearchContexts()
        for (catName, atName), contextL in tD.items():
            attribContextD.setdefault((catName, atName),
                                      []).extend([t[0] for t in contextL])
        logger.info("search context attribContextD %d", len(attribContextD))

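        # Invert the search group definitions into a (category, attribute) -> [group, ...] lookup.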
        lookupD = {}
        # if (catName, atName) in nestedSearchableD:
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(
                groupName)
            # logger.info("")
            # logger.info("%s (%2d):", groupName, len(attributeTupList))
            for catName, atName in attributeTupList:
                lookupD.setdefault((catName, atName), []).append(groupName)
        #
        logger.info("Search group lookup len %d", len(lookupD))
        for (catName, atName), contextL in attribContextD.items():
            # logger.info("%s.%s contextL %r", catName, atName, contextL)

            if "full-text" in contextL:
                if (catName, atName) in lookupD or (catName, atName) in nestedSearchableD:
                    continue
                logger.info("%s.%s contextL %r", catName, atName, contextL)

        #

        return True

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testExpandSearchGroups(self):
        """Expand search groups and metadata content as these would be display in RCSB search menu."""
        _, afD = self.__getContentFeatures()
        groupNameList = self.__docHelper.getSearchGroups()
        logger.info("Search groups (%d)", len(groupNameList))
        #
        nestedSearchableD = self.__assembleNestedCategorySearchables()
        nestedSearchableD.update(self.__assembleNestedSubCategorySearchables())
        #
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(
                groupName)
            logger.info("")
            logger.info("%s (%2d):", groupName, len(attributeTupList))
            # Get search context and brief descriptions -
            for catName, atName in attributeTupList:
                searchContextTupL = self.__docHelper.getSearchContexts(
                    catName, atName)
                if not searchContextTupL:
                    logger.warning("Missing search context for %s.%s", catName,
                                   atName)
                descriptionText = self.__docHelper.getAttributeDescription(
                    catName, atName, contextType="brief")
                if not descriptionText:
                    logger.warning("Missing brief description %s.%s", catName,
                                   atName)
                #
                fD = afD[catName][atName] if catName in afD and atName in afD[catName] else {}
                logger.debug("%s %s fD %r", catName, atName, fD)
                units = fD["UNITS"] if "UNITS" in fD else None
                #
                uS = ""
                if units:
                    uS = "(units=%s)" % units
                #
                nS = "(%s.%s)" % (catName, atName)
                if (catName, atName) in nestedSearchableD:
                    for dS in nestedSearchableD[(catName, atName)]:
                        logger.info(
                            "- %-55s: %s %s (%s)", dS, nS, uS,
                            ",".join([tup[0] for tup in searchContextTupL]))
                else:
                    logger.info(
                        "- %-55s: %s %s (%s)", descriptionText, nS, uS,
                        ",".join([tup[0] for tup in searchContextTupL]))

        return True

    def __assembleNestedCategorySearchables(self):
        """Assemble dictionary of searchable items in nested categories.

        Returns:
            (dict): {(category, atName): ["Materialized brief description", ...]}
        """
        # cfD, afD = self.__getContentFeatures()
        _, afD = self.__getContentFeatures()
        logger.info("")
        searchableCategoryD = {}
        groupNameList = self.__docHelper.getSearchGroups()
        logger.debug("Search group count (%d)", len(groupNameList))
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(
                groupName)
            for catName, atName in attributeTupList:
                searchableCategoryD.setdefault(catName, []).append(atName)
        logger.debug("Searchable category count (%d)",
                     len(searchableCategoryD))
        #
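        # For each searchable category with a nested context, materialize brief descriptions keyed by (category, attribute).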
        retD = {}
        for catName in searchableCategoryD:
            nestedContextDL = self.__docHelper.getNestedContexts(catName)
            if not nestedContextDL:
                # not nested skip
                continue
            elif len(nestedContextDL) > 1:
                logger.warning("Multiple nested contexts for category %s",
                               catName)
            #
            for nestedContextD in nestedContextDL:
                contextPath = nestedContextD["FIRST_CONTEXT_PATH"] if "FIRST_CONTEXT_PATH" in nestedContextD else None
                if not contextPath:
                    logger.warning(
                        "Missing context path for nested category %s", catName)
                    continue
                #
                contextName = nestedContextD["CONTEXT_NAME"]

                #
                cpCatName = contextPath.split(".")[0]
                cpAtName = contextPath.split(".")[1]
                nestedPathSearchContext = self.__docHelper.getSearchContexts(
                    cpCatName, cpAtName)
                logger.debug("Nested (%r) context path for %r %r", contextName,
                             cpCatName, cpAtName)
                if not nestedPathSearchContext:
                    logger.warning(
                        "Missing nested (%r) search context for %r %r",
                        contextName, cpCatName, cpAtName)
                #
                nfD = afD[cpCatName][cpAtName] if cpCatName in afD and cpAtName in afD[cpCatName] else {}
                logger.debug("FeatureD %r", nfD)
                # --
                enumMapD = {}
                enumDL = nfD["ENUMS_ANNOTATED"] if "ENUMS_ANNOTATED" in nfD else []
                if not enumDL:
                    logger.warning("Missing nested enums %s.%s", cpCatName,
                                   cpAtName)
                else:
                    logger.debug("All context enums count %d", len(enumDL))
                    for enumD in enumDL:
                        logger.info("%s.%s enumD %r", cpCatName, cpAtName,
                                    enumD)
                        if "name" not in enumD:
                            logger.warning(
                                "Missing nested enum (name) for %s.%s",
                                cpCatName, cpAtName)
                    #
                    enumMapD = {enumD["value"]: enumD["name"] if "name" in enumD else enumD["detail"] for enumD in enumDL}
                # --
                nestedDescriptionText = self.__docHelper.getAttributeDescription(
                    cpCatName, cpAtName, contextType="brief")
                if not nestedDescriptionText:
                    logger.warning("Missing brief nested description %s.%s",
                                   cpCatName, cpAtName)
                else:
                    logger.debug("Nested context description: %r",
                                 nestedDescriptionText)
                # --
                cvDL = nestedContextD["CONTEXT_ATTRIBUTE_VALUES"] if "CONTEXT_ATTRIBUTE_VALUES" in nestedContextD else []
                if not cvDL:
                    logger.warning("Missing context attribute values for %s",
                                   catName)
                    # if no context values defined then use: all enums x searchable attributes in this category
                    #
                    # Template:  enum detail + search attribute brief description text
                    for enumD in enumDL:
                        for atName in searchableCategoryD[catName]:
                            briefDescr = self.__docHelper.getAttributeDescription(
                                catName, atName, contextType="brief")
                            # subCategories = nfD["SUB_CATEGORIES"] if "SUB_CATEGORIES" in nfD else None
                            tS = enumD["detail"] + " " + briefDescr
                            retD.setdefault((catName, atName), []).append(tS)
                else:
                    # Only use context values from the full enum list with specified search paths.
                    #
                    # Template:  context value (enum detail) + search path attribute (brief description text)
                    #  cVDL.append({"CONTEXT_VALUE": tD["CONTEXT_VALUE"], "SEARCH_PATHS": tD["SEARCH_PATHS"]})
                    #
                    for cvD in cvDL:
                        enumV = cvD["CONTEXT_VALUE"]
                        enumDetail = enumMapD[enumV] if enumV in enumMapD else None
                        if not enumDetail:
                            logger.warning(
                                "%s %s missing detail for enum value %s",
                                catName, cpAtName, enumV)
                        for sp in cvD["SEARCH_PATHS"]:
                            if sp.count(".") > 1:
                                k = sp.rfind(".")
                                sp = sp[:k] + "_" + sp[k + 1:]
                            cnS = sp.split(".")[0]
                            anS = sp.split(".")[1]
                            briefDescr = self.__docHelper.getAttributeDescription(
                                cnS, anS, contextType="brief")
                            tS = enumDetail + " " + briefDescr
                            logger.debug("%s,%s tS %r", cnS, anS, tS)
                            retD.setdefault((cnS, anS), []).append(tS)
                        for aD in cvD["ATTRIBUTES"]:
                            sp = aD["PATH"]
                            if sp.count(".") > 1:
                                k = sp.rfind(".")
                                sp = sp[:k] + "_" + sp[k + 1:]
                            cnS = sp.split(".")[0]
                            anS = sp.split(".")[1]
                            briefDescr = self.__docHelper.getAttributeDescription(
                                cnS, anS, contextType="brief")
                            tS = enumDetail + " " + briefDescr
                            logger.debug("%s,%s tS %r", cnS, anS, tS)
                            retD.setdefault((cnS, anS), []).append(tS)
                            exL = aD["EXAMPLES"]
                            logger.info("%s,%s sp %r examplesL %r", cnS, anS,
                                        sp, exL)
        #
        for k, vL in retD.items():
            for v in vL:
                logger.debug("%s : %r", k, v)
        #
        return retD

    def __assembleNestedSubCategorySearchables(self):
        """Assemble dictionary of searchable items in nested subcategories.

        Returns:
            (dict): {(category, atName): ["Materialized brief description", ...]}
        """
        _, afD = self.__getContentFeatures()
        # logger.info("")
        searchableCategoryD = {}
        groupNameList = self.__docHelper.getSearchGroups()
        logger.debug("Search group count (%d)", len(groupNameList))
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(
                groupName)
            for catName, atName in attributeTupList:
                searchableCategoryD.setdefault(catName, []).append(atName)
        logger.debug("Searchable category count (%d)",
                     len(searchableCategoryD))
        #
        subcatNestedD = {}
        tD = self.__docHelper.getAllSubCategoryNestedContexts()
        for k, v in tD.items():
            for kk, vv in v.items():
                if kk in subcatNestedD:
                    logger.warning(
                        "Duplicate nested subcategory specifications in %r %r",
                        k, kk)
                # only take cases with a context path ...
                if "FIRST_CONTEXT_PATH" in vv:
                    subcatNestedD[kk[0]] = (kk[1], vv)
        #  cat = (subcat, {nested context dict})
        #
        retD = {}
        for catName in searchableCategoryD:
            if catName not in subcatNestedD:
                continue
            subCatName, nestedContextD = subcatNestedD[catName]
            #
            contextPath = nestedContextD["FIRST_CONTEXT_PATH"] if "FIRST_CONTEXT_PATH" in nestedContextD else None
            if not contextPath:
                logger.warning("Missing context path for nested category %s",
                               catName)
                continue
            #
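            # Collapse a dotted subcategory context path ("cat.subcat.attr") to "cat.subcat_attr" before splitting into category and attribute.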
            if contextPath.count(".") > 1:
                k = contextPath.rfind(".")
                contextPath = contextPath[:k] + "_" + contextPath[k + 1:]
            logger.debug("%s subcategory %s context path %r", catName,
                         subCatName, contextPath)
            contextName = nestedContextD["CONTEXT_NAME"]
            cpCatName = contextPath.split(".")[0]
            cpAtName = contextPath.split(".")[1]
            nestedPathSearchContext = self.__docHelper.getSearchContexts(
                cpCatName, cpAtName)
            logger.debug("Nested (%r) context path for %r %r", contextName,
                         cpCatName, cpAtName)
            if not nestedPathSearchContext:
                logger.warning("Missing nested (%r) search context for %r %r",
                               contextName, cpCatName, cpAtName)
            #
            nfD = afD[cpCatName][cpAtName] if cpCatName in afD and cpAtName in afD[cpCatName] else {}
            logger.debug("FeatureD %r", nfD)
            # --
            enumMapD = {}
            enumDL = nfD["ENUMS_ANNOTATED"] if "ENUMS_ANNOTATED" in nfD else []
            if not enumDL:
                logger.warning("Missing nested enums %s.%s", cpCatName,
                               cpAtName)
            else:
                logger.debug("All context enums count %d", len(enumDL))
                for enumD in enumDL:
                    if "name" not in enumD:
                        logger.warning("Missing nested enum (name) for %s.%s",
                                       cpCatName, cpAtName)
                #
                enumMapD = {enumD["value"]: enumD["name"] if "name" in enumD else enumD["detail"] for enumD in enumDL}
            # --
            nestedDescriptionText = self.__docHelper.getAttributeDescription(
                cpCatName, cpAtName, contextType="brief")
            if not nestedDescriptionText:
                logger.warning("Missing brief nested description %s.%s",
                               cpCatName, cpAtName)
            else:
                logger.debug("Nested context description: %r",
                             nestedDescriptionText)
            # --
            cvDL = nestedContextD["CONTEXT_ATTRIBUTE_VALUES"] if "CONTEXT_ATTRIBUTE_VALUES" in nestedContextD else []
            #
            if not cvDL:
                logger.warning("Missing context attribute values for %s",
                               catName)
                # if no context values defined then use: all enums x searchable attributes in this category
                #
                # Template:  enum detail + search attribute brief description text
                for enumD in enumDL:
                    for atName in searchableCategoryD[catName]:
                        nnfD = afD[catName][atName]
                        subCatL = [d["id"] for d in nnfD["SUB_CATEGORIES"]
                                   ] if "SUB_CATEGORIES" in nnfD else None
                        logger.debug("%s.%s %s subCatL %r", catName, atName,
                                     subCatName, subCatL)
                        if subCatL and subCatName in subCatL:
                            briefDescr = self.__docHelper.getAttributeDescription(
                                catName, atName, contextType="brief")
                            tS = enumD["detail"] + " " + briefDescr
                            retD.setdefault((catName, atName), []).append(tS)
            else:
                # Only use context values from the full enum list with specified search paths.
                #
                # Template:  context value (enum detail) + search path attribute (brief description text)
                #  cVDL.append({"CONTEXT_VALUE": tD["CONTEXT_VALUE"], "SEARCH_PATHS": tD["SEARCH_PATHS"]})
                #
                for cvD in cvDL:
                    enumV = cvD["CONTEXT_VALUE"]
                    enumDetail = enumMapD[enumV] if enumV in enumMapD else None
                    if not enumDetail:
                        logger.warning(
                            "%s %s missing detail for enum value %s", catName,
                            cpAtName, enumV)
                    for sp in cvD["SEARCH_PATHS"]:
                        if sp.count(".") > 1:
                            k = sp.rfind(".")
                            sp = sp[:k] + "_" + sp[k + 1:]
                        cnS = sp.split(".")[0]
                        anS = sp.split(".")[1]
                        briefDescr = self.__docHelper.getAttributeDescription(
                            cnS, anS, contextType="brief")
                        tS = enumDetail + " " + briefDescr
                        retD.setdefault((cnS, anS), []).append(tS)
                    for aD in cvD["ATTRIBUTES"]:
                        sp = aD["PATH"]
                        if sp.count(".") > 1:
                            k = sp.rfind(".")
                            sp = sp[:k] + "_" + sp[k + 1:]
                        cnS = sp.split(".")[0]
                        anS = sp.split(".")[1]
                        briefDescr = self.__docHelper.getAttributeDescription(
                            cnS, anS, contextType="brief")
                        tS = enumDetail + " " + briefDescr
                        retD.setdefault((cnS, anS), []).append(tS)
                        exL = aD["EXAMPLES"]
                        logger.debug("%s,%s sp %r exL %r", cnS, anS, sp, exL)
        #
        for k, vL in retD.items():
            for v in vL:
                logger.debug("%s : %r", k, v)
        #
        return retD

    def __getContentFeatures(self):
        """Get category and attribute features"""
        try:
            cH = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            dictApi = self.__dP.getApiByLocators(dictLocators=[
                self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile
            ])
            # logger.info("units = %r", dictApi.getUnits("pdbx_nmr_spectrometer", "manufacturer"))
            sdi = ContentDefinition(dictApi,
                                    databaseName="pdbx_core",
                                    contentDefHelper=cH)
            catNameL = sdi.getCategories()
            cfD = {}
            afD = {}
            for catName in catNameL:
                cfD[catName] = sdi.getCategoryFeatures(catName)
                afD[catName] = sdi.getAttributeFeatures(catName)
            #
            return cfD, afD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None, None


class RepoHoldingsDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testValidateOptsStrict(self):
        updateId = self.__updateId
        schemaLevel = "full"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Troubleshooting test")
    def testValidateOptsMin(self):
        updateId = self.__updateId
        schemaLevel = "min"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, schemaLevel="full"):
        schemaNames = ["repository_holdings"]
        collectionNames = {
            "repository_holdings": [
                "repository_holdings_update_entry",
                "repository_holdings_current_entry",
                "repository_holdings_unreleased_entry",
                "repository_holdings_removed_entry",
                "repository_holdings_combined_entry",
            ],
            "entity_sequence_clusters":
            ["cluster_members", "cluster_provenance", "entity_members"],
        }
        #
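        # Generate and check each collection's JSON schema, then count validation errors over the prepared holdings documents.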
        eCount = 0
        for schemaName in schemaNames:
            for collectionName in collectionNames[schemaName]:
                _ = self.__schP.makeSchemaDef(schemaName,
                                              dataTyping="ANY",
                                              saveSchema=True)
                cD = self.__schP.makeSchema(schemaName,
                                            collectionName,
                                            encodingType="JSON",
                                            level=schemaLevel,
                                            saveSchema=True)
                dL = self.__getRepositoryHoldingsDocuments(
                    schemaName, collectionName, updateId)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            collectionName + ".json")
                    self.__mU.doExport(savePath, dL, fmt="json", indent=3)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d",
                                 schemaName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info(
                                "schema %s collection %s path %s error: %s",
                                schemaName, collectionName, error.path,
                                error.message)
                            logger.info(">>>")
                            logger.info(">>> failing object is %r", dD)
                            logger.info(">>>")
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d",
                                     schemaName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __getRepositoryHoldingsDocuments(self, schemaName, collectionName,
                                         updateId):
        """Test loading and processing operations for legacy holdings and status data."""
        rL = []
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb,
                                        sandboxPath=self.__sandboxPath,
                                        workPath=self.__cachePath)
            if collectionName == "repository_holdings_update_entry":
                rL = rhdp.getHoldingsUpdateEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("update data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_current_entry":
                rL = rhdp.getHoldingsCurrentEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_unreleased_entry":
                rL = rhdp.getHoldingsUnreleasedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("unreleased data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_removed_entry":
                rL = rhdp.getHoldingsRemovedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("removed data length %r", len(rL))
            elif collectionName == "repository_holdings_combined_entry":
                rL = rhdp.getHoldingsCombinedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))

            #
        except Exception as e:
            logger.exception("%s %s failing with %s", schemaName,
                             collectionName, str(e))
            self.fail()

        return rL

# Example 10

class ClusterDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        # self.__levels = ['100', '95', '90', '70', '50', '30']
        self.__levels = ["100"]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsStrict(self):
        updateId = self.__updateId
        validationLevel = "full"
        eCount = self.__testValidateOpts(updateId, validationLevel=validationLevel)
        logger.info("Total validation errors validation level %s : %d", validationLevel, eCount)
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, validationLevel="full"):
        _ = updateId
        databaseNames = ["sequence_clusters"]
        collectionNames = {"sequence_clusters": ["cluster_provenance", "cluster_members", "entity_members"]}
        #
        eCount = 0
        for databaseName in databaseNames:
            for collectionName in collectionNames[databaseName]:
                _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=validationLevel, saveSchema=True)
                #
                dL = self.__getSequenceClusterData(collectionName, levels=self.__levels, dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for _, dD in enumerate(dL):
                    # logger.debug("Schema %s collection %s document %d" % (schemaName, collectionName, ii))
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message)
                            logger.info(">>> failing object is %r", dD)
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d", databaseName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __fetchProvenance(self):
        """Test case for fetching a provenance dictionary content."""
        try:
            provKeyName = "rcsb_entity_sequence_cluster_prov"
            provU = ProvenanceProvider(self.__cfgOb, self.__cachePath, useCache=True)
            pD = provU.fetch()
            return pD.get(provKeyName, {})
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __getSequenceClusterData(self, collectionName, dataSetId=None, dataLocator=None, levels=None):
        """Test extraction on an example sequence cluster data set."""
        try:
            #
            if collectionName == "cluster_provenance":
                return [self.__fetchProvenance()]
            #
            entitySchemaName = "rcsb_entity_sequence_cluster_list"
            clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list"
            cdp = ClusterDataPrep(workPath=self.__cachePath, entitySchemaName=entitySchemaName, clusterSchemaName=clusterSchemaName)
            cifD, docBySequenceD, docByClusterD = cdp.extract(dataSetId, clusterSetLocator=dataLocator, levels=levels, clusterType="entity")
            self.assertEqual(len(cifD), 1)
            self.assertEqual(len(docBySequenceD), 1)
            self.assertEqual(len(docByClusterD), 1)
            if collectionName == "entity_members":
                return docBySequenceD[entitySchemaName]
            elif collectionName == "cluster_members":
                return docByClusterD[clusterSchemaName]

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return None
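
The validation loop in __testValidateOpts follows the standard jsonschema pattern: check the schema itself, then iterate over instance errors. Below is a minimal self-contained sketch of that pattern; the toy schema and document are invented for illustration:

from jsonschema import Draft4Validator, FormatChecker

toySchema = {
    "type": "object",
    "properties": {"entry_id": {"type": "string"}},
    "required": ["entry_id"],
}
# Raises jsonschema.SchemaError if the schema itself is malformed.
Draft4Validator.check_schema(toySchema)
valInfo = Draft4Validator(toySchema, format_checker=FormatChecker())
# iter_errors() yields one ValidationError per violation; error.path locates
# the failing element and error.message describes the failure.
for error in sorted(valInfo.iter_errors({"entry_id": 1001}), key=str):
    print(list(error.path), error.message)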
Example n. 11
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument(
        "--full",
        default=True,
        action="store_true",
        help="Fresh full load in a new tables/collections (Default)")
    #
    parser.add_argument("--etl_entity_sequence_clusters",
                        default=False,
                        action="store_true",
                        help="ETL entity sequence clusters")
    parser.add_argument("--etl_repository_holdings",
                        default=False,
                        action="store_true",
                        help="ETL repository holdings")
    # parser.add_argument("--etl_chemref", default=False, action="store_true", help="ETL integrated chemical reference data")
    # parser.add_argument("--etl_tree_node_lists", default=False, action='store_true', help="ETL tree node lists")

    parser.add_argument(
        "--data_set_id",
        default=None,
        help="Data set identifier (default= 2018_14 for current week)")
    #
    parser.add_argument(
        "--sequence_cluster_data_path",
        default=None,
        help="Sequence cluster data path (default set by configuration")
    parser.add_argument(
        "--sandbox_data_path",
        default=None,
        help="Date exchange sandboxPath data path (default set by configuration"
    )

    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")

    parser.add_argument("--db_type",
                        default="mongo",
                        help="Database server type (default=mongo)")

    # parser.add_argument("--document_style", default="rowwise_by_name_with_cardinality",
    #                    help="Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name")
    parser.add_argument("--read_back_check",
                        default=False,
                        action="store_true",
                        help="Perform read back check on all documents")
    #
    parser.add_argument("--num_proc",
                        default=2,
                        help="Number of processes to execute (default=2)")
    parser.add_argument("--chunk_size",
                        default=10,
                        help="Number of files loaded per process")
    parser.add_argument("--document_limit",
                        default=None,
                        help="Load document limit for testing")
    parser.add_argument("--prune_document_size",
                        default=None,
                        help="Prune large documents to this size limit (MB)")
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument("--mock",
                        default=False,
                        action="store_true",
                        help="Use MOCK repository configuration for testing")
    parser.add_argument("--cache_path",
                        default=None,
                        help="Path containing cache directories")
    # parser.add_argument("--use_cache", default=False, action="store_true", help="Use cache files from remote resources")
    parser.add_argument("--rebuild_cache",
                        default=False,
                        action="store_true",
                        help="Rebuild cached resource files")
    # parser.add_argument("--rebuild_schema", default=False, action="store_true", help="Rebuild schema on-the-fly if not cached")
    #
    #
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    # useCache = args.use_cache
    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
        #
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)

    #
    try:
        readBackCheck = args.read_back_check
        tU = TimeUtil()
        dataSetId = args.data_set_id if args.data_set_id else tU.getCurrentWeekSignature(
        )
        seqDataLocator = args.sequence_cluster_data_path if args.sequence_cluster_data_path else cfgOb.getPath(
            "RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        sandboxPath = args.sandbox_data_path if args.sandbox_data_path else cfgOb.getPath(
            "RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        numProc = int(args.num_proc)
        chunkSize = int(args.chunk_size)
        documentLimit = int(
            args.document_limit) if args.document_limit else None

        loadType = "full" if args.full else "replace"
        # loadType = 'replace' if args.replace else 'full'

        cachePath = args.cache_path if args.cache_path else "."
        rebuildCache = args.rebuild_cache if args.rebuild_cache else False
        # rebuildSchemaFlag = args.rebuild_schema if args.rebuild_schema else False
        #
        # if args.document_style not in ['rowwise_by_name', 'rowwise_by_name_with_cardinality', 'columnwise_by_name', 'rowwise_by_id', 'rowwise_no_name']:
        #    logger.error("Unsupported document style %s" % args.document_style)

        if args.db_type != "mongo":
            logger.error("Unsupported database server type %s", args.db_type)
    except Exception as e:
        logger.exception("Argument processing problem %s", str(e))
        parser.print_help(sys.stderr)
        exit(1)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #  Rebuild or check resource cache
    ok = buildResourceCache(cfgOb,
                            configName,
                            cachePath,
                            rebuildCache=rebuildCache)
    if not ok:
        logger.error("Cache rebuild or check failure (rebuild %r) %r",
                     rebuildCache, cachePath)
        exit(1)
    #
    ok = okS = False  # defaults in case no ETL operation is selected below
    if args.db_type == "mongo":
        if args.etl_entity_sequence_clusters:
            cw = SequenceClustersEtlWorker(cfgOb,
                                           numProc=numProc,
                                           chunkSize=chunkSize,
                                           documentLimit=documentLimit,
                                           verbose=debugFlag,
                                           readBackCheck=readBackCheck,
                                           workPath=cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = loadStatus(cw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        if args.etl_repository_holdings:
            rhw = RepoHoldingsEtlWorker(cfgOb,
                                        sandboxPath,
                                        cachePath,
                                        numProc=numProc,
                                        chunkSize=chunkSize,
                                        documentLimit=documentLimit,
                                        verbose=debugFlag,
                                        readBackCheck=readBackCheck)
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = loadStatus(rhw.getLoadStatus(),
                             cfgOb,
                             cachePath,
                             readBackCheck=readBackCheck)

        logger.info("Operation completed with status %r " % ok and okS)
Example n. 12
class RepoLoadWorkflow(object):
    def __init__(self, **kwargs):
        #  Configuration Details
        configPath = kwargs.get("configPath", "exdb-config-example.yml")
        self.__configName = kwargs.get("configName", "site_info_configuration")
        mockTopPath = kwargs.get("mockTopPath", None)
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=self.__configName,
                                  mockTopPath=mockTopPath)
        #
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__cachePath = os.path.abspath(self.__cachePath)
        self.__debugFlag = kwargs.get("debugFlag", False)
        if self.__debugFlag:
            logger.setLevel(logging.DEBUG)
        #
        #  Rebuild or check resource cache
        # rebuildCache = kwargs.get("rebuildCache", False)
        # self.__cacheStatus = self.buildResourceCache(rebuildCache=rebuildCache)
        # logger.debug("Cache status if %r", self.__cacheStatus)
        #

    def load(self, op, **kwargs):
        # if not self.__cacheStatus:
        #    logger.error("Resource cache test or rebuild has failed - exiting")
        #    return False
        # argument processing
        if op not in [
                "pdbx-loader", "etl-repository-holdings",
                "etl-entity-sequence-clusters"
        ]:
            logger.error("Unsupported operation %r - exiting", op)
            return False
        try:
            readBackCheck = kwargs.get("readBackCheck", False)
            numProc = int(kwargs.get("numProc", 1))
            chunkSize = int(kwargs.get("chunkSize", 10))
            fileLimit = int(
                kwargs.get("fileLimit")) if "fileLimit" in kwargs else None
            documentLimit = int(kwargs.get(
                "documentLimit")) if "documentLimit" in kwargs else None
            failedFilePath = kwargs.get("failFileListPath", None)
            loadFileListPath = kwargs.get("loadFileListPath", None)
            saveInputFileListPath = kwargs.get("saveFileListPath", None)
            schemaLevel = kwargs.get("schemaLevel",
                                     "min") if kwargs.get("schemaLevel") in [
                                         "min", "full"
                                     ] else "min"
            loadType = kwargs.get("loadType", "full")  # or replace
            updateSchemaOnReplace = kwargs.get("updateSchemaOnReplace", True)
            pruneDocumentSize = float(
                kwargs.get("pruneDocumentSize"
                           )) if "pruneDocumentSize" in kwargs else None

            # "Document organization (rowwise_by_name_with_cardinality|rowwise_by_name|columnwise_by_name|rowwise_by_id|rowwise_no_name",
            documentStyle = kwargs.get("documentStyle",
                                       "rowwise_by_name_with_cardinality")
            dbType = kwargs.get("dbType", "mongo")
            #
            databaseName = kwargs.get("databaseName", None)
            databaseNameList = self.__cfgOb.get(
                "DATABASE_NAMES_ALL",
                sectionName="database_catalog_configuration").split(",")
            collectionNameList = kwargs.get("collectionNameList", None)
            mergeValidationReports = kwargs.get("mergeValidationReports", True)
            #
            tU = TimeUtil()
            dataSetId = kwargs.get(
                "dataSetId"
            ) if "dataSetId" in kwargs else tU.getCurrentWeekSignature()
            seqDataLocator = self.__cfgOb.getPath(
                "RCSB_SEQUENCE_CLUSTER_DATA_PATH",
                sectionName=self.__configName)
            sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                               sectionName=self.__configName)

        except Exception as e:
            logger.exception(
                "Argument and configuration processing failing with %s",
                str(e))
            return False
        #

        if op == "pdbx-loader" and dbType == "mongo" and databaseName in databaseNameList:
            okS = True
            try:
                inputPathList = None
                if loadFileListPath:
                    mu = MarshalUtil(workPath=self.__cachePath)
                    inputPathList = mu.doImport(loadFileListPath, fmt="list")
                    if not inputPathList:
                        logger.error(
                            "Operation %r missing or empty input file path list %s - exiting",
                            op, loadFileListPath)
                        return False
            except Exception as e:
                logger.exception(
                    "Operation %r processing input path list failing with %s",
                    op, str(e))
                return False
            #
            try:
                mw = PdbxLoader(
                    self.__cfgOb,
                    self.__cachePath,
                    resourceName="MONGO_DB",
                    numProc=numProc,
                    chunkSize=chunkSize,
                    fileLimit=fileLimit,
                    verbose=self.__debugFlag,
                    readBackCheck=readBackCheck,
                )
                ok = mw.load(
                    databaseName,
                    collectionLoadList=collectionNameList,
                    loadType=loadType,
                    inputPathList=inputPathList,
                    styleType=documentStyle,
                    dataSelectors=["PUBLIC_RELEASE"],
                    failedFilePath=failedFilePath,
                    saveInputFileListPath=saveInputFileListPath,
                    pruneDocumentSize=pruneDocumentSize,
                    validationLevel=schemaLevel,
                    mergeContentTypes=["vrpt"]
                    if mergeValidationReports else None,
                    updateSchemaOnReplace=updateSchemaOnReplace,
                )
                okS = self.loadStatus(mw.getLoadStatus(),
                                      readBackCheck=readBackCheck)
            except Exception as e:
                logger.exception("Operation %r database %r failing with %s",
                                 op, databaseName, str(e))
        elif op == "etl-entity-sequence-clusters" and dbType == "mongo":
            cw = SequenceClustersEtlWorker(self.__cfgOb,
                                           numProc=numProc,
                                           chunkSize=chunkSize,
                                           documentLimit=documentLimit,
                                           verbose=self.__debugFlag,
                                           readBackCheck=readBackCheck,
                                           workPath=self.__cachePath)
            ok = cw.etl(dataSetId, seqDataLocator, loadType=loadType)
            okS = self.loadStatus(cw.getLoadStatus(),
                                  readBackCheck=readBackCheck)
        elif op == "etl-repository-holdings" and dbType == "mongo":
            rhw = RepoHoldingsEtlWorker(
                self.__cfgOb,
                sandboxPath,
                self.__cachePath,
                numProc=numProc,
                chunkSize=chunkSize,
                documentLimit=documentLimit,
                verbose=self.__debugFlag,
                readBackCheck=readBackCheck,
            )
            ok = rhw.load(dataSetId, loadType=loadType)
            okS = self.loadStatus(rhw.getLoadStatus(),
                                  readBackCheck=readBackCheck)

        logger.info("Completed operation %r with status %r", op, ok and okS)

        return ok and okS

    def loadStatus(self, statusList, readBackCheck=True):
        ret = False
        try:
            dl = DocumentLoader(self.__cfgOb,
                                self.__cachePath,
                                "MONGO_DB",
                                numProc=1,
                                chunkSize=2,
                                documentLimit=None,
                                verbose=False,
                                readBackCheck=readBackCheck)
            #
            sectionName = "data_exchange_configuration"
            databaseName = self.__cfgOb.get("DATABASE_NAME",
                                            sectionName=sectionName)
            collectionName = self.__cfgOb.get("COLLECTION_UPDATE_STATUS",
                                              sectionName=sectionName)
            ret = dl.load(databaseName,
                          collectionName,
                          loadType="append",
                          documentList=statusList,
                          indexAttributeList=[
                              "update_id", "database_name", "object_name"
                          ],
                          keyNames=None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret

    def buildResourceCache(self, rebuildCache=False):
        """Generate and cache resource dependencies."""
        ret = False
        try:
            useCache = not rebuildCache
            logger.info("Cache setting useCache is %r", useCache)
            rp = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath)
            ret = rp.cacheResources(useCache=useCache)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ret
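
A minimal usage sketch for RepoLoadWorkflow; the configuration path and option values below are illustrative assumptions, not settings from a real deployment:

rlw = RepoLoadWorkflow(configPath="exdb-config-example.yml",
                       configName="site_info_configuration",
                       cachePath="./CACHE",
                       debugFlag=False)
# load() dispatches on the operation name and returns the combined load status.
ok = rlw.load("etl-repository-holdings",
              readBackCheck=True,
              numProc=2,
              chunkSize=10,
              loadType="full")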
Example n. 13
class RepoHoldingsRemoteLoaderTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(RepoHoldingsRemoteLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)

        self.__resourceName = "MONGO_DB"
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = None
        self.__filterType = "assign-dates"
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        # sample data set
        self.__updateId = "2021_36"
        #
        eiP = EntryInfoProvider(cachePath=self.__cachePath, useCache=True)
        ok = eiP.testCache(minCount=0)
        self.assertTrue(ok)
        ok = eiP.restore(self.__cfgOb, configName, useStash=False, useGit=True)
        self.assertTrue(ok)
        ok = eiP.reload()
        self.assertTrue(ok)

        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testLoadHoldingsRemote(self):
        """Test case - load legacy repository holdings and status data -

        [repository_holdings]
        DATABASE_NAME=repository_holdings
        DATABASE_VERSION_STRING=v5
        COLLECTION_HOLDINGS_UPDATE=rcsb_repository_holdings_update_entry
        COLLECTION_HOLDINGS_CURRENT=rcsb_repository_holdings_current_entry
        COLLECTION_HOLDINGS_UNRELEASED=rcsb_repository_holdings_unreleased_entry
        COLLECTION_HOLDINGS_REMOVED=rcsb_repository_holdings_removed_entry
        COLLECTION_HOLDINGS_COMBINED=rcsb_repository_holdings_combined_entry

        """
        try:
            sectionName = "repository_holdings_configuration"
            rhdp = RepoHoldingsRemoteDataPrep(cachePath=self.__cachePath,
                                              filterType=self.__filterType)
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            databaseName = self.__cfgOb.get("DATABASE_NAME",
                                            sectionName=sectionName)
            logger.info("databaseName %r", databaseName)
            addValues = None
            #
            maxDoc = 5
            dList = rhdp.getHoldingsRemovedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_REMOVED",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUnreleasedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UNRELEASED",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsUpdateEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_UPDATE",
                                              sectionName=sectionName)
            logger.info("collectionName %r", collectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCurrentEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_CURRENT",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
            dList = rhdp.getHoldingsCombinedEntry(updateId=self.__updateId)
            dList = dList[:maxDoc] if maxDoc else dList
            collectionName = self.__cfgOb.get("COLLECTION_HOLDINGS_COMBINED",
                                              sectionName=sectionName)
            ok = dl.load(databaseName,
                         collectionName,
                         loadType="full",
                         documentList=dList,
                         indexAttributeList=["update_id", "entry_id"],
                         keyNames=None,
                         addValues=addValues)
            logger.info("Collection %r length %d load status %r",
                        collectionName, len(dList), ok)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
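
The five load blocks in testLoadHoldingsRemote repeat a single fetch/truncate/load pattern; a hedged refactoring sketch as an additional method for this test class (the helper name _loadHoldingsCollection is an editorial invention):

    def _loadHoldingsCollection(self, dl, databaseName, fetchHoldings,
                                collectionKey, sectionName, maxDoc=5):
        """Fetch one holdings document list and load it into its configured collection."""
        dList = fetchHoldings(updateId=self.__updateId)
        dList = dList[:maxDoc] if maxDoc else dList
        collectionName = self.__cfgOb.get(collectionKey, sectionName=sectionName)
        ok = dl.load(databaseName,
                     collectionName,
                     loadType="full",
                     documentList=dList,
                     indexAttributeList=["update_id", "entry_id"],
                     keyNames=None,
                     addValues=None)
        logger.info("Collection %r length %d load status %r", collectionName,
                    len(dList), ok)
        self.assertTrue(ok)

Each block above then collapses to one call, for example:

            self._loadHoldingsCollection(dl, databaseName,
                                         rhdp.getHoldingsRemovedEntry,
                                         "COLLECTION_HOLDINGS_REMOVED", sectionName)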
Example n. 14
    def testReadYamlConfig(self):
        try:
            cfgOb = ConfigUtil(configPath=self.__inpPathConfigYaml,
                               configFormat="yaml",
                               mockTopPath=self.__mockTopPath)
            ok = cfgOb.appendConfig(self.__inpPathConfigAppendYaml,
                                    configFormat="yaml")
            self.assertTrue(ok)
            #
            sName = "DEFAULT"
            pathBird = cfgOb.getPath("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.getPath("PDBX_REPO_PATH", sectionName=sName)
            #
            self.assertEqual(
                pathBird, os.path.join(self.__mockTopPath, "MOCK_BIRD_REPO"))
            self.assertEqual(
                pathPdbx, os.path.join(self.__mockTopPath,
                                       "MOCK_PDBX_SANDBOX"))

            pathBird = cfgOb.get("BIRD_REPO_PATH", sectionName=sName)
            pathPdbx = cfgOb.get("PDBX_REPO_PATH", sectionName=sName)

            self.assertEqual(pathBird, "MOCK_BIRD_REPO")
            self.assertEqual(pathPdbx, "MOCK_PDBX_SANDBOX")
            sName = "Section1"
            #
            helperMethod = cfgOb.getHelper("DICT_METHOD_HELPER_MODULE",
                                           sectionName=sName)

            tv = helperMethod.echo("test_value")
            self.assertEqual(tv, "test_value")
            #
            tEnv = "TEST_ENV_VAR"
            tVal = "TEST_ENV_VAR_VALUE"
            os.environ[tEnv] = tVal
            eVal = cfgOb.getEnvValue("ENV_OPTION_A", sectionName=sName)
            self.assertEqual(tVal, eVal)

            ky = "42d13dfc9eb689e48c774aa5af8a7e15dbabcd5041939bef213eb37aed882fd6"
            os.environ["CONFIG_SUPPORT_TOKEN_ENV"] = ky
            #
            un = cfgOb.getSecret("SECRET_TEST_USERNAME",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("SECRET_TEST_PASSWORD",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            un = cfgOb.get("_TEST_USERNAME",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.get("_TEST_PASSWORD",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            un = cfgOb.getSecret("_TEST_USERNAME",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("_TEST_PASSWORD",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            #
            sName = "Section2"
            un = cfgOb.getSecret("_TEST_USERNAME",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.getSecret("_TEST_PASSWORD",
                                 default=None,
                                 sectionName=sName,
                                 tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser")
            self.assertEqual(pw, "testuserpassword")
            # test fallback
            # CLEAR_TEXT_USERNAME: testuser2
            # CLEAR_TEXT_PASSWORD: changeme2
            un = cfgOb.get("_CLEAR_TEXT_USERNAME",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            pw = cfgOb.get("_CLEAR_TEXT_PASSWORD",
                           default=None,
                           sectionName=sName,
                           tokenName="CONFIG_SUPPORT_TOKEN")
            self.assertEqual(un, "testuser2")
            self.assertEqual(pw, "changeme2")
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()


class SequenceClusterLoaderTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SequenceClusterLoaderTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        # self.__cfgOb.dump()
        self.__resourceName = "MONGO_DB"
        self.__failedFilePath = os.path.join(HERE, "test-output",
                                             "failed-list.txt")
        self.__readBackCheck = True
        self.__numProc = 2
        self.__chunkSize = 10
        self.__documentLimit = 1000
        #
        # sample data set
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath(
            "RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        self.__levels = ["100", "95", "90", "70", "50", "30"]
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__pathSaveStyleCif = os.path.join(HERE, "test-output",
                                               "cluster-data-cif.json")
        self.__pathSaveStyleDocSequence = os.path.join(
            HERE, "test-output", "cluster-data-doc-sequence.json")
        self.__pathSaveStyleDocCluster = os.path.join(
            HERE, "test-output", "cluster-data-doc-cluster.json")
        #
        self.__entitySchemaName = "rcsb_entity_sequence_cluster_list"
        self.__clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list"
        self.__provKeyName = "rcsb_entity_sequence_cluster_prov"
        #
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __fetchProvenance(self):
        """Test case for fetching provenance dictionary content."""
        try:
            provU = ProvenanceProvider(self.__cfgOb, self.__cachePath)
            pD = provU.fetch()
            return pD.get(self.__provKeyName, {})
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testExtract(self, dataSetId, dataLocator, levels):
        """Test extraction on an example sequence cluster data set."""
        try:
            cdp = ClusterDataPrep(workPath=self.__workPath,
                                  entitySchemaName=self.__entitySchemaName,
                                  clusterSchemaName=self.__clusterSchemaName)
            cifD, docBySequenceD, docByClusterD = cdp.extract(
                dataSetId,
                clusterSetLocator=dataLocator,
                levels=levels,
                clusterType="entity")
            self.assertEqual(len(cifD), 1)
            self.assertEqual(len(docBySequenceD), 1)
            self.assertEqual(len(docByClusterD), 1)
            return docBySequenceD, docByClusterD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadCluster(self):
        """Test case - load example sequence cluster document data."""
        try:
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            docBySequenceD, docByClusterD = self.__testExtract(
                dataSetId=self.__dataSetId,
                dataLocator=self.__pathClusterData,
                levels=self.__levels)
            #
            dList = docBySequenceD[self.__entitySchemaName]
            ok = dl.load(
                "sequence_clusters",
                "entity_members",
                loadType="full",
                documentList=dList,
                indexAttributeList=["data_set_id", "entry_id", "entity_id"],
                keyNames=None)
            self.assertTrue(ok)
            dList = docByClusterD[self.__clusterSchemaName]
            ok = dl.load(
                "sequence_clusters",
                "cluster_members",
                loadType="full",
                documentList=dList,
                indexAttributeList=["data_set_id", "identity", "cluster_id"],
                keyNames=None)
            self.assertTrue(ok)
            pD = self.__fetchProvenance()
            ok = dl.load("sequence_clusters",
                         "cluster_provenance",
                         loadType="full",
                         documentList=[pD],
                         indexAttributeList=None,
                         keyNames=None)
            self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
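
Finally, a short sketch of assembling and running the loader test above on its own with the standard unittest API (the suite function name is illustrative):

def sequenceClusterLoadSuite():
    suiteSelect = unittest.TestSuite()
    suiteSelect.addTest(SequenceClusterLoaderTests("testLoadCluster"))
    return suiteSelect


if __name__ == "__main__":
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(sequenceClusterLoadSuite())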