class RepoHoldingsDataPrepValidateTests(unittest.TestCase):
    """Validate prepared repository holdings documents against JSON schemas.

    Documents are produced by RepoHoldingsDataPrep for each holdings
    collection and checked with a Draft4Validator built from the schema
    returned by SchemaProvider.makeSchema().
    """

    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        # When True the prepared documents are also written under test-output/.
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testValidateOptsStrict(self):
        """Validate all holdings collections at the "full" schema level."""
        updateId = self.__updateId
        schemaLevel = "full"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Troubleshooting test")
    def testValidateOptsMin(self):
        """Validate all holdings collections at the "min" schema level."""
        updateId = self.__updateId
        schemaLevel = "min"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, schemaLevel="full"):
        """Validate every repository holdings collection and count the errors.

        Args:
            updateId (str): update identifier passed to the data preparation methods
            schemaLevel (str, optional): validation level passed to makeSchema()

        Returns:
            int: total number of validation errors reported for all documents
        """
        schemaNames = ["repository_holdings"]
        collectionNames = {
            "repository_holdings": [
                "repository_holdings_update_entry",
                "repository_holdings_current_entry",
                "repository_holdings_unreleased_entry",
                "repository_holdings_removed_entry",
                "repository_holdings_combined_entry",
            ],
        }
        #
        eCount = 0
        for schemaName in schemaNames:
            # The schema definition depends only on the schema name, so it is
            # built once here rather than once per collection.
            _ = self.__schP.makeSchemaDef(schemaName,
                                          dataTyping="ANY",
                                          saveSchema=True)
            for collectionName in collectionNames[schemaName]:
                cD = self.__schP.makeSchema(schemaName,
                                            collectionName,
                                            encodingType="JSON",
                                            level=schemaLevel,
                                            saveSchema=True)
                dL = self.__getRepositoryHoldingsDocuments(
                    schemaName, collectionName, updateId)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            collectionName + ".json")
                    self.__mU.doExport(savePath, dL, fmt="json", indent=3)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d",
                                 schemaName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info(
                                "schema %s collection %s path %s error: %s",
                                schemaName, collectionName, error.path,
                                error.message)
                            logger.info(">>>")
                            logger.info(">>> failing object is %r", dD)
                            logger.info(">>>")
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d",
                                     schemaName, collectionName, cCount)
                    except Exception as e:
                        # Best effort: a failure while validating one document
                        # is logged and the remaining documents are still checked.
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __getRepositoryHoldingsDocuments(self, schemaName, collectionName,
                                         updateId):
        """Prepare the documents for one repository holdings collection.

        Args:
            schemaName (str): schema name (used only in diagnostics)
            collectionName (str): holdings collection to prepare
            updateId (str): update identifier passed to the preparation method

        Returns:
            list: prepared documents (empty for an unrecognized collection name)

        Any exception, including a failed length assertion, is logged and
        reported as a test failure carrying the original message.
        """
        rL = []
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb,
                                        sandboxPath=self.__sandboxPath,
                                        workPath=self.__cachePath)
            if collectionName == "repository_holdings_update_entry":
                rL = rhdp.getHoldingsUpdateEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("update data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_current_entry":
                rL = rhdp.getHoldingsCurrentEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_unreleased_entry":
                rL = rhdp.getHoldingsUnreleasedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("unreleased data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_removed_entry":
                rL = rhdp.getHoldingsRemovedEntry(updateId=updateId)
                # NOTE(review): no minimum length is asserted for removed
                # entries (the earlier size check sat behind a condition that
                # could never be true) - confirm the expected count before
                # enforcing one.
                logger.debug("removed data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_combined_entry":
                rL = rhdp.getHoldingsCombinedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            #
        except Exception as e:
            logger.exception("%s %s failing with %s", schemaName,
                             collectionName, str(e))
            self.fail("%s %s failing with %s" % (schemaName, collectionName, str(e)))

        return rL
class SchemaDataPrepValidateTests(unittest.TestCase):
    """Validate documents prepared from repository containers against JSON schemas.

    Containers are read through RepositoryProvider, processed with the
    dictionary methods and SchemaDefDataPrep, and the resulting documents are
    checked with a Draft4Validator built from SchemaProvider.makeSchema().
    """

    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        # self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        # When True each prepared document is exported as JSON under test-output/.
        self.__export = True
        #
        # self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = 'addParentRefs|addPrimaryKey'
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        """Validate the configured databases at the "full" level using repository locators."""
        # schemaLevel = "min"

        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        """Validate chunks of the paths listed in test-output/failed-path.list."""
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            # Report a skip rather than returning a value from a test method.
            self.skipTest("No input paths found in test-output/failed-path.list")
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            # The first five chunks are intentionally not processed.
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        """Validate the ihm_dev collection at the "min" level using repository locators."""
        schemaLevel = "min"
        inputPathList = None
        self.__export = True

        # Alternative target: {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    # @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        """Validate the ihm_dev collection for the *.cif files in test-output/ihm-files."""
        # schemaLevel = "full"
        schemaLevel = "min"

        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            # Report a skip rather than returning a value from a test method.
            self.skipTest("No *.cif files found in %s" % self.__testIhmDirPath)
        # Alternative target: {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        """Prepare and validate documents for each database/collection pair.

        Args:
            databaseNameD (dict): database name -> list of collection names to validate
            inputPathList (list, optional): explicit input locators; when empty or None
                the locators are obtained from the repository provider
            schemaLevel (str, optional): validation level passed to makeSchema()
            mergeContentTypeD (dict, optional): database name -> list of content types to merge

        Returns:
            int: total number of validation errors reported for all documents
        """
        #
        eCount = 0
        for databaseName in databaseNameD:
            # Tolerate the None default: no merge content types for any database.
            mergeContentTypes = (mergeContentTypeD or {}).get(databaseName)
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList or self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # A non-compliant schema is logged but does not stop the run.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                            logger.debug("Failing document %d : %r", ii, list(dD.items()))
                            eCount += 1
                            cCount += 1
                        if cCount > 0:
                            logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        # Best effort: a failure while validating one document
                        # is logged and the remaining documents are still checked.
                        logger.exception("Validation processing error %s", str(e))

        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Create loadable documents for one collection from repository files.

        Args:
            inputPathList (list): input locators passed to the repository provider
            databaseName (str): database (schema) name
            collectionName (str): collection name within the database
            styleType (str, optional): document style passed to processDocuments()
            mergeContentTypes (list, optional): content types used only to label exported files

        Returns:
            tuple: (docList, containerNameList) as produced by the data preparation steps

        Any exception is logged and reported as a test failure carrying the
        original message.
        """
        try:

            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                # NOTE: export of the method-augmented containers is currently disabled.
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )

            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                # for ii, doc in enumerate(docList[:1]):
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))
# Example no. 3
# 0
class SchemaDefBuildTests(unittest.TestCase):
    """Exercise schema definition and collection schema builds via SchemaProvider."""

    def setUp(self):
        self.__verbose = True
        catalogSection = "database_catalog_configuration"
        siteSection = "site_info_configuration"
        topMockPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        cfgFilePath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=cfgFilePath, defaultSectionName=siteSection, mockTopPath=topMockPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False)
        #
        self.__validationLevels = self.__cfgOb.getList("VALIDATION_LEVELS_TEST", sectionName=catalogSection)
        self.__encodingTypes = self.__cfgOb.getList("ENCODING_TYPES_TEST", sectionName=catalogSection)
        #
        # Choose between the deployed and the test configuration lists.
        buildAll = True
        if buildAll:
            namesKey, typingKey = "DATABASE_NAMES_DEPLOYED", "DATATYPING_DEPLOYED"
        else:
            namesKey, typingKey = "DATABASE_NAMES_TEST", "DATATYPING_TEST"
        self.__databaseNameList = self.__cfgOb.getList(namesKey, sectionName=catalogSection)
        self.__dataTypingList = self.__cfgOb.getList(typingKey, sectionName=catalogSection)
        #
        self.__saveSchema = True
        self.__compareDefSchema = False
        self.__compareSchema = False
        #
        self.__startTime = time.time()
        stamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), stamp)

    def tearDown(self):
        elapsed = time.time() - self.__startTime
        stamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), stamp, elapsed)

    def testBuildSchemaDefs(self):
        """Build a schema definition for every configured database and data typing."""
        try:
            for dbName in self.__databaseNameList:
                for typing in self.__dataTypingList:
                    logger.debug("Building schema %s with types %s", dbName, typing)
                    self.__schP.makeSchemaDef(dbName, dataTyping=typing, saveSchema=self.__saveSchema)
                    if not self.__compareDefSchema:
                        continue
                    self.__schP.schemaDefCompare(dbName, typing)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()

    def testBuildCollectionSchema(self):
        """Build collection schemas for each database, encoding type and validation level."""
        difPaths = []
        for dbName in self.__databaseNameList:
            schemaDef = self.__schP.makeSchemaDef(dbName, dataTyping="ANY", saveSchema=False)
            accessOb = SchemaDefAccess(schemaDef)
            for info in accessOb.getCollectionInfo():
                colName = info["NAME"]
                # The "rcsb" encoding is not built here.
                encodings = [enc for enc in self.__encodingTypes if enc.lower() != "rcsb"]
                for enc in encodings:
                    for lvl in self.__validationLevels:
                        self.__schP.makeSchema(dbName, colName, encodingType=enc, level=lvl, saveSchema=self.__saveSchema)
                        if not (self.__compareSchema and enc.lower() == "json"):
                            continue
                        difPath = self.__schP.jsonSchemaCompare(dbName, colName, enc, lvl)
                        if difPath:
                            difPaths.append(difPath)
        if difPaths:
            logger.info("Path dif list %r", difPaths)

    def testCompareSchema(self):
        """Compare a saved pdbx_core_entry JSON schema with a freshly built one."""
        savedPath = os.path.join(HERE, "test-saved-output", "json-full-db-pdbx_core-col-pdbx_core_entry.json")
        marshal = MarshalUtil(workPath=os.path.join(HERE, "test-output"))
        savedSchema = marshal.doImport(savedPath, fmt="json")
        builtSchema = self.__schP.makeSchema("pdbx_core", "pdbx_core_entry", encodingType="json", level="full")
        difCount, difDetails = self.__schP.schemaCompare(savedSchema, builtSchema)
        logger.debug("numDiffs %d", difCount)
        self.assertGreaterEqual(difCount, 141)
        self.assertGreaterEqual(len(difDetails["changed"]), 160)
        logger.debug("difD %r", difDetails)

    @unittest.skip("Deprecated test")
    def testCompareSchemaCategories(self):
        """Compare common categories across schema definitions."""
        try:
            ccDef = self.__schP.makeSchemaDef("chem_comp_core", dataTyping="ANY", saveSchema=False)
            ccAccess = SchemaDefAccess(ccDef)
            bccDef = self.__schP.makeSchemaDef("bird_chem_comp_core", dataTyping="ANY", saveSchema=False)
            bccAccess = SchemaDefAccess(bccDef)
            #
            logger.info("")
            for sId in ["CHEM_COMP", "PDBX_CHEM_COMP_AUDIT"]:
                ccAttrs = ccAccess.getAttributeIdList(sId)
                bccAttrs = bccAccess.getAttributeIdList(sId)

                logger.debug("%s attributes (%d) %r", sId, len(ccAttrs), ccAttrs)
                logger.debug("%s attributes (%d) %r", sId, len(bccAttrs), bccAttrs)

                # Attributes present in chem_comp_core but missing from bird_chem_comp_core.
                missing = set(ccAttrs).difference(bccAttrs)
                if missing:
                    logger.info("For %s attribute differences %r", sId, missing)
                self.assertEqual(len(missing), 0)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()

    def testBuildColSchemaWithRefs(self):
        """Build ihm_dev_full collection schemas with parent references and primary keys."""
        for dbName in ["ihm_dev_full"]:
            schemaDef = self.__schP.makeSchemaDef(dbName, dataTyping="ANY", saveSchema=False)
            accessOb = SchemaDefAccess(schemaDef)
            for info in accessOb.getCollectionInfo():
                colName = info["NAME"]
                # The "rcsb" encoding is not built here.
                encodings = [enc for enc in self.__encodingTypes if enc.lower() != "rcsb"]
                for enc in encodings:
                    for lvl in self.__validationLevels:
                        self.__schP.makeSchema(dbName, colName, encodingType=enc, level=lvl, saveSchema=True, extraOpts="addParentRefs|addPrimaryKey")
# Example no. 4
# 0
class ClusterDataPrepValidateTests(unittest.TestCase):
    """Validate sequence cluster documents against generated JSON schemas.

    Documents come from ClusterDataPrep.extract() (or the provenance provider
    for the provenance collection) and are checked with a Draft4Validator
    built from SchemaProvider.makeSchema().
    """

    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        # self.__levels = ['100', '95', '90', '70', '50', '30']
        self.__levels = ["100"]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsStrict(self):
        """Validate all sequence cluster collections at the "full" validation level."""
        updateId = self.__updateId
        validationLevel = "full"
        eCount = self.__testValidateOpts(updateId, validationLevel=validationLevel)
        logger.info("Total validation errors validation level %s : %d", validationLevel, eCount)
        self.assertLessEqual(eCount, 1)

    def __testValidateOpts(self, updateId, validationLevel="full"):
        """Validate every sequence cluster collection and count the errors.

        Args:
            updateId (str): accepted for interface symmetry; not used here
            validationLevel (str, optional): validation level passed to makeSchema()

        Returns:
            int: total number of validation errors reported for all documents
        """
        _ = updateId
        databaseNames = ["sequence_clusters"]
        collectionNames = {"sequence_clusters": ["cluster_provenance", "cluster_members", "entity_members"]}
        #
        eCount = 0
        for databaseName in databaseNames:
            # The schema definition depends only on the database name, so it is
            # built once here rather than once per collection.
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            for collectionName in collectionNames[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=validationLevel, saveSchema=True)
                #
                dL = self.__getSequenceClusterData(collectionName, levels=self.__levels, dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for dD in dL:
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message)
                            logger.info(">>> failing object is %r", dD)
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d", databaseName, collectionName, cCount)
                    except Exception as e:
                        # Best effort: a failure while validating one document
                        # is logged and the remaining documents are still checked.
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __fetchProvenance(self):
        """Fetch the sequence cluster provenance dictionary.

        Returns:
            dict: content stored under "rcsb_entity_sequence_cluster_prov", or an
                empty dictionary when that key is absent

        Any exception is logged and reported as a test failure.
        """
        try:
            provKeyName = "rcsb_entity_sequence_cluster_prov"
            provU = ProvenanceProvider(self.__cfgOb, self.__cachePath, useCache=True)
            pD = provU.fetch()
            return pD[provKeyName] if provKeyName in pD else {}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))

    def __getSequenceClusterData(self, collectionName, dataSetId=None, dataLocator=None, levels=None):
        """Extract the documents for one sequence cluster collection.

        Args:
            collectionName (str): "cluster_provenance", "entity_members" or "cluster_members"
            dataSetId (str, optional): data set identifier passed to extract()
            dataLocator (str, optional): cluster data location passed to extract()
            levels (list, optional): cluster levels passed to extract()

        Returns:
            list: documents for the requested collection

        Any exception, including a failed assertion, is logged and reported as
        a test failure; an unsupported collection name also fails the test.
        """
        try:
            #
            if collectionName == "cluster_provenance":
                return [self.__fetchProvenance()]
            #
            entitySchemaName = "rcsb_entity_sequence_cluster_list"
            clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list"
            cdp = ClusterDataPrep(workPath=self.__cachePath, entitySchemaName=entitySchemaName, clusterSchemaName=clusterSchemaName)
            cifD, docBySequenceD, docByClusterD = cdp.extract(dataSetId, clusterSetLocator=dataLocator, levels=levels, clusterType="entity")
            self.assertEqual(len(cifD), 1)
            self.assertEqual(len(docBySequenceD), 1)
            self.assertEqual(len(docByClusterD), 1)
            if collectionName == "entity_members":
                return docBySequenceD[entitySchemaName]
            if collectionName == "cluster_members":
                return docByClusterD[clusterSchemaName]

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail("Failing with %s" % str(e))
        # Reached only for a collection name none of the branches above handle;
        # fail explicitly instead of handing None to the validation loop.
        self.fail("Unsupported collection name %r" % collectionName)
# Example no. 5
# 0
def _getDatabaseFlagSpecs():
    """Return the per-database command line switches.

    Each entry is an (option name, database name, help text) tuple.  The tuple
    order is the order in which the switches are registered with the argument
    parser.
    """
    return (
        ("update_chem_comp_ref", "chem_comp", "Update schema for Chemical Component reference definitions"),
        ("update_chem_comp_core_ref", "chem_comp_core", "Update core schema for Chemical Component reference definitions"),
        ("update_bird_chem_comp_ref", "bird_chem_comp", "Update schema for Bird Chemical Component reference definitions"),
        ("update_bird_chem_comp_core_ref", "bird_chem_comp_core", "Update core schema for Bird Chemical Component reference definitions"),
        ("update_bird_ref", "bird", "Update schema for Bird reference definitions"),
        ("update_bird_family_ref", "bird_family", "Update schema for Bird Family reference definitions"),
        ("update_pdbx", "pdbx", "Update schema for PDBx entry data"),
        ("update_pdbx_core", "pdbx_core", "Update schema for PDBx core entry/entity data"),
        ("update_pdbx_comp_model_core", "pdbx_comp_model_core", "Update schema for PDBx computational model core entry/entity data"),
        ("update_repository_holdings", "repository_holdings", "Update schema for repository holdings"),
        ("update_entity_sequence_clusters", "sequence_clusters", "Update schema for entity sequence clusters"),
        ("update_data_exchange", "data_exchange", "Update schema for data exchange status"),
        ("update_ihm_dev", "ihm_dev", "Update schema for I/HM dev entry data"),
        ("update_drugbank_core", "drugbank_core", "Update DrugBank schema"),
    )


def _compareWithCachedSchemas(schP, databaseNameList, dataTypingList, encodingTypes, validationLevels):
    """Compare schema definitions and JSON schemas with the cached copies and log any difference paths."""
    difPathList = []
    for databaseName in databaseNameList:
        for dataTyping in dataTypingList:
            logger.debug("Building schema %s with types %s", databaseName, dataTyping)
            pth = schP.schemaDefCompare(databaseName, dataTyping)
            if pth:
                difPathList.append(pth)
    if difPathList:
        logger.info("Schema definition difference path list %r", difPathList)
    #
    difPathList = []
    for databaseName in databaseNameList:
        # Collection names are taken from a schema definition built without saving it.
        dD = schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=False)
        sD = SchemaDefAccess(dD)
        for cd in sD.getCollectionInfo():
            collectionName = cd["NAME"]
            for encodingType in encodingTypes:
                # Only JSON encoded schemas are compared here.
                if encodingType.lower() != "json":
                    continue
                for level in validationLevels:
                    pth = schP.jsonSchemaCompare(databaseName, collectionName, encodingType, level)
                    if pth:
                        difPathList.append(pth)
    if difPathList:
        logger.info("JSON schema difference path list %r", difPathList)


def _buildAndSaveSchemas(schP, databaseNameList, dataTypingList, encodingTypes, validationLevels, scnD):
    """Create and save schema definitions ("rcsb" encoding) and per-collection schemas (any other encoding)."""
    for databaseName in databaseNameList:
        for encodingType in encodingTypes:
            if encodingType == "rcsb":
                for dataTyping in dataTypingList:
                    logger.info("Creating schema definition for content type %s data typing %s", databaseName, dataTyping)
                    schP.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
                continue
            # Databases without an entry in the collection name map are skipped.
            if databaseName not in scnD:
                continue
            for dD in scnD[databaseName]:
                collectionName = dD["NAME"]
                for validationLevel in validationLevels:
                    logger.info("Creating %r schema for content type %s collection %s", encodingType, databaseName, collectionName)
                    schP.makeSchema(databaseName, collectionName, encodingType=encodingType, level=validationLevel, saveSchema=True)


def main():
    """Command line entry point for updating or comparing schema files.

    The databases to process are selected either with the individual
    --update_<name> switches or with one of the --update_config_* switches,
    which replace the database, data typing, validation level and encoding
    type lists with values read from the "database_catalog_configuration"
    section.  When several --update_config_* switches are given the lists are
    applied in the order deployed, all, test, so the last one given wins.

    With --compare_only the schemas are compared with the cached copies and
    difference paths are logged; otherwise schemas are created and saved.

    The configuration file path comes from --config_path or, failing that, the
    DBLOAD_CONFIG_PATH environment variable.  The process exits with status 1
    when the file is missing or unreadable, or when the configuration cannot be
    loaded.
    """
    # Local import: sys.exit() is used instead of the site-provided exit() builtin,
    # which is intended for interactive use and may be absent (e.g. python -S).
    import sys

    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    databaseFlagSpecs = _getDatabaseFlagSpecs()
    for optionName, _, helpText in databaseFlagSpecs:
        parser.add_argument("--" + optionName, default=False, action="store_true", help=helpText)
    #
    for suffix in ("all", "deployed", "test"):
        parser.add_argument(
            "--update_config_" + suffix,
            default=False,
            action="store_true",
            help="Update using configuration settings (e.g. DATABASE_NAMES_" + suffix.upper() + ")",
        )
    #
    parser.add_argument("--config_path", default=None, help="Path to configuration options file")
    parser.add_argument("--config_name", default=defaultConfigName, help="Configuration section name")
    #
    parser.add_argument("--cache_path", default=None, help="Schema cache directory path")
    parser.add_argument("--encoding_types", default=None, help="Schema encoding (rcsb|json|bson) (comma separated)")
    parser.add_argument("--validation_levels", default=None, help="Schema validation level (full|min) (comma separated)")
    parser.add_argument("--compare_only", default=False, action="store_true", help="Perform comparison with cached schema")
    #
    parser.add_argument("--debug", default=False, action="store_true", help="Turn on verbose logging")
    parser.add_argument("--mock", default=False, action="store_true", help="Use MOCK repository configuration for dependencies and testing")
    # parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
    args = parser.parse_args()
    #
    if args.debug:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    cachePath = args.cache_path
    compareOnly = args.compare_only
    #
    encodingTypes = args.encoding_types.split(",") if args.encoding_types else []
    validationLevels = args.validation_levels.split(",") if args.validation_levels else []
    dataTypingList = ["ANY", "SQL"]

    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    # Check explicitly for a missing path rather than relying on os.access(None) raising.
    if not configPath or not os.access(configPath, os.R_OK):
        logger.error("Missing or access issue with config file %r", configPath)
        sys.exit(1)
    try:
        os.environ["DBLOAD_CONFIG_PATH"] = configPath
        logger.info("Using configuation path %s (%s)", configPath, configName)
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=defaultConfigName, mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s", configPath, str(e))
        sys.exit(1)
    #
    databaseNameList = [databaseName for optionName, databaseName, _ in databaseFlagSpecs if getattr(args, optionName)]
    #
    # Configuration driven selections replace all four lists; a later selection overrides an earlier one.
    catalogSection = "database_catalog_configuration"
    configSelections = (
        (args.update_config_deployed, "DEPLOYED"),
        (args.update_config_all, "ALL"),
        (args.update_config_test, "TEST"),
    )
    for selected, suffix in configSelections:
        if not selected:
            continue
        databaseNameList = cfgOb.getList("DATABASE_NAMES_" + suffix, sectionName=catalogSection)
        dataTypingList = cfgOb.getList("DATATYPING_" + suffix, sectionName=catalogSection)
        validationLevels = cfgOb.getList("VALIDATION_LEVELS_" + suffix, sectionName=catalogSection)
        encodingTypes = cfgOb.getList("ENCODING_TYPES_" + suffix, sectionName=catalogSection)
    #
    scnD = cfgOb.get("document_collection_names", sectionName="document_helper_configuration")
    #
    # Remove duplicate database names (ordering is not preserved).
    databaseNameList = list(set(databaseNameList))
    logger.debug("Collections %s", list(scnD.items()))
    logger.debug("databaseNameList %s", databaseNameList)

    # The cache is used only when comparing against cached schemas.
    schP = SchemaProvider(cfgOb, cachePath, useCache=compareOnly)
    if compareOnly:
        _compareWithCachedSchemas(schP, databaseNameList, dataTypingList, encodingTypes, validationLevels)
    else:
        _buildAndSaveSchemas(schP, databaseNameList, dataTypingList, encodingTypes, validationLevels, scnD)
Ejemplo n.º 6
0
class ChemRefDataPrepValidateTests(unittest.TestCase):
    """Validate external chemical reference documents against generated JSON schema.

    Only the "DrugBank" resource is handled by the visible code.
    """

    def setUp(self):
        """Build the configuration and schema provider from the example config and mock data paths."""
        self.__verbose = True
        self.__configName = "site_info_configuration"
        # All fixture locations are resolved below the project top-level directory.
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        #
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=self.__configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__startTime = time.time()
        startStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), startStamp)

    def tearDown(self):
        """Log the completion time and the elapsed wall-clock time of the test."""
        elapsed = time.time() - self.__startTime
        endStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), endStamp, elapsed)

    def testValidateFull(self):
        """Run DrugBank validation at the "full" schema level (the error count is not asserted)."""
        self.__validateChemRef("DrugBank", schemaLevel="full")

    def __validateChemRef(self, extResource, schemaLevel="full"):
        """Validate the documents of an external resource and return the error count.

        Args:
            extResource: resource name; anything other than "DrugBank" yields 0.
            schemaLevel: schema level passed on to the schema provider.

        Returns:
            Total number of validation errors reported for the resource documents.
        """
        if extResource != "DrugBank":
            return 0
        #
        username = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=self.__configName)
        password = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=self.__configName)
        provider = DrugBankProvider(cachePath=self.__cachePath, useCache=True, username=username, password=password)
        documents = provider.getDocuments()
        logger.info("Validating %d Drugbank documents", len(documents))
        return self.__validate("drugbank_core", ["drugbank_core"], documents, schemaLevel=schemaLevel)

    def __validate(self, databaseName, collectionNames, dList, schemaLevel="full"):
        """Validate every document in dList against the JSON schema of each collection.

        Args:
            databaseName: database (schema) name passed to the schema provider.
            collectionNames: iterable of collection names to validate against.
            dList: list of documents to validate.
            schemaLevel: schema level passed to makeSchema().

        Returns:
            Total number of validation errors over all collections and documents.
        """
        totalErrors = 0
        for collectionName in collectionNames:
            # The schema definition is built and saved before the JSON schema is generated.
            self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            jsonSchema = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
            # Raises exceptions for schema compliance.
            Draft4Validator.check_schema(jsonSchema)
            #
            validator = Draft4Validator(jsonSchema, format_checker=FormatChecker())
            for docIndex, docD in enumerate(dList):
                logger.debug("Database %s collection %s document %d", databaseName, collectionName, docIndex)
                try:
                    docErrors = sorted(validator.iter_errors(docD), key=str)
                    for docError in docErrors:
                        logger.info("database %s collection %s path %s error: %s", databaseName, collectionName, docError.path, docError.message)
                        logger.info(">>> failing object is %r", docD)
                        totalErrors += 1
                    #
                    logger.debug("database %s collection %s count %d", databaseName, collectionName, len(docErrors))
                except Exception as e:
                    # A failure while validating one document does not stop the remaining documents.
                    logger.exception("Validation error %s", str(e))

        return totalErrors
Ejemplo n.º 7
0
class ObjectValidator(object):
    """Fetch documents from a document store collection, validate them against the
    JSON schema for that collection, optionally adapt them, and write them back.
    """

    def __init__(self, cfgOb, objectAdapter=None, cachePath=".", useCache=True, **kwargs):
        """
        Args:
            cfgOb: configuration object passed to SchemaProvider and Connection.
            objectAdapter: optional object exposing filter(obj) returning an
                (ok, obj) pair.  Validation is only performed when an adapter is set.
            cachePath: cache directory passed to SchemaProvider.
            useCache: cache flag passed to SchemaProvider.
            **kwargs: accepted and ignored.
        """
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb, cachePath, useCache=useCache)
        # Validator instance; created per call in __transform().
        self.__valInst = None

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        """Build and return a Draft4Validator for the collection's JSON schema.

        Raises whatever Draft4Validator.check_schema() raises for a non-compliant schema.
        """
        _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        return Draft4Validator(cD, format_checker=FormatChecker())

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        """Log every schema validation error for rObj and return the error count.

        Exceptions raised during validation are logged; the count accumulated
        up to that point is returned.
        """
        eCount = 0
        try:
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info("Database %s collection %s (%s %r) path %s error: %s", databaseName, collectionName, label, tId, error.path, error.message)
                logger.debug(">>> Failing object is %r", rObj)
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount

    def doTransform(self, **kwargs):
        """Select, validate/adapt and replace documents, then record the run status.

        Keyword Args:
            databaseName: target database (default "pdbx_core").
            collectionName: target collection (default "pdbx_core_entry").
            selectionQuery: query dictionary used to select documents (default {}).
            fetchLimit: if truthy, only the first fetchLimit selected documents are processed.
            updateId: status identifier (default: TimeUtil().getCurrentWeekSignature()).
                When falsy no status record is written.

        Returns:
            A truthy value only when the transform succeeded and, if a status
            record was requested, it was recorded successfully.
        """
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        #
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName, selectionQueryD)
        docSelectList = docSelectList[:fetchLimit] if fetchLimit else docSelectList

        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        # Default to success so that skipping the status update (falsy updateId)
        # does not leave okS unbound in the return expression.
        okS = True
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName, ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query.

        An empty list is returned when the collection does not exist or when
        the selection fails (the failure is logged).
        """
        dL = []
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName, collectionName, mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName, collectionName, selectL, queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL, len(dL))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL

    def __transform(self, databaseName, collectionName, docSelectList, logIncrement=100):
        """Fetch each selected document, validate/adapt it if an adapter is set, and replace it.

        Args:
            databaseName: target database.
            collectionName: target collection.
            docSelectList: list of dictionaries carrying the "_id" of each document
                to process; entries without "_id" are skipped.
            logIncrement: progress is logged every logIncrement documents and at the end.

        Returns:
            Falsy when any replacement failed or an exception interrupted
            processing; otherwise truthy.
        """
        #
        ok = True
        try:
            self.__valInst = self.__getValidator(databaseName, collectionName, schemaLevel="full")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id", dD["_id"])
                        # The store identifier is removed before validation and replacement.
                        del rObj["_id"]
                        #
                        fOk = True

                        if self.__oAdapt:
                            # Validation results are only logged; they do not gate the replacement.
                            self.__validateObj(databaseName, collectionName, rObj, label="Original")
                            fOk, rObj = self.__oAdapt.filter(rObj)
                            self.__validateObj(databaseName, collectionName, rObj, label="Updated")
                        if fOk:
                            rOk = mg.replace(databaseName, collectionName, rObj, dD)
                            if rOk is None:
                                tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                logger.error("%r %r (%r) failing", databaseName, collectionName, tId)
                                rOk = False
                            ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)", ok, ii, numDoc)
                        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            # An exception aborts the remaining documents, so the run must not be reported as successful.
            ok = False
        return ok

    def getLoadStatus(self):
        """Return the list of status records accumulated by doTransform() calls."""
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status, startTimestamp):
        """Append a data exchange status record for this run; return True on success, False on failure."""
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False