def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
 def setUp(self):
     self.__isMac = platform.system() == "Darwin"
     self.__excludeType = None if self.__isMac else "optional"
     self.__verbose = True
     #
     fileLimit = 100
     numProc = 2
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     self.__workPath = os.path.join(HERE, "test-output")
     mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                               "exdb-config-example.yml")
     #
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName,
                               mockTopPath=mockTopPath)
     self.__resourceName = "MYSQL_DB"
     #
     self.__schP = SchemaProvider(self.__cfgOb,
                                  self.__cachePath,
                                  useCache=True)
     self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                     numProc=numProc,
                                     fileLimit=fileLimit,
                                     cachePath=self.__cachePath)
     #
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
 def __init__(self,
              cfgOb,
              cachePath,
              useCache=True,
              numProc=2,
              chunkSize=10,
              readBackCheck=False,
              documentLimit=None,
              doValidate=False,
              verbose=False):
     self.__cfgOb = cfgOb
     self.__cachePath = cachePath
     self.__useCache = useCache
     self.__readBackCheck = readBackCheck
     self.__numProc = numProc
     self.__chunkSize = chunkSize
     self.__documentLimit = documentLimit
     #
     self.__resourceName = "MONGO_DB"
     self.__verbose = verbose
     self.__statusList = []
     self.__schP = SchemaProvider(self.__cfgOb,
                                  self.__cachePath,
                                  useCache=self.__useCache)
     self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
     self.__valInst = None
     self.__doValidate = doValidate
Beispiel #4
0
 def setUp(self):
     self.__verbose = True
     self.__numProc = 2
     self.__fileLimit = 100
     self.__chunkSize = 0
     self.__workPath = os.path.join(HERE, "test-output")
     self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                               "exdb-config-example.yml")
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName)
     self.__resourceName = "CRATE_DB"
     self.__schP = SchemaProvider(self.__cfgOb,
                                  self.__workPath,
                                  useCache=True)
     self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                     numProc=self.__numProc,
                                     fileLimit=self.__fileLimit,
                                     cachePath=self.__workPath)
     #
     #
     self.__tableIdSkipD = {
         "ATOM_SITE": True,
         "ATOM_SITE_ANISOTROP": True,
         "__LOAD_STATUS__": True
     }
     self.__ioObj = IoAdapter(verbose=self.__verbose)
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Beispiel #5
0
 def __init__(
     self,
     cfgOb,
     cachePath,
     resourceName="MONGO_DB",
     numProc=4,
     chunkSize=15,
     documentLimit=None,
     verbose=False,
     readBackCheck=False,
     maxStepLength=2000,
     schemaRebuildFlag=False,
 ):
     self.__verbose = verbose
     #
     # Limit the load length of each file type for testing  -  Set to None to remove -
     self.__documentLimit = documentLimit
     self.__maxStepLength = maxStepLength
     #
     # Controls for multiprocessing execution -
     self.__numProc = numProc
     self.__chunkSize = chunkSize
     #
     self.__cfgOb = cfgOb
     self.__resourceName = resourceName
     #
     self.__cachePath = cachePath if cachePath else "."
     self.__schP = SchemaProvider(cfgOb,
                                  cachePath,
                                  useCache=True,
                                  rebuildFlag=schemaRebuildFlag)
     #
     self.__readBackCheck = readBackCheck
     self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"
    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        #self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        #self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = 'addParentRefs|addPrimaryKey'
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Beispiel #7
0
 def __init__(self,
              cfgOb,
              objectAdapter=None,
              cachePath=".",
              useCache=True,
              **kwargs):
     self.__cfgOb = cfgOb
     self.__oAdapt = objectAdapter
     self.__resourceName = "MONGO_DB"
     _ = kwargs
     self.__statusList = []
     self.__schP = SchemaProvider(self.__cfgOb,
                                  cachePath,
                                  useCache=useCache)
     self.__valInst = None
Beispiel #8
0
    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=False)
        #
        self.__validationLevels = self.__cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        self.__encodingTypes = self.__cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
        #
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_DEPLOYED",
                sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_DEPLOYED",
                sectionName="database_catalog_configuration")
            #
        else:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_TEST",
                sectionName="database_catalog_configuration")
            # self.__databaseNameList = ["repository_holdings"]
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_TEST",
                sectionName="database_catalog_configuration")

        # self.__databaseNameList = ["sequence_clusters"]
        self.__saveSchema = True
        self.__compareDefSchema = False
        self.__compareSchema = False
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Beispiel #9
0
 def setUp(self):
     self.__verbose = True
     mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                               "exdb-config-example.yml")
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=pathConfig,
                               defaultSectionName=configName,
                               mockTopPath=mockTopPath)
     self.__schP = SchemaProvider(self.__cfgOb,
                                  self.__cachePath,
                                  useCache=True,
                                  clearPath=False)
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Beispiel #10
0
 def setUp(self):
     self.__verbose = True
     #
     self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     self.__updateId = "2018_25"
     #
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
     self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
     #
     self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
     #
     self.__dataSetId = "2018_23"
     self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
     # self.__levels = ['100', '95', '90', '70', '50', '30']
     self.__levels = ["100"]
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
class SchemaDefLoaderDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoaderDbTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__verbose = True
        #
        fileLimit = 100
        numProc = 2
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__resourceName = "MYSQL_DB"
        #
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=numProc,
                                        fileLimit=fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __schemaCreate(self, schemaDefObj):
        """Create table schema using schema definition"""
        try:
            tableIdList = schemaDefObj.getSchemaIdList()
            sqlGen = SqlGenAdmin(self.__verbose)
            sqlL = sqlGen.createDatabaseSQL(schemaDefObj.getDatabaseName())
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getSchemaObject(tableId)
                sqlL.extend(
                    sqlGen.createTableSQL(
                        databaseName=schemaDefObj.getDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema creation SQL string\n %s\n\n",
                         "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                myQ = MyDbQuery(dbcon=client, verbose=self.__verbose)
                #
                # Permit warnings to support "drop table if exists" for missing tables.
                #
                myQ.setWarning("ignore")
                ret = myQ.sqlCommand(sqlCommandList=sqlL)
                logger.debug("\n\n+INFO mysql server returns %r\n", ret)
                self.assertTrue(ret)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    # ------------- - -------------------------------------------------------------------------------------------
    def testSchemaCreate(self):
        """Create table schema for BIRD, chemical component, and PDBx data."""
        cD = self.__schP.makeSchemaDef("bird",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        cD = self.__schP.makeSchemaDef("chem_comp",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        # cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
        # sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)

    def testLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                sdl.load(inputPathList=inputPathList, loadType="batch-file")
                #
                logger.debug(
                    "INFO BATCH FILE RELOAD TEST --------------------------------------------\n"
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="all")
                self.assertTrue(ok)
                #
                logger.debug(
                    "\n\n\n+INFO BATCH INSERT RELOAD TEST --------------------------------------------\n"
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="selected")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompReference(self):
        try:
            cD = self.__schP.makeSchemaDef("chem_comp",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(
                contentType="chem_comp")
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("Disable test - schema not optimized for mysql limitations")
    def testLoadPdbxFiles(self):
        try:
            cD = self.__schP.makeSchemaDef("pdbx",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="pdbx")
            logger.debug("Input path list %r", inputPathList)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-insert",
                              deleteOpt="all")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #12
0
class ObjectValidator(object):
    """Utilities to extract and update object from the document object server with validation."""
    def __init__(self,
                 cfgOb,
                 objectAdapter=None,
                 cachePath=".",
                 useCache=True,
                 **kwargs):
        self.__cfgOb = cfgOb
        self.__oAdapt = objectAdapter
        self.__resourceName = "MONGO_DB"
        _ = kwargs
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb,
                                     cachePath,
                                     useCache=useCache)
        self.__valInst = None

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        _ = self.__schP.makeSchemaDef(databaseName,
                                      dataTyping="ANY",
                                      saveSchema=True)
        cD = self.__schP.makeSchema(databaseName,
                                    collectionName,
                                    encodingType="JSON",
                                    level=schemaLevel,
                                    saveSchema=True)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        try:
            eCount = 0
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info(
                    "Database %s collection %s (%s %r) path %s error: %s",
                    databaseName, collectionName, label, tId, error.path,
                    error.message)
                logger.debug(">>> Failing object is %r", rObj)
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount

    def doTransform(self, **kwargs):
        desp = DataExchangeStatus()
        statusStartTimestamp = desp.setStartTime()
        #
        databaseName = kwargs.get("databaseName", "pdbx_core")
        collectionName = kwargs.get("collectionName", "pdbx_core_entry")
        selectionQueryD = kwargs.get("selectionQuery", {})
        fetchLimit = kwargs.get("fetchLimit", None)
        #

        #
        tU = TimeUtil()
        updateId = kwargs.get("updateId", tU.getCurrentWeekSignature())
        #
        docSelectList = self.__selectObjectIds(databaseName, collectionName,
                                               selectionQueryD)
        docSelectList = docSelectList[:fetchLimit] if fetchLimit else docSelectList

        ok = self.__transform(databaseName, collectionName, docSelectList)
        #
        if updateId:
            okS = self.__updateStatus(updateId, databaseName, collectionName,
                                      ok, statusStartTimestamp)
        return ok and okS

    def __selectObjectIds(self, databaseName, collectionName, selectionQueryD):
        """Return a list of object identifiers for the input selection query."""
        try:

            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    logger.info("%s %s document count is %d", databaseName,
                                collectionName,
                                mg.count(databaseName, collectionName))
                    qD = {}
                    if selectionQueryD:
                        qD.update(selectionQueryD)
                    selectL = ["_id"]
                    dL = mg.fetch(databaseName,
                                  collectionName,
                                  selectL,
                                  queryD=qD)
                    logger.info("Selection %r fetch result count %d", selectL,
                                len(dL))

        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return dL
        #

    def __transform(self,
                    databaseName,
                    collectionName,
                    docSelectList,
                    logIncrement=100):
        """Return a list of object identifiers for the input selection query."""
        #
        ok = True
        try:
            self.__valInst = self.__getValidator(databaseName,
                                                 collectionName,
                                                 schemaLevel="full")
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if mg.collectionExists(databaseName, collectionName):
                    numDoc = len(docSelectList)
                    for ii, dD in enumerate(docSelectList, 1):
                        if "_id" not in dD:
                            continue
                        rObj = mg.fetchOne(databaseName, collectionName, "_id",
                                           dD["_id"])
                        del rObj["_id"]
                        #
                        fOk = True

                        if self.__oAdapt:
                            self.__validateObj(databaseName,
                                               collectionName,
                                               rObj,
                                               label="Original")
                            fOk, rObj = self.__oAdapt.filter(rObj)
                            self.__validateObj(databaseName,
                                               collectionName,
                                               rObj,
                                               label="Updated")
                        if fOk:
                            rOk = mg.replace(databaseName, collectionName,
                                             rObj, dD)
                            if rOk is None:
                                tId = rObj[
                                    "rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
                                logger.error("%r %r (%r) failing",
                                             databaseName, collectionName, tId)
                                # logger.info("rObj.keys() %r", list(rObj.keys()))
                                # logger.info("rObj.items() %s", rObj.items())
                                rOk = False
                            ok = ok and rOk
                        #
                        if ii % logIncrement == 0 or ii == numDoc:
                            logger.info("Replace status %r object (%d of %d)",
                                        ok, ii, numDoc)
                        #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def getLoadStatus(self):
        return self.__statusList

    def __updateStatus(self, updateId, databaseName, collectionName, status,
                       startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
Beispiel #13
0
class SqlGenTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__sdu = SchemaProvider(self.__cfgOb,
                                    self.__cachePath,
                                    useCache=True)
        #

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def testSQLMethods(self):
        schemaNames = ["pdbx_core"]
        dataTyping = "SQL"
        for schemaName in schemaNames:
            dD = self.__sdu.makeSchemaDef(schemaName,
                                          dataTyping=dataTyping,
                                          saveSchema=False)
            sD = SchemaDefAccess(dD)
            self.__testSchemaCreate(sD)
            self.__testImportExport(sD)
            self.__testSelectionAndConditions(sD)

    #

    def __getHelper(self, modulePath, **kwargs):
        aMod = __import__(modulePath, globals(), locals(), [""])
        sys.modules[modulePath] = aMod
        #
        # Strip off any leading path to the module before we instaniate the object.
        mpL = modulePath.split(".")
        moduleName = mpL[-1]
        #
        aObj = getattr(aMod, moduleName)(**kwargs)
        return aObj

    def __testSchemaCreate(self, sD):
        """Test case -  create table schema using input schema definition as an example
        """

        try:
            tableIdList = sD.getSchemaIdList()
            myAd = SqlGenAdmin(self.__verbose)
            sqlL = []
            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                sqlL.extend(
                    myAd.createTableSQL(databaseName=sD.getDatabaseName(),
                                        tableDefObj=tableDefObj))
                logger.debug(
                    "\n\n+SqlGenTests table creation SQL string\n %s\n\n",
                    "\n".join(sqlL))
            self.assertGreaterEqual(len(sqlL), 10)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testImportExport(self, sD):
        """Test case -  import and export commands --
        """

        try:
            databaseName = sD.getDatabaseName()
            tableIdList = sD.getSchemaIdList()
            myAd = SqlGenAdmin(self.__verbose)
            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                exportPath = os.path.join(HERE, "test-output",
                                          tableDefObj.getName() + ".tdd")
                sqlExport = myAd.exportTable(databaseName,
                                             tableDefObj,
                                             exportPath=exportPath)
                logger.debug(
                    "\n\n+SqlGenTests table export SQL string\n %s\n\n",
                    sqlExport)
                sqlImport = myAd.importTable(databaseName,
                                             tableDefObj,
                                             importPath=exportPath)
                logger.debug(
                    "\n\n+SqlGenTests table import SQL string\n %s\n\n",
                    sqlImport)
                self.assertGreaterEqual(len(sqlImport), 100)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __testSelectionAndConditions(self, sD):
        """Test case -  selection everything for a simple condition-
        """
        try:
            # get delete attribute -
            #
            tableIdList = sD.getSchemaIdList()
            logger.debug("TableIdList %r", tableIdList)
            sqlGen = SqlGenQuery(schemaDefObj=sD, verbose=self.__verbose)

            for tableId in tableIdList:
                tableDefObj = sD.getSchemaObject(tableId)
                dAtId = tableDefObj.getDeleteAttributeId()

                if dAtId:
                    sqlCondition = SqlGenCondition(schemaDefObj=sD,
                                                   verbose=self.__verbose)
                    sqlCondition.addValueCondition((tableId, dAtId), "EQ",
                                                   ("D000001", "CHAR"))
                    aIdList = sD.getAttributeIdList(tableId)
                    for aId in aIdList:
                        sqlGen.addSelectAttributeId(attributeTuple=(tableId,
                                                                    aId))
                    sqlGen.setCondition(sqlCondition)
                    sqlGen.addOrderByAttributeId(attributeTuple=(tableId,
                                                                 dAtId))
                    sqlS = sqlGen.getSql()
                    logger.debug(
                        "\n\n+SqlGenTests table creation SQL string\n %s\n\n",
                        sqlS)
                    self.assertGreaterEqual(len(sqlS), 50)
                    sqlGen.clear()
                else:
                    logger.debug("Missing delete atttribe for table %r",
                                 tableId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #14
0
class CockroachDbLoaderCockroachDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(CockroachDbLoaderCockroachDbTests, self).__init__(methodName)
        self.__verbose = True
        self.__createFlag = False

    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
        self.__resourceName = "COCKROACH_DB"
        self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath)
        #
        self.__tableIdSkipD = {"ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True}
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testConnection(self):
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                self.assertNotEqual(client, None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaCreate(self):
        """Create table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaRemove(self):
        """Remove table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertBirdReference(self):

        try:

            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyBirdReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertChemCompReference(self):

        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyChemCompReference(self):

        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaCreateSQL(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        sqlL = []
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return sqlL

    def __schemaCreate(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))

            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
            logger.info("Creating schema using database %s", schemaDefObj.getVersionedDatabaseName())
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.info("Schema create command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaRemove(self, schemaDefObj):
        """Test case -  remove table schema using schema definition"""
        ret = 0
        try:
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            sqlL = sqlGen.removeDatabaseSQL(dbName)
            logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.debug("Schema remove command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #15
0
def main():
    parser = argparse.ArgumentParser()
    #
    defaultConfigName = "site_info_configuration"
    #
    parser.add_argument(
        "--update_chem_comp_ref",
        default=False,
        action="store_true",
        help="Update schema for Chemical Component reference definitions")
    parser.add_argument(
        "--update_chem_comp_core_ref",
        default=False,
        action="store_true",
        help="Update core schema for Chemical Component reference definitions")
    parser.add_argument(
        "--update_bird_chem_comp_ref",
        default=False,
        action="store_true",
        help="Update schema for Bird Chemical Component reference definitions")
    parser.add_argument(
        "--update_bird_chem_comp_core_ref",
        default=False,
        action="store_true",
        help=
        "Update core schema for Bird Chemical Component reference definitions")

    parser.add_argument("--update_bird_ref",
                        default=False,
                        action="store_true",
                        help="Update schema for Bird reference definitions")
    parser.add_argument(
        "--update_bird_family_ref",
        default=False,
        action="store_true",
        help="Update schema for Bird Family reference definitions")

    parser.add_argument("--update_pdbx",
                        default=False,
                        action="store_true",
                        help="Update schema for PDBx entry data")
    parser.add_argument("--update_pdbx_core",
                        default=False,
                        action="store_true",
                        help="Update schema for PDBx core entry/entity data")
    parser.add_argument(
        "--update_pdbx_comp_model_core",
        default=False,
        action="store_true",
        help="Update schema for PDBx computational model core entry/entity data"
    )
    #
    parser.add_argument("--update_repository_holdings",
                        default=False,
                        action="store_true",
                        help="Update schema for repository holdings")
    parser.add_argument("--update_entity_sequence_clusters",
                        default=False,
                        action="store_true",
                        help="Update schema for entity sequence clusters")
    parser.add_argument("--update_data_exchange",
                        default=False,
                        action="store_true",
                        help="Update schema for data exchange status")
    parser.add_argument("--update_ihm_dev",
                        default=False,
                        action="store_true",
                        help="Update schema for I/HM dev entry data")
    parser.add_argument("--update_drugbank_core",
                        default=False,
                        action="store_true",
                        help="Update DrugBank schema")
    #
    parser.add_argument(
        "--update_config_all",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_ALL)")
    parser.add_argument(
        "--update_config_deployed",
        default=False,
        action="store_true",
        help=
        "Update using configuration settings (e.g. DATABASE_NAMES_DEPLOYED)")
    parser.add_argument(
        "--update_config_test",
        default=False,
        action="store_true",
        help="Update using configuration settings (e.g. DATABASE_NAMES_TEST)")
    #
    parser.add_argument("--config_path",
                        default=None,
                        help="Path to configuration options file")
    parser.add_argument("--config_name",
                        default=defaultConfigName,
                        help="Configuration section name")
    #
    parser.add_argument("--cache_path",
                        default=None,
                        help="Schema cache directory path")
    parser.add_argument(
        "--encoding_types",
        default=None,
        help="Schema encoding (rcsb|json|bson) (comma separated)")
    parser.add_argument(
        "--validation_levels",
        default=None,
        help="Schema validation level (full|min) (comma separated)")
    parser.add_argument("--compare_only",
                        default=False,
                        action="store_true",
                        help="Perform comparison with cached schema")
    #
    parser.add_argument("--debug",
                        default=False,
                        action="store_true",
                        help="Turn on verbose logging")
    parser.add_argument(
        "--mock",
        default=False,
        action="store_true",
        help="Use MOCK repository configuration for dependencies and testing")
    # parser.add_argument("--working_path", default=None, help="Working/alternative path for temporary and schema files")
    args = parser.parse_args()
    #
    debugFlag = args.debug
    if debugFlag:
        logger.setLevel(logging.DEBUG)
    # ----------------------- - ----------------------- - ----------------------- - ----------------------- - ----------------------- -
    #                                       Configuration Details
    configPath = args.config_path
    configName = args.config_name
    cachePath = args.cache_path
    compareOnly = args.compare_only
    #
    encodingTypes = args.encoding_types.split(
        ",") if args.encoding_types else []
    validationLevels = args.validation_levels.split(
        ",") if args.validation_levels else []
    dataTypingList = ["ANY", "SQL"]

    if not configPath:
        configPath = os.getenv("DBLOAD_CONFIG_PATH", None)
    try:
        if os.access(configPath, os.R_OK):
            os.environ["DBLOAD_CONFIG_PATH"] = configPath
            logger.info("Using configuation path %s (%s)", configPath,
                        configName)
        else:
            logger.error("Missing or access issue with config file %r",
                         configPath)
            exit(1)
        mockTopPath = os.path.join(TOPDIR, "rcsb",
                                   "mock-data") if args.mock else None
        cfgOb = ConfigUtil(configPath=configPath,
                           defaultSectionName=defaultConfigName,
                           mockTopPath=mockTopPath)
        if configName != defaultConfigName:
            cfgOb.replaceSectionName(defaultConfigName, configName)
    except Exception as e:
        logger.error("Missing or access issue with config file %r with %s",
                     configPath, str(e))
        exit(1)
    #
    databaseNameList = []
    if args.update_chem_comp_ref:
        databaseNameList.append("chem_comp")

    if args.update_bird_chem_comp_ref:
        databaseNameList.append("bird_chem_comp")

    if args.update_chem_comp_core_ref:
        databaseNameList.append("chem_comp_core")

    if args.update_bird_chem_comp_core_ref:
        databaseNameList.append("bird_chem_comp_core")

    if args.update_bird_ref:
        databaseNameList.append("bird")

    if args.update_bird_family_ref:
        databaseNameList.append("bird_family")

    if args.update_pdbx:
        databaseNameList.append("pdbx")

    if args.update_pdbx_core:
        databaseNameList.append("pdbx_core")

    if args.update_pdbx_comp_model_core:
        databaseNameList.append("pdbx_comp_model_core")

    if args.update_repository_holdings:
        databaseNameList.append("repository_holdings")

    if args.update_entity_sequence_clusters:
        databaseNameList.append("sequence_clusters")

    if args.update_data_exchange:
        databaseNameList.append("data_exchange")

    if args.update_ihm_dev:
        databaseNameList.append("ihm_dev")

    if args.update_drugbank_core:
        databaseNameList.append("drugbank_core")

    if args.update_config_deployed:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_DEPLOYED",
            sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_DEPLOYED",
            sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_DEPLOYED",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_DEPLOYED",
            sectionName="database_catalog_configuration")

    if args.update_config_all:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_ALL", sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_ALL", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_ALL",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_ALL", sectionName="database_catalog_configuration")

    if args.update_config_test:
        databaseNameList = cfgOb.getList(
            "DATABASE_NAMES_TEST",
            sectionName="database_catalog_configuration")
        dataTypingList = cfgOb.getList(
            "DATATYPING_TEST", sectionName="database_catalog_configuration")
        validationLevels = cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        encodingTypes = cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
    #
    scnD = cfgOb.get("document_collection_names",
                     sectionName="document_helper_configuration")
    #
    databaseNameList = list(set(databaseNameList))
    logger.debug("Collections %s", list(scnD.items()))
    logger.debug("databaseNameList %s", databaseNameList)

    if compareOnly:
        schP = SchemaProvider(cfgOb, cachePath, useCache=True)
        difPathList = []
        for databaseName in databaseNameList:
            for dataTyping in dataTypingList:
                logger.debug("Building schema %s with types %s", databaseName,
                             dataTyping)
                pth = schP.schemaDefCompare(databaseName, dataTyping)
                if pth:
                    difPathList.append(pth)
        if difPathList:
            logger.info("Schema definition difference path list %r",
                        difPathList)
        difPathList = []
        for databaseName in databaseNameList:
            dD = schP.makeSchemaDef(databaseName,
                                    dataTyping="ANY",
                                    saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for encodingType in encodingTypes:
                    if encodingType.lower() != "json":
                        continue
                    for level in validationLevels:
                        pth = schP.jsonSchemaCompare(databaseName,
                                                     collectionName,
                                                     encodingType, level)
                        if pth:
                            difPathList.append(pth)
        if difPathList:
            logger.info("JSON schema difference path list %r", difPathList)

    else:
        schP = SchemaProvider(cfgOb, cachePath, useCache=False)
        for databaseName in databaseNameList:
            for encodingType in encodingTypes:
                if encodingType == "rcsb":
                    for dataTyping in dataTypingList:
                        logger.info(
                            "Creating schema definition for content type %s data typing %s",
                            databaseName, dataTyping)
                        schP.makeSchemaDef(databaseName,
                                           dataTyping=dataTyping,
                                           saveSchema=True)
                else:
                    if databaseName in scnD:
                        for dD in scnD[databaseName]:
                            collectionName = dD["NAME"]
                            for validationLevel in validationLevels:
                                logger.info(
                                    "Creating %r schema for content type %s collection %s",
                                    encodingType, databaseName, collectionName)
                                schP.makeSchema(databaseName,
                                                collectionName,
                                                encodingType=encodingType,
                                                level=validationLevel,
                                                saveSchema=True)
Beispiel #16
0
class SchemaDefAccessTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True,
                                     clearPath=False)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testAccess(self):
        databaseNames = ["pdbx_core", "bird_chem_comp_core"]
        dataTypingList = ["ANY", "SQL"]
        for databaseName in databaseNames:
            for dataTyping in dataTypingList:
                self.__testAccess(databaseName, dataTyping)

    def __testAccess(self, databaseName, dataTyping):
        try:
            sD = self.__schP.makeSchemaDef(databaseName,
                                           dataTyping=dataTyping,
                                           saveSchema=False)
            ok = self.__testAccessors(sD)
            self.assertTrue(ok)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return {}

    def __testAccessors(self, schemaDef):
        """Verify data and accessor mapping -"""

        sd = SchemaDefAccess(schemaDef)
        logger.debug("Schema name %s", sd.getName())
        logger.debug("Schema name %s", sd.getAppName())

        logger.debug("Database name %s", sd.getDatabaseName())
        logger.debug("Versioned database name %s",
                     sd.getVersionedDatabaseName())

        logger.debug("Collection info %r", sd.getCollectionInfo())

        for dS in sd.getDataSelectorNames():
            logger.debug("Selector %s %r", dS, sd.getDataSelectors(dS))

        collectionInfoL = sd.getCollectionInfo()
        for dD in collectionInfoL:
            collectionName = dD["NAME"]

            logger.debug("Collection excluded %r",
                         sd.getCollectionExcluded(collectionName))
            logger.debug("Collection included %r",
                         sd.getCollectionSelected(collectionName))
            logger.debug("Collection document key attribute names %r",
                         sd.getDocumentKeyAttributeNames(collectionName))

        schemaIdList = sd.getSchemaIdList()
        for schemaId in schemaIdList:
            #
            aIdL = sd.getAttributeIdList(schemaId)
            tObj = sd.getSchemaObject(schemaId)
            attributeIdList = tObj.getAttributeIdList()
            self.assertEqual(len(aIdL), len(attributeIdList))
            attributeNameList = tObj.getAttributeNameList()
            logger.debug("Ordered attribute Id   list %s",
                         str(attributeIdList))
            logger.debug("Ordered attribute name list %s",
                         str(attributeNameList))
            #
            mAL = tObj.getMapAttributeNameList()
            logger.debug("Ordered mapped attribute name list %s", str(mAL))

            mAL = tObj.getMapAttributeIdList()
            logger.debug("Ordered mapped attribute id   list %s", str(mAL))

            cL = tObj.getMapInstanceCategoryList()
            logger.debug("Mapped category list %s", str(cL))

            for cV in cL:
                aL = tObj.getMapInstanceAttributeList(cV)
                logger.debug("Mapped attribute list in %s :  %s", cV, str(aL))
        return True
Beispiel #17
0
class ChemRefEtlWorker(object):
    """Prepare and load chemical reference data collections."""
    def __init__(self,
                 cfgOb,
                 cachePath,
                 useCache=True,
                 numProc=2,
                 chunkSize=10,
                 readBackCheck=False,
                 documentLimit=None,
                 verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=self.__useCache)
        #

    def __updateStatus(self, updateId, databaseName, collectionName, status,
                       startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def load(self, updateId, extResource, loadType="full"):
        """Load chemical reference integrated data for the input external resource-"""
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            if extResource == "DrugBank":
                databaseName = "drugbank_core"
                configName = self.__cfgOb.getDefaultSectionName()
                user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME",
                                        sectionName=configName)
                pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD",
                                      sectionName=configName)
                #
                dbP = DrugBankProvider(cachePath=self.__cachePath,
                                       useCache=self.__useCache,
                                       username=user,
                                       password=pw)
                #
                crExt = ChemRefExtractor(self.__cfgOb)
                idD = crExt.getChemCompAccessionMapping(extResource)
                dList = dbP.getDocuments(mapD=idD)
                #
                logger.info("Resource %r extracted mapped document length %d",
                            extResource, len(dList))
                logger.debug("Objects %r", dList[:2])
                sD, _, collectionList, _ = self.__schP.getSchemaInfo(
                    databaseName)
                collectionName = collectionList[
                    0] if collectionList else "unassigned"
                indexL = sD.getDocumentIndex(collectionName, "primary")
                logger.info("Database %r collection %r index attributes %r",
                            databaseName, collectionName, indexL)
                #
                collectionVersion = sD.getCollectionVersion(collectionName)
                addValues = {"_schema_version": collectionVersion}
                #
                addValues = {}
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseName,
                         collectionName,
                         loadType=loadType,
                         documentList=dList,
                         indexAttributeList=indexL,
                         keyNames=None,
                         addValues=addValues)
            self.__updateStatus(updateId, databaseName, collectionName, ok,
                                statusStartTimestamp)

            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getLoadStatus(self):
        return self.__statusList
class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__numProc = 2
        # self.__fileLimit = 200
        self.__fileLimit = None
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example-ihm.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        #self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=False, rebuildFlag=True)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH", sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = True
        #
        self.__modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-files")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        #self.__extraOpts = None
        # The following for extended parent/child info -
        self.__extraOpts = 'addParentRefs|addPrimaryKey'
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        self.__databaseNameD = {
            "pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_assembly", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsRepo(self):
        # schemaLevel = "min"

        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(databaseNameD=self.__databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        self.assertLessEqual(eCount, 1)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {"pdbx_core": ["pdbx_core_entity", "pdbx_core_entry", "pdbx_core_entity_instance", "pdbx_core_entity_instance_validation"]}
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=subList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
            logger.info("Chunk %d total validation errors schema level %s : %d", ii, schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    #@unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = True

        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    #@unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        #schemaLevel = "full"
        schemaLevel = "min"

        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return True
        #databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(databaseNameD=databaseNameD, inputPathList=inputPathList, schemaLevel=schemaLevel, mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d", schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    def __testValidateOpts(self, databaseNameD, inputPathList=None, schemaLevel="full", mergeContentTypeD=None):
        #
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD[databaseName] if databaseName in mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True, extraOpts=self.__extraOpts)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=mergeContentTypes
                )
                # Raises exceptions for schema compliance.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s", databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL), databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d", databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        #for error in sorted(valInfo.iter_errors(dD), key=str):
                        #    logger.info("schema %s collection %s (%s) path %s error: %s", databaseName, collectionName, cnL[ii], error.path, error.message)
                        #    logger.debug("Failing document %d : %r", ii, list(dD.items()))
                        #    eCount += 1
                        #    cCount += 1
                        #if cCount > 0:
                        #    logger.info("schema %s collection %s container %s error count %d", databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s", str(e))

        return eCount

    def __testPrepDocumentsFromContainers(self, inputPathList, databaseName, collectionName, styleType="rowwise_by_name_with_cardinality", mergeContentTypes=None):
        """Test case -  create loadable PDBx data from repository files
        """
        try:

            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cfgOb, self.__cachePath, useCache=False)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(self.__cfgOb, configName=self.__configName, cachePath=self.__cachePath, siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd, filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd, dtObj=dtf, workPath=self.__cachePath, verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output", cName + "-with-method.cif")
                    #self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            docList, containerNameList, _ = sdp.processDocuments(
                containerList, styleType=styleType, filterType=self.__fTypeRow, dataSelectors=["PUBLIC_RELEASE"], sliceFilter=sliceFilter, collectionName=collectionName
            )

            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                # for ii, doc in enumerate(docList[:1]):
                for ii, doc in enumerate(docList):
                    cn = containerNameList[ii]
                    fp = os.path.join(HERE, "test-output", "prep-%s-%s-%s-%s.json" % (cn, databaseName, collectionName, mergeS))
                    self.__mU.doExport(fp, [doc], fmt="json", indent=3)
                    logger.debug("Exported %r", fp)
            #
            return docList, containerNameList

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #19
0
class SchemaDefBuildTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=False)
        #
        self.__validationLevels = self.__cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        self.__encodingTypes = self.__cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
        #
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_DEPLOYED",
                sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_DEPLOYED",
                sectionName="database_catalog_configuration")
            #
        else:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_TEST",
                sectionName="database_catalog_configuration")
            # self.__databaseNameList = ["repository_holdings"]
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_TEST",
                sectionName="database_catalog_configuration")

        # self.__databaseNameList = ["sequence_clusters"]
        self.__saveSchema = True
        self.__compareDefSchema = False
        self.__compareSchema = False
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testBuildSchemaDefs(self):
        try:
            for databaseName in self.__databaseNameList:
                for dataTyping in self.__dataTypingList:
                    logger.debug("Building schema %s with types %s",
                                 databaseName, dataTyping)
                    self.__schP.makeSchemaDef(databaseName,
                                              dataTyping=dataTyping,
                                              saveSchema=self.__saveSchema)
                    if self.__compareDefSchema:
                        self.__schP.schemaDefCompare(databaseName, dataTyping)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testBuildCollectionSchema(self):
        schemaDifPathList = []
        for databaseName in self.__databaseNameList:
            dD = self.__schP.makeSchemaDef(databaseName,
                                           dataTyping="ANY",
                                           saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for encodingType in self.__encodingTypes:
                    if encodingType.lower() == "rcsb":
                        continue
                    for level in self.__validationLevels:
                        self.__schP.makeSchema(databaseName,
                                               collectionName,
                                               encodingType=encodingType,
                                               level=level,
                                               saveSchema=self.__saveSchema)
                        if self.__compareSchema and encodingType.lower(
                        ) == "json":
                            pth = self.__schP.jsonSchemaCompare(
                                databaseName, collectionName, encodingType,
                                level)
                            if pth:
                                schemaDifPathList.append(pth)
        if schemaDifPathList:
            logger.info("Path dif list %r", schemaDifPathList)

    def testCompareSchema(self):
        databaseName = "pdbx_core"
        collectionName = "pdbx_core_entry"
        encodingType = "json"
        level = "full"
        #
        oldPath = os.path.join(
            HERE, "test-saved-output",
            "json-full-db-pdbx_core-col-pdbx_core_entry.json")
        mU = MarshalUtil(workPath=os.path.join(HERE, "test-output"))
        sOld = mU.doImport(oldPath, fmt="json")
        sNew = self.__schP.makeSchema(databaseName,
                                      collectionName,
                                      encodingType=encodingType,
                                      level=level)
        numDif, difD = self.__schP.schemaCompare(sOld, sNew)
        logger.debug("numDiffs %d", numDif)
        self.assertGreaterEqual(numDif, 141)
        self.assertGreaterEqual(len(difD["changed"]), 160)
        logger.debug("difD %r", difD)

    @unittest.skip("Deprecated test")
    def testCompareSchemaCategories(self):
        """Compare common categories across schema definitions."""
        try:
            sdCc = SchemaDefAccess(
                self.__schP.makeSchemaDef("chem_comp_core",
                                          dataTyping="ANY",
                                          saveSchema=False))
            sdBcc = SchemaDefAccess(
                self.__schP.makeSchemaDef("bird_chem_comp_core",
                                          dataTyping="ANY",
                                          saveSchema=False))
            #
            logger.info("")
            for schemaId in ["CHEM_COMP", "PDBX_CHEM_COMP_AUDIT"]:
                atCcL = sdCc.getAttributeIdList(schemaId)
                atBCcL = sdBcc.getAttributeIdList(schemaId)

                logger.debug("%s attributes (%d) %r", schemaId, len(atCcL),
                             atCcL)
                logger.debug("%s attributes (%d) %r", schemaId, len(atBCcL),
                             atBCcL)

                sDif = set(atCcL) - set(atBCcL)
                if sDif:
                    logger.info("For %s attribute differences %r", schemaId,
                                sDif)
                self.assertEqual(len(sDif), 0)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testBuildColSchemaWithRefs(self):
        for databaseName in ["ihm_dev_full"]:
            dD = self.__schP.makeSchemaDef(databaseName,
                                           dataTyping="ANY",
                                           saveSchema=False)
            sD = SchemaDefAccess(dD)
            for cd in sD.getCollectionInfo():
                collectionName = cd["NAME"]
                for schemaType in self.__encodingTypes:
                    if schemaType.lower() == "rcsb":
                        continue
                    for level in self.__validationLevels:
                        self.__schP.makeSchema(
                            databaseName,
                            collectionName,
                            encodingType=schemaType,
                            level=level,
                            saveSchema=True,
                            extraOpts="addParentRefs|addPrimaryKey")
Beispiel #20
0
class ChemRefDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=self.__configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testValidateFull(self):
        self.__validateChemRef("DrugBank", schemaLevel="full")

    def __validateChemRef(self, extResource, schemaLevel="full"):
        eCount = 0
        if extResource == "DrugBank":
            schemaName = "drugbank_core"
            collectionNames = ["drugbank_core"]
            user = self.__cfgOb.get("_DRUGBANK_AUTH_USERNAME",
                                    sectionName=self.__configName)
            pw = self.__cfgOb.get("_DRUGBANK_AUTH_PASSWORD",
                                  sectionName=self.__configName)
            # cacheDir = self.__cfgOb.get("DRUGBANK_CACHE_DIR", sectionName=self.__configName)
            dbP = DrugBankProvider(cachePath=self.__cachePath,
                                   useCache=True,
                                   username=user,
                                   password=pw)
            # idD = dbP.getMapping()
            # crExt = ChemRefExtractor(self.__cfgOb)
            # idD = crExt.getChemCompAccesionMapping(extResource)
            dList = dbP.getDocuments()
            logger.info("Validating %d Drugbank documents", len(dList))
            eCount = self.__validate(schemaName,
                                     collectionNames,
                                     dList,
                                     schemaLevel=schemaLevel)

        return eCount

    def __validate(self,
                   databaseName,
                   collectionNames,
                   dList,
                   schemaLevel="full"):

        eCount = 0
        for collectionName in collectionNames:
            _ = self.__schP.makeSchemaDef(databaseName,
                                          dataTyping="ANY",
                                          saveSchema=True)
            cD = self.__schP.makeSchema(databaseName,
                                        collectionName,
                                        encodingType="JSON",
                                        level=schemaLevel,
                                        saveSchema=True)
            # Raises exceptions for schema compliance.
            Draft4Validator.check_schema(cD)
            #
            valInfo = Draft4Validator(cD, format_checker=FormatChecker())
            for ii, dD in enumerate(dList):
                logger.debug("Database %s collection %s document %d",
                             databaseName, collectionName, ii)
                try:
                    cCount = 0
                    for error in sorted(valInfo.iter_errors(dD), key=str):
                        logger.info(
                            "database %s collection %s path %s error: %s",
                            databaseName, collectionName, error.path,
                            error.message)
                        logger.info(">>> failing object is %r", dD)
                        eCount += 1
                        cCount += 1
                    #
                    logger.debug("database %s collection %s count %d",
                                 databaseName, collectionName, cCount)
                except Exception as e:
                    logger.exception("Validation error %s", str(e))

        return eCount
class SchemaDefDataPrepTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefDataPrepTests, self).__init__(methodName)
        self.__loadPathList = []
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")

        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE",
                                                sectionName=configName,
                                                default="local")
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # removes timestamped data items to allow diffs.)
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def __timeStep(self, msg):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", msg,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testSimpleSchemaDefDataPrep(self):
        for tcD in self.__simpleTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD[
                "rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[
                "mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__simpleSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"])

    def testFullSchemaDefDataPrep(self):
        for tcD in self.__fullTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD[
                "rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD[
                "mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__fullSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"],
                excludeExtras=tcD["excludeExtras"],
            )

    def __simpleSchemaDataPrep(self,
                               contentType,
                               filterType,
                               styleType,
                               mockLength,
                               rejectLength=0,
                               dataSelectors=None,
                               mergeContentTypes=None):
        """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...)
            styleType (str): organization of output document (e.g. rowise-by-name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. ['vrpt'])
        """
        try:
            dataSelectors = dataSelectors if dataSelectors else [
                "PUBLIC_RELEASE"
            ]
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType,
                                                    dataTyping="ANY")
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            #

            logger.debug("For %s mock length %d length of path list %d\n",
                         contentType, mockLength, len(inputPathList))
            self.assertGreaterEqual(len(inputPathList), mockLength)
            tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments(
                inputPathList,
                styleType=styleType,
                filterType=filterType,
                dataSelectors=dataSelectors)
            logger.debug(
                "For %s mock length %d reject length %d length of tddl list %d\n",
                contentType, mockLength, rejectLength, len(tableDataDictList))
            self.assertGreaterEqual(len(tableDataDictList),
                                    mockLength - rejectLength)
            self.assertGreaterEqual(len(containerNameList),
                                    mockLength - rejectLength)

            if rejectList:
                logger.debug("For %s rejecting components %r", contentType,
                             rejectList)
            #
            self.assertEqual(len(rejectList), rejectLength)
            fName = "simple-prep-%s-%s.json" % (contentType, styleType)
            if self.__exportFlag:
                fPath = os.path.join(self.__outputPath, fName)
                self.__mU.doExport(fPath,
                                   tableDataDictList,
                                   fmt="json",
                                   indent=3)
            if self.__diffFlag:
                fPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(fPath, fmt="json")
                self.assertEqual(len(refDocList), len(tableDataDictList))
                #
                jD = diff(refDocList,
                          tableDataDictList,
                          syntax="explicit",
                          marshal=True)
                if jD:
                    _, fn = os.path.split(fPath)
                    bn, _ = os.path.splitext(fn)
                    fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType,
                                 styleType,
                                 pprint.pformat(jD, indent=3, width=100))
                    self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __logDocumentOrder(self, docList):
        for doc in docList:
            logger.debug("keys %r", list(doc.keys()))

    def __filterDocuments(self, docList, excludeList=None):
        excludeList = excludeList if excludeList else []
        for doc in docList:
            for excl in excludeList:
                if excl in doc:
                    del doc[excl]

    def __fullSchemaDataPrep(self,
                             contentType,
                             filterType,
                             styleType,
                             mockLength,
                             rejectLength=0,
                             dataSelectors=None,
                             mergeContentTypes=None,
                             excludeExtras=None):
        """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options (separated by '|') (e.g. "drop-empty-attributes|drop-empty-tables|skip-max-width|...)
            styleType (str): organization of output document (e.g. rowise-by-name)
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): number of input data sets rejected by the dataselection criteria. Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): list content types to merge with the input data set. Defaults to None. (e.g. ['vrpt'])
        """
        try:
            excludeExtras = excludeExtras if excludeExtras else []
            _ = mockLength
            _ = rejectLength
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, collectionNameList, _ = self.__schP.getSchemaInfo(
                databaseName=contentType, dataTyping="ANY")
            #
            dP = DictionaryApiProviderWrapper(self.__cachePath,
                                              cfgOb=self.__cfgOb,
                                              configName=self.__configName,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            #
            rP = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
            #
            for collectionName in collectionNameList:
                tableIdExcludeList = sd.getCollectionExcluded(collectionName)
                tableIdIncludeList = sd.getCollectionSelected(collectionName)
                sliceFilter = sd.getCollectionSliceFilter(collectionName)
                sdp.setSchemaIdExcludeList(tableIdExcludeList)
                sdp.setSchemaIdIncludeList(tableIdIncludeList)
                #
                docList, _, _ = sdp.processDocuments(
                    containerList,
                    styleType=styleType,
                    sliceFilter=sliceFilter,
                    filterType=filterType,
                    dataSelectors=dataSelectors,
                    collectionName=collectionName)

                docList = sdp.addDocumentPrivateAttributes(
                    docList, collectionName)
                docList = sdp.addDocumentSubCategoryAggregates(
                    docList, collectionName)

                # Special exclusions for the test harness. (removes timestamped data items to allow diffs.)
                self.__filterDocuments(docList, excludeExtras)
                mergeS = "-".join(
                    mergeContentTypes) if mergeContentTypes else ""
                fName = "full-prep-%s-%s-%s-%s.json" % (
                    contentType, collectionName, mergeS, styleType)
                if self.__exportFlag:
                    self.__logDocumentOrder(docList)
                    fPath = os.path.join(self.__outputPath, fName)
                    self.__mU.doExport(fPath, docList, fmt="json", indent=3)
                    logger.debug("Exported %r", fPath)
                #
                if self.__diffFlag:
                    fPath = os.path.join(self.__savedOutputPath, fName)
                    refDocList = self.__mU.doImport(fPath, fmt="json")
                    self.assertEqual(len(refDocList), len(docList))
                    logger.debug("For %s %s len refDocList %d", contentType,
                                 collectionName, len(refDocList))
                    logger.debug("For %s %s len docList %d", contentType,
                                 collectionName, len(docList))
                    jD = diff(refDocList,
                              docList,
                              syntax="explicit",
                              marshal=True)
                    if jD:
                        _, fn = os.path.split(fPath)
                        bn, _ = os.path.splitext(fn)
                        fPath = os.path.join(self.__outputPath,
                                             bn + "-diff.json")
                        logger.debug("jsondiff for %s %s = \n%s", contentType,
                                     collectionName,
                                     pprint.pformat(jD, indent=3, width=100))
                        self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                    self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #22
0
class ClusterDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig, defaultSectionName=configName, mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb, self.__cachePath, useCache=True)
        #
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH", sectionName=configName)
        #
        self.__dataSetId = "2018_23"
        self.__pathClusterData = self.__cfgOb.getPath("RCSB_SEQUENCE_CLUSTER_DATA_PATH", sectionName=configName)
        # self.__levels = ['100', '95', '90', '70', '50', '30']
        self.__levels = ["100"]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testValidateOptsStrict(self):
        updateId = self.__updateId
        validationLevel = "full"
        eCount = self.__testValidateOpts(updateId, validationLevel=validationLevel)
        logger.info("Total validation errors validation level %s : %d", validationLevel, eCount)
        self.assertTrue(eCount <= 1)

    def __testValidateOpts(self, updateId, validationLevel="full"):
        _ = updateId
        databaseNames = ["sequence_clusters"]
        collectionNames = {"sequence_clusters": ["cluster_provenance", "cluster_members", "entity_members"]}
        #
        eCount = 0
        for databaseName in databaseNames:
            for collectionName in collectionNames[databaseName]:
                _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
                cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=validationLevel, saveSchema=True)
                #
                dL = self.__getSequenceClusterData(collectionName, levels=self.__levels, dataSetId=self.__dataSetId, dataLocator=self.__pathClusterData)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for _, dD in enumerate(dL):
                    # logger.debug("Schema %s collection %s document %d" % (schemaName, collectionName, ii))
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info("schema %s collection %s path %s error: %s", databaseName, collectionName, error.path, error.message)
                            logger.info(">>> failing object is %r", dD)
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d", databaseName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __fetchProvenance(self):
        """Test case for fetching a provenance dictionary content."""
        try:
            provKeyName = "rcsb_entity_sequence_cluster_prov"
            provU = ProvenanceProvider(self.__cfgOb, self.__cachePath, useCache=True)
            pD = provU.fetch()
            return pD[provKeyName] if provKeyName in pD else {}
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __getSequenceClusterData(self, collectionName, dataSetId=None, dataLocator=None, levels=None):
        """Test extraction on an example sequence cluster data set."""
        try:
            #
            if collectionName == "cluster_provenance":
                return [self.__fetchProvenance()]
            #
            entitySchemaName = "rcsb_entity_sequence_cluster_list"
            clusterSchemaName = "rcsb_entity_sequence_cluster_identifer_list"
            cdp = ClusterDataPrep(workPath=self.__cachePath, entitySchemaName=entitySchemaName, clusterSchemaName=clusterSchemaName)
            cifD, docBySequenceD, docByClusterD = cdp.extract(dataSetId, clusterSetLocator=dataLocator, levels=levels, clusterType="entity")
            self.assertEqual(len(cifD), 1)
            self.assertEqual(len(docBySequenceD), 1)
            self.assertEqual(len(docByClusterD), 1)
            if collectionName == "entity_members":
                return docBySequenceD[entitySchemaName]
            elif collectionName == "cluster_members":
                return docByClusterD[clusterSchemaName]

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return None
Beispiel #23
0
class SchemaDefCompareTests(unittest.TestCase):
    skipFlag = True

    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        #
        self.__validationLevels = self.__cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        self.__encodingTypes = self.__cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
        #
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_DEPLOYED",
                sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_DEPLOYED",
                sectionName="database_catalog_configuration")
            #
        else:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_TEST",
                sectionName="database_catalog_configuration")
            # self.__databaseNameList = ["repository_holdings"]
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_TEST",
                sectionName="database_catalog_configuration")
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testCompareSchemaDefs(self):
        try:
            difPathList = []
            for databaseName in self.__databaseNameList:
                for dataTyping in self.__dataTypingList:
                    logger.debug("Building schema %s with types %s",
                                 databaseName, dataTyping)
                    pth = self.__schP.schemaDefCompare(databaseName,
                                                       dataTyping)
                    if pth:
                        difPathList.append(pth)
            if difPathList:
                logger.info("Schema definition difference path list %r",
                            [os.path.split(pth)[1] for pth in difPathList])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testCompareCollectionSchema(self):
        try:
            difPathList = []
            for databaseName in self.__databaseNameList:
                dD = self.__schP.makeSchemaDef(databaseName,
                                               dataTyping="ANY",
                                               saveSchema=False)
                sD = SchemaDefAccess(dD)
                for cd in sD.getCollectionInfo():
                    collectionName = cd["NAME"]
                    for encodingType in self.__encodingTypes:
                        if encodingType.lower() != "json":
                            continue
                        for level in self.__validationLevels:
                            pth = self.__schP.jsonSchemaCompare(
                                databaseName, collectionName, encodingType,
                                level)
                            if pth:
                                difPathList.append(pth)
            if difPathList:
                logger.info("JSON schema difference path list %r",
                            [os.path.split(pth)[1] for pth in difPathList])
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Beispiel #24
0
class SchemaDefLoadercrateDbMultiTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoadercrateDbMultiTests, self).__init__(methodName)
        self.__verbose = True
        self.__createFlag = True

    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__chunkSize = 0
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName)
        self.__resourceName = "CRATE_DB"
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__workPath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)
        #
        #
        self.__tableIdSkipD = {
            "ATOM_SITE": True,
            "ATOM_SITE_ANISOTROP": True,
            "__LOAD_STATUS__": True
        }
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testConnection(self):
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                self.assertNotEqual(client, None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaCreate(self):
        """Create table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaRemove(self):
        """Remove table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompMulti(self):
        self.__testLoadFilesMulti("chem_comp")

    def testLoadBirdMulti(self):
        self.__testLoadFilesMulti("bird")

    def testLoadPdbxMulti(self):
        self.__testLoadFilesMulti("pdbx")

    def __getPathList(self, fType):
        pathList = []
        if fType == "chem_comp":
            pathList = self.__rpP.getLocatorObjList("chem_comp")
        elif fType == "bird":
            pathList = self.__rpP.getLocatorObjList("bird")
            pathList.extend(self.__rpP.getLocatorObjList("bird_family"))
        elif fType == "pdbx":
            pathList = self.__rpP.getLocatorObjList("pdbx")
        return pathList

    def loadInsertMany(self, dataList, procName, optionsD, workingDir):

        try:
            _ = workingDir
            ret = None
            sd = optionsD["sd"]
            skipD = optionsD["skip"]
            ioObj = IoAdapter(verbose=self.__verbose)
            logger.debug("%s pathlist %r", procName, dataList)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = CrateDbLoader(schemaDefObj=sd,
                                    ioObj=ioObj,
                                    dbCon=client,
                                    workPath=self.__workPath,
                                    cleanUp=False,
                                    warnings="default",
                                    verbose=self.__verbose)
                ret = sdl.load(inputPathList=dataList,
                               loadType="crate-insert-many",
                               deleteOpt="selected",
                               tableIdSkipD=skipD)
            # all or nothing here
            if ret:
                return dataList, dataList, []
            else:
                return [], [], []
        except Exception as e:
            logger.info("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __testLoadFilesMulti(self, contentType):
        """Test case - create load w/insert-many all chemical component definition data files - (multiproc test)"""
        numProc = self.__numProc
        chunkSize = self.__chunkSize
        try:
            #
            sd, _, _, _ = self.__schP.getSchemaInfo(contentType)
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)

            optD = {}
            optD["sd"] = sd
            if contentType == "pdbx":
                optD["skip"] = self.__tableIdSkipD
            else:
                optD["skip"] = {}

            #
            pathList = self.__getPathList(fType=contentType)
            logger.debug("Input path list %r", pathList)
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="loadInsertMany")
            ok, _, _, _ = mpu.runMulti(dataList=pathList,
                                       numProc=numProc,
                                       numResults=1,
                                       chunkSize=chunkSize)
            self.assertEqual(ok, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaCreate(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb")
            sqlL = []
            for tableId in tableIdList:
                if tableId in self.__tableIdSkipD:
                    continue
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(
                    sqlGen.createTableSQL(
                        databaseName=schemaDefObj.getVersionedDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema creation SQL string\n %s\n\n",
                         "\n".join(sqlL))
            logger.info("Creating schema using database %s",
                        schemaDefObj.getVersionedDatabaseName())
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                logger.debug("Schema create command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaRemove(self, schemaDefObj):
        """Test case -  remove table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb")
            sqlL = []
            for tableId in tableIdList:
                if tableId in self.__tableIdSkipD:
                    continue
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(
                    sqlGen.dropTableSQL(
                        databaseName=schemaDefObj.getVersionedDatabaseName(),
                        tableDefObj=tableDefObj))
                sqlL.extend(
                    sqlGen.dropTableSQL(
                        databaseName=schemaDefObj.getDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                logger.debug("Schema remove command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
class RepoHoldingsDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testValidateOptsStrict(self):
        updateId = self.__updateId
        schemaLevel = "full"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertTrue(eCount <= 1)

    @unittest.skip("Troubleshooting test")
    def testValidateOptsMin(self):
        updateId = self.__updateId
        schemaLevel = "min"
        eCount = self.__testValidateOpts(updateId, schemaLevel=schemaLevel)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertTrue(eCount <= 1)

    def __testValidateOpts(self, updateId, schemaLevel="full"):
        schemaNames = ["repository_holdings"]
        collectionNames = {
            "repository_holdings": [
                "repository_holdings_update_entry",
                "repository_holdings_current_entry",
                "repository_holdings_unreleased_entry",
                "repository_holdings_removed_entry",
                "repository_holdings_combined_entry",
            ],
            "entity_sequence_clusters":
            ["cluster_members", "cluster_provenance", "entity_members"],
        }
        #
        eCount = 0
        for schemaName in schemaNames:
            for collectionName in collectionNames[schemaName]:
                _ = self.__schP.makeSchemaDef(schemaName,
                                              dataTyping="ANY",
                                              saveSchema=True)
                cD = self.__schP.makeSchema(schemaName,
                                            collectionName,
                                            encodingType="JSON",
                                            level=schemaLevel,
                                            saveSchema=True)
                dL = self.__getRepositoryHoldingsDocuments(
                    schemaName, collectionName, updateId)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            collectionName + ".json")
                    self.__mU.doExport(savePath, dL, fmt="json", indent=3)
                # Raises exceptions for schema compliance.
                Draft4Validator.check_schema(cD)
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d",
                                 schemaName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info(
                                "schema %s collection %s path %s error: %s",
                                schemaName, collectionName, error.path,
                                error.message)
                            logger.info(">>>")
                            logger.info(">>> failing object is %r", dD)
                            logger.info(">>>")
                            eCount += 1
                            cCount += 1
                        #
                        logger.debug("schema %s collection %s count %d",
                                     schemaName, collectionName, cCount)
                    except Exception as e:
                        logger.exception("Validation error %s", str(e))

        return eCount

    def __getRepositoryHoldingsDocuments(self, schemaName, collectionName,
                                         updateId):
        """Test loading and processing operations for legacy holdings and status data."""
        rL = []
        try:
            rhdp = RepoHoldingsDataPrep(cfgOb=self.__cfgOb,
                                        sandboxPath=self.__sandboxPath,
                                        workPath=self.__cachePath)
            if collectionName == "repository_holdings_update_entry":
                rL = rhdp.getHoldingsUpdateEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("update data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_current_entry":
                rL = rhdp.getHoldingsCurrentEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))
            #
            elif collectionName == "repository_holdings_unreleased_entry":
                rL = rhdp.getHoldingsUnreleasedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("unreleased data length %r", len(rL))
            #
            elif collectionName in ["repository_holdings_removed_entry"]:
                rL = rhdp.getHoldingsRemovedEntry(updateId=updateId)
                if collectionName == "repository_holdings_removed":
                    self.assertGreaterEqual(len(rL), 10)
                    logger.debug("removed data length %r", len(rL))
            elif collectionName == "repository_holdings_combined_entry":
                rL = rhdp.getHoldingsCombinedEntry(updateId=updateId)
                self.assertGreaterEqual(len(rL), 10)
                logger.debug("holdings data length %r", len(rL))

            #
        except Exception as e:
            logger.exception("%s %s failing with %s", schemaName,
                             collectionName, str(e))
            self.fail()

        return rL
class UniProtCoreEtlWorker(object):
    """Prepare and load UniProt 'core' sequence reference data collections."""
    def __init__(self,
                 cfgOb,
                 cachePath,
                 useCache=True,
                 numProc=2,
                 chunkSize=10,
                 readBackCheck=False,
                 documentLimit=None,
                 doValidate=False,
                 verbose=False):
        self.__cfgOb = cfgOb
        self.__cachePath = cachePath
        self.__useCache = useCache
        self.__readBackCheck = readBackCheck
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        self.__documentLimit = documentLimit
        #
        self.__resourceName = "MONGO_DB"
        self.__verbose = verbose
        self.__statusList = []
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=self.__useCache)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__valInst = None
        self.__doValidate = doValidate
        #

    def __updateStatus(self, updateId, databaseName, collectionName, status,
                       startTimestamp):
        try:
            sFlag = "Y" if status else "N"
            desp = DataExchangeStatus()
            desp.setStartTime(tS=startTimestamp)
            desp.setObject(databaseName, collectionName)
            desp.setStatus(updateId=updateId, successFlag=sFlag)
            desp.setEndTime()
            self.__statusList.append(desp.getStatus())
            return True
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __getReferenceSequenceProvider(self):
        """ """
        try:
            rsaP = ReferenceSequenceAssignmentProvider(
                self.__cfgOb,
                databaseName="pdbx_core",
                collectionName="pdbx_core_polymer_entity",
                polymerType="Protein",
                referenceDatabaseName="UniProt",
                provSource="PDB",
                useCache=self.__useCache,
                cachePath=self.__cachePath,
                fetchLimit=self.__documentLimit,
                siftsAbbreviated="TEST",
            )
            ok = rsaP.testCache()
            return ok, rsaP
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return None

    def load(self, updateId, extResource, loadType="full"):
        """Load sequence reference data"""
        try:
            self.__statusList = []
            desp = DataExchangeStatus()
            statusStartTimestamp = desp.setStartTime()
            #
            dList = indexL = []
            databaseName = collectionName = collectionVersion = None
            #
            if extResource == "UniProt":
                databaseName = "uniprot_core"
                # configName = self.__cfgOb.getDefaultSectionName()
                # dirPath = os.path.join(self.__cachePath, self.__cfgOb.get("EXDB_CACHE_DIR", self.__cfgOb.getDefaultSectionName()))
                #
                ok, rsP = self.__getReferenceSequenceProvider()
                if not ok:
                    return False
                #
                dList = rsP.getDocuments()
                logger.info("Resource %r extracted mapped document length %d",
                            extResource, len(dList))
                logger.debug("Objects %r", dList[:2])
                #
                cDL = self.__docHelper.getCollectionInfo(databaseName)
                collectionName = cDL[0]["NAME"]
                collectionVersion = cDL[0]["VERSION"]
                indexL = self.__docHelper.getDocumentIndexAttributes(
                    collectionName, "primary")
                logger.info(
                    "Database %r collection %r version %r index attributes %r",
                    databaseName, collectionName, collectionVersion, indexL)
                addValues = {}
            else:
                logger.error("Unsupported external resource %r", extResource)
            #
            if self.__doValidate:
                self.__valInst = self.__getValidator(databaseName,
                                                     collectionName,
                                                     schemaLevel="full")
                for dObj in dList:
                    self.__validateObj(databaseName,
                                       collectionName,
                                       dObj,
                                       label="Original")
            #
            dl = DocumentLoader(
                self.__cfgOb,
                self.__cachePath,
                self.__resourceName,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
                documentLimit=self.__documentLimit,
                verbose=self.__verbose,
                readBackCheck=self.__readBackCheck,
            )
            #
            ok = dl.load(databaseName,
                         collectionName,
                         loadType=loadType,
                         documentList=dList,
                         indexAttributeList=indexL,
                         keyNames=None,
                         addValues=addValues)
            okS = self.__updateStatus(updateId, databaseName, collectionName,
                                      ok, statusStartTimestamp)

            return ok and okS
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def getLoadStatus(self):
        return self.__statusList

    def __getValidator(self, databaseName, collectionName, schemaLevel="full"):
        # _ = self.__schP.makeSchemaDef(databaseName, dataTyping="ANY", saveSchema=True)
        # cD = self.__schP.makeSchema(databaseName, collectionName, encodingType="JSON", level=schemaLevel, saveSchema=True)
        logger.info("Fetch schema for %r %r validation level %r", databaseName,
                    collectionName, schemaLevel)
        cD = self.__schP.getJsonSchema(databaseName,
                                       collectionName,
                                       encodingType="JSON",
                                       level=schemaLevel)
        # Raises exceptions for schema compliance.
        Draft4Validator.check_schema(cD)
        valInst = Draft4Validator(cD, format_checker=FormatChecker())
        return valInst

    def __validateObj(self, databaseName, collectionName, rObj, label=""):
        try:
            eCount = 0
            tId = rObj["rcsb_id"] if rObj and "rcsb_id" in rObj else "anonymous"
            for error in sorted(self.__valInst.iter_errors(rObj), key=str):
                logger.info(
                    "Database %s collection %s (%s %r) path %s error: %s",
                    databaseName, collectionName, label, tId, error.path,
                    error.message)
                logger.debug(">>> Failing object is %r", rObj)
                if "rcsb_uniprot_feature" in rObj:
                    for dd in rObj["rcsb_uniprot_feature"]:
                        if "feature_id" in dd:
                            logger.info("feature_id %r", dd["feature_id"])
                        else:
                            logger.info("no feature_id keys %r",
                                        sorted(dd.keys()))
                            logger.info("description %r", dd["description"])
                eCount += 1
        except Exception as e:
            logger.exception("Validation failing %s", str(e))

        return eCount
Beispiel #27
0
class SchemaProviderTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=False)
        #
        self.__validationLevels = self.__cfgOb.getList(
            "VALIDATION_LEVELS_TEST",
            sectionName="database_catalog_configuration")
        self.__encodingTypes = self.__cfgOb.getList(
            "ENCODING_TYPES_TEST",
            sectionName="database_catalog_configuration")
        #
        buildAll = True
        if buildAll:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_DEPLOYED",
                sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_DEPLOYED",
                sectionName="database_catalog_configuration")
            #
        else:
            self.__databaseNameList = self.__cfgOb.getList(
                "DATABASE_NAMES_TEST",
                sectionName="database_catalog_configuration")
            self.__dataTypingList = self.__cfgOb.getList(
                "DATATYPING_TEST",
                sectionName="database_catalog_configuration")
        #
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testSchemaAccessDefault(self):
        for databaseName in self.__databaseNameList:
            cDL = self.__docHelper.getCollectionInfo(databaseName)
            for cD in cDL:
                collectionName = cD["NAME"]
                for encodingType in self.__encodingTypes:
                    if encodingType.lower() == "rcsb":
                        continue
                    for level in self.__validationLevels:
                        logger.debug("Loading ->%s %s %s %s", databaseName,
                                     collectionName, encodingType, level)
                        sD = self.__schP.getJsonSchema(
                            databaseName,
                            collectionName,
                            encodingType=encodingType,
                            level=level)
                        self.assertTrue(sD is not None)
Beispiel #28
0
class DocumentLoader(object):
    def __init__(
        self,
        cfgOb,
        cachePath,
        resourceName="MONGO_DB",
        numProc=4,
        chunkSize=15,
        documentLimit=None,
        verbose=False,
        readBackCheck=False,
        maxStepLength=2000,
        schemaRebuildFlag=False,
    ):
        self.__verbose = verbose
        #
        # Limit the load length of each file type for testing  -  Set to None to remove -
        self.__documentLimit = documentLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        self.__resourceName = resourceName
        #
        self.__cachePath = cachePath if cachePath else "."
        self.__schP = SchemaProvider(cfgOb,
                                     cachePath,
                                     useCache=True,
                                     rebuildFlag=schemaRebuildFlag)
        #
        self.__readBackCheck = readBackCheck
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"
        #
        #

    def load(self,
             databaseName,
             collectionName,
             loadType="full",
             documentList=None,
             indexAttributeList=None,
             keyNames=None,
             schemaLevel="full",
             addValues=None):
        """Driver method for loading MongoDb content -


        loadType:     "full" or "replace"

        """
        try:
            startTime = self.__begin(message="loading operation")
            #

            #
            optionsD = {}
            optionsD["collectionName"] = collectionName
            optionsD["databaseName"] = databaseName
            optionsD["readBackCheck"] = self.__readBackCheck
            optionsD["loadType"] = loadType
            optionsD["keyNames"] = keyNames
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            docList = documentList[:self.
                                   __documentLimit] if self.__documentLimit else documentList
            logger.debug("Full document list length %d limit %r",
                         len(documentList), self.__documentLimit)
            numProc = self.__numProc
            chunkSize = self.__chunkSize if docList and self.__chunkSize < len(
                docList) else 0
            #
            if addValues:
                try:
                    for doc in docList:
                        for k, v in addValues.items():
                            doc[k] = v
                except Exception as e:
                    logger.error("Add values %r fails with %s", addValues,
                                 str(e))

            #
            indAtList = indexAttributeList if indexAttributeList else []
            bsonSchema = None
            if schemaLevel and schemaLevel in ["min", "full"]:
                bsonSchema = self.__schP.getJsonSchema(databaseName,
                                                       collectionName,
                                                       encodingType="BSON",
                                                       level=schemaLevel)
                logger.debug("Using schema validation for %r %r %r",
                             databaseName, collectionName, schemaLevel)

            if loadType == "full":
                self.__removeCollection(databaseName, collectionName)
                ok = self.__createCollection(databaseName,
                                             collectionName,
                                             indAtList,
                                             bsonSchema=bsonSchema)
                logger.info("Collection %s create status %r", collectionName,
                            ok)
            elif loadType == "append":
                # create only if object does not exist -
                ok = self.__createCollection(databaseName,
                                             collectionName,
                                             indexAttributeNames=indAtList,
                                             checkExists=True,
                                             bsonSchema=bsonSchema)
                logger.debug("Collection %s create status %r", collectionName,
                             ok)
                # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numDocs = len(docList)
            logger.debug("Processing %d total documents", numDocs)
            numProc = min(numProc, numDocs)
            maxStepLength = self.__maxStepLength
            if numDocs > maxStepLength:
                numLists = int(numDocs / maxStepLength)
                subLists = [docList[i::numLists] for i in range(numLists)]
            else:
                subLists = [docList]
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            failList = []
            for ii, subList in enumerate(subLists):
                logger.debug("Running outer subtask %d of %d length %d",
                             ii + 1, len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optionsD)
                mpu.set(workerObj=self, workerMethod="loadWorker")
                ok, failListT, _, _ = mpu.runMulti(dataList=subList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
                failList.extend(failListT)
            logger.debug("Completed load with failing document list %r",
                         failList)
            logger.debug("Document list length %d failed load list length %d",
                         len(docList), len(failList))
            #

            self.__end(startTime, "loading operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False

    def loadWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for MongoDb document loading -"""
        try:
            startTime = self.__begin(message=procName)
            readBackCheck = optionsD["readBackCheck"]
            loadType = optionsD["loadType"]

            collectionName = optionsD["collectionName"]
            databaseName = optionsD["databaseName"]
            keyNames = optionsD["keyNames"]
            #
            logger.debug("%s databaseName %s collectionName %s workingDir %s",
                         procName, databaseName, collectionName, workingDir)
            #
            if dataList:
                ok, successList, failedList = self.__loadDocuments(
                    databaseName,
                    collectionName,
                    dataList,
                    loadType=loadType,
                    readBackCheck=readBackCheck,
                    keyNames=keyNames)
            #
            logger.debug(
                "%s database %s collection %s inputList length %d successList length %d  failed %d",
                procName,
                databaseName,
                collectionName,
                len(dataList),
                len(successList),
                len(failedList),
            )
            #

            self.__end(startTime, procName + " with status " + str(ok))
            return successList, [], []

        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return [], [], []

    # -------------- -------------- -------------- -------------- -------------- -------------- --------------
    #                                        ---  Supporting code follows ---
    #

    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)

    def __createCollection(self,
                           dbName,
                           collectionName,
                           indexAttributeNames=None,
                           checkExists=False,
                           bsonSchema=None):
        """Create database and collection and optionally a primary index -"""
        try:
            logger.debug("Create database %s collection %s", dbName,
                         collectionName)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                if checkExists and mg.databaseExists(
                        dbName) and mg.collectionExists(
                            dbName, collectionName):
                    ok1 = True
                else:
                    ok1 = mg.createCollection(dbName,
                                              collectionName,
                                              bsonSchema=bsonSchema)
                ok2 = mg.databaseExists(dbName)
                ok3 = mg.collectionExists(dbName, collectionName)
                okI = True
                if indexAttributeNames:
                    okI = mg.createIndex(dbName,
                                         collectionName,
                                         indexAttributeNames,
                                         indexName="primary",
                                         indexType="DESCENDING",
                                         uniqueFlag=False)

            return ok1 and ok2 and ok3 and okI
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __removeCollection(self, dbName, collectionName):
        """Drop collection within database"""
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                logger.debug("Remove collection database %s collection %s",
                             dbName, collectionName)
                logger.debug("Starting databases = %r", mg.getDatabaseNames())
                logger.debug("Starting collections = %r",
                             mg.getCollectionNames(dbName))
                ok = mg.dropCollection(dbName, collectionName)
                logger.debug("Databases = %r", mg.getDatabaseNames())
                logger.debug("Post drop collections = %r",
                             mg.getCollectionNames(dbName))
                ok = mg.collectionExists(dbName, collectionName)
                logger.debug("Post drop collections = %r",
                             mg.getCollectionNames(dbName))
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False

    def __loadDocuments(self,
                        dbName,
                        collectionName,
                        docList,
                        loadType="full",
                        readBackCheck=False,
                        keyNames=None):
        #
        # Load database/collection with input document list -
        #
        failList = []
        rIdL = []
        successList = []
        logger.debug(
            "Loading dbName %s collectionName %s with document count %d keynames %r",
            dbName, collectionName, len(docList), keyNames)
        if keyNames:
            # map the document list to some document key if this is provided
            indD = {}
            indL = []
            try:
                for ii, doc in enumerate(docList):
                    dIdTup = self.__getKeyValues(doc, keyNames)
                    indD[dIdTup] = ii
                indL = list(range(len(docList)))
            except Exception as e:
                logger.exception("Failing ii %d d %r with %s", ii, doc, str(e))
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                mg = MongoDbUtil(client)
                #
                if loadType == "replace" and keyNames:
                    dTupL = mg.deleteList(dbName, collectionName, docList,
                                          keyNames)
                    logger.debug("Deleted document status %r", (dTupL, ))
                #
                rIdL = mg.insertList(dbName,
                                     collectionName,
                                     docList,
                                     keyNames=keyNames)
                logger.debug("Insert returns rIdL length %r", len(rIdL))

                # ---
                #  If there is a failure then determine the specific successes and failures -
                #
                successList = docList
                failList = []
                if len(rIdL) != len(docList):
                    if keyNames:
                        successIndList = []
                        for rId in rIdL:
                            rObj = mg.fetchOne(dbName, collectionName, "_id",
                                               rId)
                            dIdTup = self.__getKeyValues(rObj, keyNames)
                            successIndList.append(indD[dIdTup])
                        failIndList = list(set(indL) - set(successIndList))
                        failList = [docList[ii] for ii in failIndList]
                        successList = [docList[ii] for ii in successIndList]
                    else:
                        # fail the whole batch if we don't have visibility into each document
                        failList = docList
                        successList = []
                #
                rbStatus = True
                if readBackCheck and keyNames:
                    #
                    # Note that objects in docList are mutated by the insert operation with the additional key '_id',
                    # hence, it is possible to compare the fetched object with the input object.
                    #
                    for ii, rId in enumerate(rIdL):
                        rObj = mg.fetchOne(dbName, collectionName, "_id", rId)
                        dIdTup = self.__getKeyValues(rObj, keyNames)
                        jj = indD[dIdTup]
                        if rObj != docList[jj]:
                            rbStatus = False
                            break
                #
                if readBackCheck and not rbStatus:
                    return False, successList, failList
                #
            return len(rIdL) == len(docList), successList, failList
        except Exception as e:
            logger.exception("Failing %r %r (len=%d) %s with %s", dbName,
                             collectionName, len(docList), keyNames, str(e))
        return False, [], docList

    def __getKeyValues(self, dct, keyNames):
        """Return the tuple of values of corresponding to the input dictionary key names expressed in dot notation.

        Args:
            dct (dict): source dictionary object (nested)
            keyNames (list): list of dictionary keys in dot notatoin

        Returns:
            tuple: tuple of values corresponding to the input key names

        """
        rL = []
        try:
            for keyName in keyNames:
                rL.append(self.__getKeyValue(dct, keyName))
        except Exception as e:
            logger.exception("Failing for key names %r with %s", keyNames,
                             str(e))

        return tuple(rL)

    def __getKeyValue(self, dct, keyName):
        """Return the value of the corresponding key expressed in dot notation in the input dictionary object (nested)."""
        try:
            kys = keyName.split(".")
            for key in kys:
                try:
                    dct = dct[key]
                except KeyError:
                    return None
            return dct
        except Exception as e:
            logger.exception("Failing for key %r with %s", keyName, str(e))

        return None
    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")

        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE",
                                                sectionName=configName,
                                                default="local")
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # removes timestamped data items to allow diffs.)
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))