Example #1
 def setUp(self):
     self.__isMac = platform.system() == "Darwin"
     self.__excludeType = None if self.__isMac else "optional"
     self.__verbose = True
     #
     fileLimit = 100
     numProc = 2
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     self.__workPath = os.path.join(HERE, "test-output")
     mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                               "exdb-config-example.yml")
     #
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName,
                               mockTopPath=mockTopPath)
     self.__resourceName = "MYSQL_DB"
     #
     self.__schP = SchemaProvider(self.__cfgOb,
                                  self.__cachePath,
                                  useCache=True)
     self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                     numProc=numProc,
                                     fileLimit=fileLimit,
                                     cachePath=self.__cachePath)
     #
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #2
 def setUp(self):
     self.__verbose = True
     self.__numProc = 2
     self.__fileLimit = 100
     self.__chunkSize = 0
     self.__workPath = os.path.join(HERE, "test-output")
     self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                               "exdb-config-example.yml")
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName)
     self.__resourceName = "CRATE_DB"
     self.__schP = SchemaProvider(self.__cfgOb,
                                  self.__workPath,
                                  useCache=True)
     self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                     numProc=self.__numProc,
                                     fileLimit=self.__fileLimit,
                                     cachePath=self.__workPath)
     #
     #
     self.__tableIdSkipD = {
         "ATOM_SITE": True,
         "ATOM_SITE_ANISOTROP": True,
         "__LOAD_STATUS__": True
     }
     self.__ioObj = IoAdapter(verbose=self.__verbose)
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #3
    def __init__(self,
                 cfgOb,
                 attributeDataTypeD=None,
                 numProc=4,
                 chunkSize=15,
                 fileLimit=None,
                 maxStepLength=2000,
                 workPath=None):
        """
        Args:
            cfgOb (object): Configuration object (rcsb.utils.config.ConfigUtil)

            attributeDataTypeD
            dictPath (str): Path to supporting data dictionary

            numProc (int, optional): Number of parallel worker processes used.
            chunkSize (int, optional): Size of files processed in a single multi-proc process
            fileLimit (int, optional): maximum file scanned or None for no limit
            mockTopPath (str, optional): Path to directory containing mock repositories or None
            maxStepLength (int, optional): maximum number of multi-proc runs to perform
        """
        #
        self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {}
        # Limit the number of files loaded per file type for testing; set fileLimit to None for no limit.
        self.__fileLimit = fileLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

        self.__workPath = workPath
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__rpP = RepositoryProvider(self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)
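
A minimal construction sketch for the class above. The owning class name is not visible in this excerpt, so ScanRepoUtil below is an assumed stand-in; the constructor signature and the ConfigUtil usage are taken from the surrounding examples.

    from rcsb.utils.config import ConfigUtil

    # Build the configuration object the same way the setUp() examples on this page do.
    cfgOb = ConfigUtil(configPath="exdb-config-example.yml",
                       defaultSectionName="site_info_configuration")
    # Assumed class name; keyword arguments mirror the documented signature above.
    scanner = ScanRepoUtil(cfgOb,
                           numProc=4,           # parallel worker processes
                           chunkSize=15,        # files per multi-proc chunk
                           fileLimit=100,       # cap files scanned for testing
                           workPath="./CACHE")  # working/cache directory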
Example #4
 def setUp(self):
     self.__export = True
     self.__numProc = 2
     self.__fileLimit = 200
     mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     configPath = os.path.join(mockTopPath, "config",
                               "dbload-setup-example.yml")
     configName = "site_info_configuration"
     self.__configName = configName
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName,
                               mockTopPath=mockTopPath)
     self.__mU = MarshalUtil(workPath=self.__cachePath)
     self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                     numProc=self.__numProc,
                                     fileLimit=self.__fileLimit,
                                     cachePath=self.__cachePath)
     #
     self.__testCaseList = [
         {
             "contentType": "pdbx_core",
             "mockLength": 50,
             "mergeContent": ["vrpt"]
         },
         {
             "contentType": "bird_chem_comp_core",
             "mockLength": 17,
             "mergeContent": None
         },
     ]
     #
     self.__modulePathMap = self.__cfgOb.get(
         "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #5
 def __init__(self, cfgOb, configName, cachePath, **kwargs):
     #
     self.__version = __version__
     self.__cfgOb = cfgOb
     self.__configName = configName
     self.__cachePath = cachePath
     self.__fileLimit = kwargs.get("fileLimit", None)
     self.__dirPath = os.path.join(cachePath, "neighbor-interactions")
     self.__numProc = kwargs.get("numProc", 2)
     self.__chunkSize = kwargs.get("chunkSize", 10)
     useCache = kwargs.get("useCache", True)
     #
     #  - Configuration for stash services -
     #    Local target directory name to be stashed.  (subdir of dirPath)
     #
     self.__stashDir = "ligand-target-neighbors"
     #
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                     numProc=self.__numProc,
                                     fileLimit=self.__fileLimit,
                                     cachePath=self.__cachePath)
     self.__neighborD = self.__reload(fmt="pickle", useCache=useCache)
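
A hedged construction sketch for the provider above. The class name is not shown in this excerpt, so NeighborInteractionProvider is an assumed placeholder; the positional and keyword arguments follow the signature and kwargs handling shown above.

    # Assumed class name; cfgOb is a ConfigUtil instance built as in the other examples.
    provider = NeighborInteractionProvider(cfgOb,
                                           "site_info_configuration",  # configName
                                           "./CACHE",                  # cachePath
                                           numProc=2,
                                           chunkSize=10,
                                           fileLimit=None,
                                           useCache=True)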
Example #6
    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=self.__configName,
                                  mockTopPath=mockTopPath)
        self.__cachePath = os.path.join(TOPDIR, "CACHE")

        self.__numProc = 2
        self.__chunkSize = 20
        self.__fileLimit = None
        #
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #7
class SchemaDefDataPrepTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefDataPrepTests, self).__init__(methodName)
        self.__loadPathList = []
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")

        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE",
                                                sectionName=configName,
                                                default="local")
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # Removes timestamped data items to allow diffs.
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        unitS = "MB" if platform.system() == "Darwin" else "GB"
        rusageMax = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logger.info("Maximum resident memory size %.4f %s", rusageMax / 1.0e6,
                    unitS)
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", self.id(),
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def __timeStep(self, msg):
        endTime = time.time()
        logger.info("Completed %s at %s (%.4f seconds)", msg,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    endTime - self.__startTime)

    def testSimpleSchemaDefDataPrep(self):
        for tcD in self.__simpleTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__simpleSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"])

    def testFullSchemaDefDataPrep(self):
        for tcD in self.__fullTestCaseList:
            rejectLength = 0 if self.__discoveryMode == "remote" else tcD["rejectLength"]
            mockLength = self.__fileLimit if self.__discoveryMode == "remote" else tcD["mockLength"]
            if tcD["contentType"] == "bird_chem_comp_core" and self.__discoveryMode == "remote":
                logger.info("Skipping %r in discovery mode %r",
                            tcD["contentType"], self.__discoveryMode)
                continue
            self.__fullSchemaDataPrep(
                tcD["contentType"],
                tcD["filterType"],
                tcD["styleType"],
                mockLength,
                rejectLength=rejectLength,
                mergeContentTypes=tcD["mergeContentTypes"],
                excludeExtras=tcD["excludeExtras"],
            )

    def __simpleSchemaDataPrep(self,
                               contentType,
                               filterType,
                               styleType,
                               mockLength,
                               rejectLength=0,
                               dataSelectors=None,
                               mergeContentTypes=None):
        """Internal method for preparing file-based data NOT requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options separated by '|' (e.g., "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): Organization of the output document (e.g., "rowwise_by_name")
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): Number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): Data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): List of content types to merge with the input data set (e.g., ["vrpt"]). Defaults to None.
        """
        try:
            dataSelectors = dataSelectors if dataSelectors else [
                "PUBLIC_RELEASE"
            ]
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName=contentType,
                                                    dataTyping="ANY")
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            #

            logger.debug("For %s mock length %d length of path list %d\n",
                         contentType, mockLength, len(inputPathList))
            self.assertGreaterEqual(len(inputPathList), mockLength)
            tableDataDictList, containerNameList, rejectList = sdp.fetchDocuments(
                inputPathList,
                styleType=styleType,
                filterType=filterType,
                dataSelectors=dataSelectors)
            logger.debug(
                "For %s mock length %d reject length %d length of tddl list %d\n",
                contentType, mockLength, rejectLength, len(tableDataDictList))
            self.assertGreaterEqual(len(tableDataDictList),
                                    mockLength - rejectLength)
            self.assertGreaterEqual(len(containerNameList),
                                    mockLength - rejectLength)

            if rejectList:
                logger.debug("For %s rejecting components %r", contentType,
                             rejectList)
            #
            self.assertEqual(len(rejectList), rejectLength)
            fName = "simple-prep-%s-%s.json" % (contentType, styleType)
            if self.__exportFlag:
                fPath = os.path.join(self.__outputPath, fName)
                self.__mU.doExport(fPath,
                                   tableDataDictList,
                                   fmt="json",
                                   indent=3)
            if self.__diffFlag:
                fPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(fPath, fmt="json")
                self.assertEqual(len(refDocList), len(tableDataDictList))
                #
                jD = diff(refDocList,
                          tableDataDictList,
                          syntax="explicit",
                          marshal=True)
                if jD:
                    _, fn = os.path.split(fPath)
                    bn, _ = os.path.splitext(fn)
                    fPath = os.path.join(self.__outputPath, bn + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType,
                                 styleType,
                                 pprint.pformat(jD, indent=3, width=100))
                    self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __logDocumentOrder(self, docList):
        for doc in docList:
            logger.debug("keys %r", list(doc.keys()))

    def __filterDocuments(self, docList, excludeList=None):
        excludeList = excludeList if excludeList else []
        for doc in docList:
            for excl in excludeList:
                if excl in doc:
                    del doc[excl]

    def __fullSchemaDataPrep(self,
                             contentType,
                             filterType,
                             styleType,
                             mockLength,
                             rejectLength=0,
                             dataSelectors=None,
                             mergeContentTypes=None,
                             excludeExtras=None):
        """Internal method for preparing file-based data requiring dynamic methods, slicing, or key injection.

        Args:
            contentType (str): Content type name
            filterType (str): List of data processing options separated by '|' (e.g., "drop-empty-attributes|drop-empty-tables|skip-max-width|...")
            styleType (str): Organization of the output document (e.g., "rowwise_by_name")
            mockLength (int): Expected length of the test data for the input content type
            rejectLength (int, optional): Number of input data sets rejected by the data selection criteria. Defaults to 0.
            dataSelectors (list of str, optional): Data selection criteria. Defaults to None.
            mergeContentTypes (list of str, optional): List of content types to merge with the input data set (e.g., ["vrpt"]). Defaults to None.
        """
        try:
            excludeExtras = excludeExtras if excludeExtras else []
            _ = mockLength
            _ = rejectLength
            dD = self.__schP.makeSchemaDef(contentType,
                                           dataTyping="ANY",
                                           saveSchema=True)
            _ = SchemaDefAccess(dD)
            inputPathList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, collectionNameList, _ = self.__schP.getSchemaInfo(
                databaseName=contentType, dataTyping="ANY")
            #
            dP = DictionaryApiProviderWrapper(self.__cachePath,
                                              cfgOb=self.__cfgOb,
                                              configName=self.__configName,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            #
            rP = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=filterType)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
            #
            for collectionName in collectionNameList:
                tableIdExcludeList = sd.getCollectionExcluded(collectionName)
                tableIdIncludeList = sd.getCollectionSelected(collectionName)
                sliceFilter = sd.getCollectionSliceFilter(collectionName)
                sdp.setSchemaIdExcludeList(tableIdExcludeList)
                sdp.setSchemaIdIncludeList(tableIdIncludeList)
                #
                docList, _, _ = sdp.processDocuments(
                    containerList,
                    styleType=styleType,
                    sliceFilter=sliceFilter,
                    filterType=filterType,
                    dataSelectors=dataSelectors,
                    collectionName=collectionName)

                docList = sdp.addDocumentPrivateAttributes(
                    docList, collectionName)
                docList = sdp.addDocumentSubCategoryAggregates(
                    docList, collectionName)

                # Special exclusions for the test harness. (removes timestamped data items to allow diffs.)
                self.__filterDocuments(docList, excludeExtras)
                mergeS = "-".join(
                    mergeContentTypes) if mergeContentTypes else ""
                fName = "full-prep-%s-%s-%s-%s.json" % (
                    contentType, collectionName, mergeS, styleType)
                if self.__exportFlag:
                    self.__logDocumentOrder(docList)
                    fPath = os.path.join(self.__outputPath, fName)
                    self.__mU.doExport(fPath, docList, fmt="json", indent=3)
                    logger.debug("Exported %r", fPath)
                #
                if self.__diffFlag:
                    fPath = os.path.join(self.__savedOutputPath, fName)
                    refDocList = self.__mU.doImport(fPath, fmt="json")
                    self.assertEqual(len(refDocList), len(docList))
                    logger.debug("For %s %s len refDocList %d", contentType,
                                 collectionName, len(refDocList))
                    logger.debug("For %s %s len docList %d", contentType,
                                 collectionName, len(docList))
                    jD = diff(refDocList,
                              docList,
                              syntax="explicit",
                              marshal=True)
                    if jD:
                        _, fn = os.path.split(fPath)
                        bn, _ = os.path.splitext(fn)
                        fPath = os.path.join(self.__outputPath,
                                             bn + "-diff.json")
                        logger.debug("jsondiff for %s %s = \n%s", contentType,
                                     collectionName,
                                     pprint.pformat(jD, indent=3, width=100))
                        self.__mU.doExport(fPath, jD, fmt="json", indent=3)
                    self.assertEqual(len(jD), 0)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #8
class SchemaDefLoaderDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoaderDbTests, self).__init__(methodName)
        self.__verbose = True

    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__verbose = True
        #
        fileLimit = 100
        numProc = 2
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__workPath = os.path.join(HERE, "test-output")
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__resourceName = "MYSQL_DB"
        #
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=numProc,
                                        fileLimit=fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __schemaCreate(self, schemaDefObj):
        """Create table schema using schema definition"""
        try:
            tableIdList = schemaDefObj.getSchemaIdList()
            sqlGen = SqlGenAdmin(self.__verbose)
            sqlL = sqlGen.createDatabaseSQL(schemaDefObj.getDatabaseName())
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getSchemaObject(tableId)
                sqlL.extend(
                    sqlGen.createTableSQL(
                        databaseName=schemaDefObj.getDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema creation SQL string\n %s\n\n",
                         "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                myQ = MyDbQuery(dbcon=client, verbose=self.__verbose)
                #
                # Permit warnings to support "drop table if exists" for missing tables.
                #
                myQ.setWarning("ignore")
                ret = myQ.sqlCommand(sqlCommandList=sqlL)
                logger.debug("\n\n+INFO mysql server returns %r\n", ret)
                self.assertTrue(ret)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    # --------------------------------------------------------------------------------------------
    def testSchemaCreate(self):
        """Create table schema for BIRD, chemical component, and PDBx data."""
        cD = self.__schP.makeSchemaDef("bird",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        cD = self.__schP.makeSchemaDef("chem_comp",
                                       dataTyping="SQL",
                                       saveSchema=True)
        sd = SchemaDefAccess(cD)
        self.__schemaCreate(sd)
        #
        # The pdbx schema is not created here; it is not optimized for MySQL limitations.
        # cD = self.__schP.makeSchemaDef("pdbx", dataTyping="SQL", saveSchema=True)
        # sd = SchemaDefAccess(cD)
        # self.__schemaCreate(sd)

    def testLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testReLoadBirdReference(self):
        try:
            cD = self.__schP.makeSchemaDef("bird",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="bird")
            inputPathList.extend(
                self.__rpP.getLocatorObjList(contentType="bird_family"))
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                sdl.load(inputPathList=inputPathList, loadType="batch-file")
                #
                logger.debug(
                    "INFO BATCH FILE RELOAD TEST --------------------------------------------\n"
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="all")
                self.assertTrue(ok)
                #
                logger.debug(
                    "\n\n\n+INFO BATCH INSERT RELOAD TEST --------------------------------------------\n"
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file",
                              deleteOpt="selected")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompReference(self):
        try:
            cD = self.__schP.makeSchemaDef("chem_comp",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(
                contentType="chem_comp")
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-file")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("Disable test - schema not optimized for mysql limitations")
    def testLoadPdbxFiles(self):
        try:
            cD = self.__schP.makeSchemaDef("pdbx",
                                           dataTyping="SQL",
                                           saveSchema=True)
            sd = SchemaDefAccess(cD)
            self.__schemaCreate(sd)

            inputPathList = self.__rpP.getLocatorObjList(contentType="pdbx")
            logger.debug("Input path list %r", inputPathList)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = SchemaDefLoader(
                    self.__cfgOb,
                    schemaDefObj=sd,
                    dbCon=client,
                    cachePath=self.__cachePath,
                    workPath=self.__workPath,
                    cleanUp=False,
                    warnings="error",
                    verbose=self.__verbose,
                    restoreUseStash=False,
                    restoreUseGit=True,
                    providerTypeExclude=self.__excludeType,
                )
                ok = sdl.load(inputPathList=inputPathList,
                              loadType="batch-insert",
                              deleteOpt="all")
                self.assertTrue(ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #9
class CockroachDbLoaderCockroachDbTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(CockroachDbLoaderCockroachDbTests, self).__init__(methodName)
        self.__verbose = True
        self.__createFlag = False

    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath, defaultSectionName=configName)
        self.__resourceName = "COCKROACH_DB"
        self.__schP = SchemaProvider(self.__cfgOb, self.__workPath, useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb, numProc=self.__numProc, fileLimit=self.__fileLimit, cachePath=self.__workPath)
        #
        self.__tableIdSkipD = {"ATOM_SITE": True, "ATOM_SITE_ANISOTROP": True}
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()), endTime - self.__startTime)

    def testConnection(self):
        try:
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                self.assertNotEqual(client, None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaCreate(self):
        """Create table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaRemove(self):
        """Remove table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertBirdReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyBirdReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("bird")
            inputPathList.extend(self.__rpP.getLocatorObjList("bird_family"))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertChemCompReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyChemCompReference(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("chem_comp")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected")
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadInsertManyPdbxExampleFiles(self):
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)
            inputPathList = self.__rpP.getLocatorObjList("pdbx")
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                sdl = CockroachDbLoader(schemaDefObj=sd, ioObj=self.__ioObj, dbCon=client, workPath=self.__workPath, cleanUp=False, warnings="default", verbose=self.__verbose)
                ret = sdl.load(inputPathList=inputPathList, loadType="cockroach-insert-many", deleteOpt="selected", tableIdSkipD=self.__tableIdSkipD)
                self.assertEqual(ret, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaCreateSQL(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        sqlL = []
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))
            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
        return sqlL

    def __schemaCreate(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlL = sqlGen.createDatabaseSQL(dbName)
            for tableId in tableIdList:
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(sqlGen.createTableSQL(databaseName=schemaDefObj.getVersionedDatabaseName(), tableDefObj=tableDefObj))

            logger.debug("\nSchema creation SQL string\n %s\n\n", "\n".join(sqlL))
            logger.info("Creating schema using database %s", schemaDefObj.getVersionedDatabaseName())
            #
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.info("Schema create command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaRemove(self, schemaDefObj):
        """Test case -  remove table schema using schema definition"""
        ret = 0
        try:
            dbName = schemaDefObj.getVersionedDatabaseName()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="CockroachDb")
            sqlL = sqlGen.removeDatabaseSQL(dbName)
            logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb, resourceName=self.__resourceName) as client:
                crQ = CockroachDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                # ret = crQ.sqlCommand(' '.join(sqlL))
                logger.debug("Schema remove command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #11
class SchemaDataPrepValidateTests(unittest.TestCase):
    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        # self.__fileLimit = None
        self.__fileLimit = 20

        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=self.__configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__birdRepoPath = self.__cfgOb.getPath("BIRD_REPO_PATH",
                                                   sectionName=configName)
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__verbose = False
        #
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        self.__testDirPath = os.path.join(HERE, "test-output", "pdbx-fails")
        self.__testIhmDirPath = os.path.join(HERE, "test-output", "ihm-files")
        self.__export = True
        #
        self.__extraOpts = None
        # The following for extended parent/child info -
        # self.__extraOpts = 'addParentRefs|addPrimaryKey'
        #
        self.__alldatabaseNameD = {
            "ihm_dev": ["ihm_dev"],
            "pdbx": ["pdbx", "pdbx_ext"],
            "pdbx_core": [
                "pdbx_core_entity",
                "pdbx_core_entry",
                "pdbx_core_assembly",
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_nonpolymer_entity_instance",
                "pdbx_core_branched_entity_instance",
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_nonpolymer_entity_instance",
                "pdbx_core_branched_entity_instance",
            ],
            "bird": ["bird"],
            "bird_family": ["family"],
            "chem_comp": ["chem_comp"],
            "bird_chem_comp": ["bird_chem_comp"],
            "bird_chem_comp_core": ["bird_chem_comp_core"],
        }

        self.__databaseNameD = {
            "bird_chem_comp_core": ["bird_chem_comp_core"],
            "pdbx_core": [
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_polymer_entity",
                "pdbx_core_entry",
                "pdbx_core_assembly",
                "pdbx_core_nonpolymer_entity",
                "pdbx_core_branched_entity",
                "pdbx_core_polymer_entity_instance",
                "pdbx_core_branched_entity_instance",
            ],
        }
        self.__databaseNameModelD = {
            "pdbx_comp_model_core": [
                "pdbx_comp_model_core_entry",
                "pdbx_comp_model_core_assembly",
                "pdbx_comp_model_core_polymer_entity",
                "pdbx_comp_model_core_polymer_entity_instance",
                "pdbx_comp_model_core_nonpolymer_entity",
                "pdbx_comp_model_core_branched_entity",
                "pdbx_comp_model_core_branched_entity_instance",
            ],
        }
        self.__mergeContentTypeD = {"pdbx_core": ["vrpt"]}
        # self.__databaseNameD = {"chem_comp_core": ["chem_comp_core"], "bird_chem_comp_core": ["bird_chem_comp_core"]}
        # self.__databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_instance_validation"]}
        # self.__databaseNameD = {"pdbx_core": ["pdbx_core_entity_monomer"]}
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __modelFixture(self):
        fU = FileUtil()
        modelSourcePath = os.path.join(self.__mockTopPath, "AF")
        for iPath in glob.iglob(os.path.join(modelSourcePath, "*.cif.gz")):
            fn = os.path.basename(iPath)
            uId = fn.split("-")[1]
            h3 = uId[-2:]
            h2 = uId[-4:-2]
            h1 = uId[-6:-4]
            oPath = os.path.join(self.__cachePath, "computed-models", h1, h2,
                                 h3, fn)
            fU.put(iPath, oPath)
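        # Illustrative example of the hashed layout above (filename assumed):
        # for "AF-Q9H0H5-F1-model_v2.cif.gz", uId is "Q9H0H5", so h1="Q9",
        # h2="H0", h3="H5", and the file is staged under
        # <cachePath>/computed-models/Q9/H0/H5/AF-Q9H0H5-F1-model_v2.cif.gz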

    def testValidateOptsRepo(self):
        # schemaLevel = "min"
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(
            databaseNameD=self.__databaseNameD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        # expected errors
        # pdbx_core_entry (3JWB) path deque(['reflns_shell', 0, 'Rmerge_I_obs']) error: 33.9 is greater than or equal to the maximum of 10.0
        self.assertLessEqual(eCount, 2)

    def testValidateModels(self):
        self.__modelFixture()
        schemaLevel = "full"
        inputPathList = None
        eCount = self.__testValidateOpts(
            databaseNameD=self.__databaseNameModelD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        self.assertLessEqual(eCount, 0)

    @unittest.skip("Disable troubleshooting test")
    def testValidateOptsList(self):
        schemaLevel = "min"
        inputPathList = self.__mU.doImport(
            os.path.join(HERE, "test-output", "failed-path.list"), "list")
        # inputPathList = glob.glob(self.__testDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {
            "pdbx_core": [
                "pdbx_core_entity", "pdbx_core_entry",
                "pdbx_core_entity_instance",
                "pdbx_core_entity_instance_validation"
            ]
        }
        for ii, subList in enumerate(chunkList(inputPathList[::-1], 40)):
            if ii < 5:
                continue
            eCount = self.__testValidateOpts(
                databaseNameD=databaseNameD,
                inputPathList=subList,
                schemaLevel=schemaLevel,
                mergeContentTypeD=self.__mergeContentTypeD)
            logger.info(
                "Chunk %d total validation errors schema level %s : %d", ii,
                schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)

    @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmRepo(self):
        schemaLevel = "min"
        inputPathList = None
        self.__export = False

        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        databaseNameD = {"ihm_dev": ["ihm_dev"]}
        eCount = self.__testValidateOpts(
            databaseNameD=databaseNameD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    @unittest.skip("Disable IHM troubleshooting test")
    def testValidateOptsIhmList(self):
        schemaLevel = "full"

        inputPathList = glob.glob(self.__testIhmDirPath + "/*.cif")
        if not inputPathList:
            return True
        databaseNameD = {"ihm_dev_full": ["ihm_dev_full"]}
        eCount = self.__testValidateOpts(
            databaseNameD=databaseNameD,
            inputPathList=inputPathList,
            schemaLevel=schemaLevel,
            mergeContentTypeD=self.__mergeContentTypeD)
        logger.info("Total validation errors schema level %s : %d",
                    schemaLevel, eCount)
        # self.assertGreaterEqual(eCount, 20)
        #

    def __testValidateOpts(self,
                           databaseNameD,
                           inputPathList=None,
                           schemaLevel="full",
                           mergeContentTypeD=None):
        #
        eCount = 0
        for databaseName in databaseNameD:
            mergeContentTypes = mergeContentTypeD.get(
                databaseName) if mergeContentTypeD else None
            _ = self.__schP.makeSchemaDef(databaseName,
                                          dataTyping="ANY",
                                          saveSchema=True)
            pthList = inputPathList if inputPathList else self.__rpP.getLocatorObjList(
                databaseName, mergeContentTypes=mergeContentTypes)
            for collectionName in databaseNameD[databaseName]:
                cD = self.__schP.makeSchema(databaseName,
                                            collectionName,
                                            encodingType="JSON",
                                            level=schemaLevel,
                                            saveSchema=True,
                                            extraOpts=None)
                #
                dL, cnL = self.__testPrepDocumentsFromContainers(
                    pthList,
                    databaseName,
                    collectionName,
                    styleType="rowwise_by_name_with_cardinality",
                    mergeContentTypes=mergeContentTypes)
                # Raises exceptions for schema compliance.
                try:
                    Draft4Validator.check_schema(cD)
                except Exception as e:
                    logger.error("%s %s schema validation fails with %s",
                                 databaseName, collectionName, str(e))
                #
                valInfo = Draft4Validator(cD, format_checker=FormatChecker())
                logger.info("Validating %d documents from %s %s", len(dL),
                            databaseName, collectionName)
                for ii, dD in enumerate(dL):
                    logger.debug("Schema %s collection %s document %d",
                                 databaseName, collectionName, ii)
                    try:
                        cCount = 0
                        for error in sorted(valInfo.iter_errors(dD), key=str):
                            logger.info(
                                "schema %s collection %s (%s) path %s error: %s",
                                databaseName, collectionName, cnL[ii],
                                error.path, error.message)
                            logger.debug("Failing document %d : %r", ii,
                                         list(dD.items()))
                            eCount += 1
                            cCount += 1
                        if cCount > 0:
                            logger.info(
                                "schema %s collection %s container %s error count %d",
                                databaseName, collectionName, cnL[ii], cCount)
                    except Exception as e:
                        logger.exception("Validation processing error %s",
                                         str(e))

        return eCount

    def __testPrepDocumentsFromContainers(
            self,
            inputPathList,
            databaseName,
            collectionName,
            styleType="rowwise_by_name_with_cardinality",
            mergeContentTypes=None):
        """Test case -  create loadable PDBx data from repository files"""
        try:

            sd, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            dP = DictionaryApiProviderWrapper(self.__cachePath,
                                              cfgOb=self.__cfgOb,
                                              configName=self.__configName,
                                              useCache=True)
            dictApi = dP.getApiByName(databaseName)
            rP = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            #
            dtf = DataTransformFactory(schemaDefAccessObj=sd,
                                       filterType=self.__fTypeRow)
            sdp = SchemaDefDataPrep(schemaDefAccessObj=sd,
                                    dtObj=dtf,
                                    workPath=self.__cachePath,
                                    verbose=self.__verbose)
            containerList = self.__rpP.getContainerList(inputPathList)
            for container in containerList:
                cName = container.getName()
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")
            #
            tableIdExcludeList = sd.getCollectionExcluded(collectionName)
            tableIdIncludeList = sd.getCollectionSelected(collectionName)
            sliceFilter = sd.getCollectionSliceFilter(collectionName)
            sdp.setSchemaIdExcludeList(tableIdExcludeList)
            sdp.setSchemaIdIncludeList(tableIdIncludeList)
            #
            logger.debug("%s (%r) exclude list %r", collectionName,
                         sliceFilter, tableIdExcludeList)
            logger.debug("%s (%r) include list %r", collectionName,
                         sliceFilter, tableIdIncludeList)
            docList, containerNameList, _ = sdp.processDocuments(
                containerList,
                styleType=styleType,
                filterType=self.__fTypeRow,
                dataSelectors=["PUBLIC_RELEASE"],
                sliceFilter=sliceFilter,
                collectionName=collectionName)

            docList = sdp.addDocumentPrivateAttributes(docList, collectionName)
            docList = sdp.addDocumentSubCategoryAggregates(
                docList, collectionName)
            #
            mergeS = "-".join(mergeContentTypes) if mergeContentTypes else ""
            if self.__export and docList:
                fp = os.path.join(
                    HERE, "test-output", "prep-%s-%s-%s.json" %
                    (databaseName, collectionName, mergeS))
                self.__mU.doExport(fp, docList, fmt="json", indent=3)
                logger.debug("Exported %r", fp)
            #
            return docList, containerNameList

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
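The validation loop in __testValidateOpts above uses the jsonschema package directly. A minimal self-contained sketch of the same check-then-iterate pattern (the schema and document below are made-up placeholders, not RCSB schemas):

from jsonschema import Draft4Validator, FormatChecker

schema = {
    "type": "object",
    "properties": {"entry_id": {"type": "string"}},
    "required": ["entry_id"],
}
document = {"entry_id": 1234}  # deliberately the wrong type

Draft4Validator.check_schema(schema)  # raises SchemaError if the schema itself is invalid
validator = Draft4Validator(schema, format_checker=FormatChecker())
for error in sorted(validator.iter_errors(document), key=str):
    print(list(error.path), error.message)
    # -> ['entry_id'] 1234 is not of type 'string'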
Example #13
class SchemaDefLoader(object):
    """Map PDBx/mmCIF instance data to SQL loadable data using external schema definition."""
    def __init__(
        self,
        cfgOb,
        schemaDefObj,
        cfgSectionName="site_info_configuration",
        dbCon=None,
        cachePath=".",
        workPath=".",
        cleanUp=False,
        warnings="default",
        verbose=True,
        restoreUseStash=True,
        restoreUseGit=True,
        providerTypeExclude=True,
    ):
        self.__verbose = verbose
        self.__debug = False
        self.__cfgOb = cfgOb
        sectionName = cfgSectionName
        self.__sD = schemaDefObj

        #
        self.__dbCon = dbCon
        self.__cachePath = cachePath
        self.__workPath = workPath
        self.__pathList = []
        self.__cleanUp = cleanUp
        #
        self.__colSep = "&##&\t"
        self.__rowSep = "$##$\n"
        #
        #
        self.__fTypeRow = "skip-max-width"
        self.__fTypeCol = "skip-max-width"
        #
        self.__warningAction = warnings
        dtf = DataTransformFactory(schemaDefAccessObj=self.__sD,
                                   filterType=self.__fTypeRow)
        self.__sdp = SchemaDefDataPrep(schemaDefAccessObj=self.__sD,
                                       dtObj=dtf,
                                       workPath=self.__cachePath,
                                       verbose=self.__verbose)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        cachePath=self.__cachePath)
        #
        schemaName = self.__sD.getName()
        modulePathMap = self.__cfgOb.get("DICT_METHOD_HELPER_MODULE_PATH_MAP",
                                         sectionName=sectionName)
        dP = DictionaryApiProviderWrapper(self.__cachePath,
                                          cfgOb=self.__cfgOb,
                                          configName=sectionName,
                                          useCache=True)
        dictApi = dP.getApiByName(schemaName)
        rP = DictMethodResourceProvider(
            self.__cfgOb,
            cachePath=self.__cachePath,
            restoreUseStash=restoreUseStash,
            restoreUseGit=restoreUseGit,
            providerTypeExclude=providerTypeExclude)
        self.__dmh = DictMethodRunner(dictApi,
                                      modulePathMap=modulePathMap,
                                      resourceProvider=rP)

    def setWarning(self, action):
        if action in ["error", "ignore", "default"]:
            self.__warningAction = action
            return True
        else:
            self.__warningAction = "default"
            return False

    def setDelimiters(self, colSep=None, rowSep=None):
        """Set column and row delimiters for intermediate data files used for
        batch-file loading operations.
        """
        self.__colSep = colSep if colSep is not None else "&##&\t"
        self.__rowSep = rowSep if rowSep is not None else "$##$\n"
        return True
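        # Illustrative note: with the default delimiters, a two-column row
        # ["4HHB", "HEMOGLOBIN"] is written to the intermediate load file as
        # "4HHB&##&\tHEMOGLOBIN$##$\n" (see __exportTdd below).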

    def load(self,
             inputPathList=None,
             containerList=None,
             loadType="batch-file",
             deleteOpt=None,
             tableIdSkipD=None):
        """Load data for each table defined in the current schema definition object.
        Data are extracted from the input file list.

        Data source options:

          inputPathList = [<full path of target input file>, ....]

        or

          containerList = [ data container, ...]


        loadType  = 'batch-file' | 'batch-file-append' | 'batch-insert'
        deleteOpt = 'selected' | 'all'

        tableIdSkipD - searchable container of tableIds to be skipped on loading

        Loading is performed using the current database server connection.

        Intermediate data files for 'batch-file' loading are created in the current working path.

        Returns True for success or False otherwise.

        """
        tableIdSkipD = tableIdSkipD if tableIdSkipD is not None else {}
        if inputPathList is not None:
            cL = self.__rpP.getContainerList(inputPathList)
            #
            # Apply dynamic methods here -
            #
            for cA in cL:
                self.__dmh.apply(cA)
            tableDataDict, containerNameList = self.__sdp.process(cL)

        elif containerList is not None:
            tableDataDict, containerNameList = self.__sdp.process(
                containerList)
        else:
            return False
        #
        #
        if loadType in ["batch-file", "batch-file-append"]:
            append = loadType == "batch-file-append"
            exportList = self.__exportTdd(tableDataDict,
                                          colSep=self.__colSep,
                                          rowSep=self.__rowSep,
                                          append=append)
            for tableId, loadPath in exportList:
                if tableId in tableIdSkipD:
                    continue
                self.__batchFileImport(tableId,
                                       loadPath,
                                       sqlFilePath=None,
                                       containerNameList=containerNameList,
                                       deleteOpt=deleteOpt)
                if self.__cleanUp:
                    self.__cleanUpFile(loadPath)
            return True
        elif loadType == "batch-insert":
            for tableId, rowList in tableDataDict.items():
                if tableId in tableIdSkipD:
                    continue
                if deleteOpt in ["all", "selected"] or rowList:
                    self.__batchInsertImport(
                        tableId,
                        rowList=rowList,
                        containerNameList=containerNameList,
                        deleteOpt=deleteOpt)
            return True
        else:
            pass

        return False
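        # Hypothetical usage sketch (construction of cfgOb, sd, and client is
        # shown in the fixtures elsewhere in this listing; values illustrative):
        #   sdl = SchemaDefLoader(cfgOb, schemaDefObj=sd, dbCon=client)
        #   ok = sdl.load(inputPathList=pathList, loadType="batch-insert",
        #                 deleteOpt="selected", tableIdSkipD={"ATOM_SITE": True})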

    def __cleanUpFile(self, filePath):
        try:
            os.remove(filePath)
        except Exception:
            pass

    def makeLoadFilesMulti(self, dataList, procName, optionsD, workingDir):
        """Create a loadable data file for each table defined in the current schema
        definition object.   Data is extracted from the input file list.

        Load files are creating in the current working path.

        Return the containerNames for the input path list, and path list for load files that are created.

        """
        _ = workingDir
        try:
            pn = procName.split("-")[-1]
        except Exception:
            pn = procName

        exportFormat = optionsD.get("exportFormat", "tdd")
        r1, r2 = self.makeLoadFiles(inputPathList=dataList,
                                    partName=pn,
                                    exportFormat=exportFormat)
        return dataList, r1, r2, []

    def makeLoadFiles(self,
                      inputPathList,
                      append=False,
                      partName="1",
                      exportFormat="tdd"):
        """Create a loadable data file for each table defined in the current schema
        definition object.   Data is extracted from the input file list.

        Load files are created in the current working path.

        Return the containerNames for the input path list, and path list for load files that are created.

        """
        cL = self.__rpP.getContainerList(inputPathList)
        for cA in cL:
            self.__dmh.apply(cA)
        tableDataDict, containerNameList = self.__sdp.process(cL)
        if exportFormat == "tdd":
            return containerNameList, self.__exportTdd(tableDataDict,
                                                       colSep=self.__colSep,
                                                       rowSep=self.__rowSep,
                                                       append=append,
                                                       partName=partName)
        elif exportFormat == "csv":
            return containerNameList, self.__exportCsv(tableDataDict,
                                                       append=append,
                                                       partName=partName)
        else:
            return [], []
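        # Hypothetical usage sketch (paths and table id illustrative):
        #   cnL, exportL = sdl.makeLoadFiles(["./1abc.cif"], exportFormat="csv")
        #   # exportL is a list of (tableId, filePath) pairs, e.g.
        #   # [("PDBX_DATABASE_STATUS", "<workPath>/PDBX_DATABASE_STATUS-1.csv")]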

    def __exportCsv(self, tableDict, append=False, partName="1"):
        """ """
        modeOpt = "a" if append else "w"

        exportList = []
        for tableId, rowList in tableDict.items():
            if not rowList:
                continue
            tObj = self.__sD.getSchemaObject(tableId)
            schemaAttributeIdList = tObj.getAttributeIdList()
            attributeNameList = tObj.getAttributeNameList()
            #
            fn = os.path.join(self.__workPath,
                              tableId + "-" + partName + ".csv")
            with open(fn, modeOpt, newline="", encoding="utf-8") as ofh:
                csvWriter = csv.writer(ofh)
                csvWriter.writerow(attributeNameList)
                for rD in rowList:
                    csvWriter.writerow(
                        [rD[aId] for aId in schemaAttributeIdList])

            exportList.append((tableId, fn))
        return exportList

    def __exportTdd(self,
                    tableDict,
                    colSep="&##&\t",
                    rowSep="$##$\n",
                    append=False,
                    partName="1"):
        modeOpt = "a" if append else "w"

        exportList = []
        for tableId, rowList in tableDict.items():
            tObj = self.__sD.getSchemaObject(tableId)
            schemaAttributeIdList = tObj.getAttributeIdList()
            #
            if rowList:
                fn = os.path.join(self.__workPath,
                                  tableId + "-" + partName + ".tdd")
                with open(fn, modeOpt, encoding="utf-8") as ofh:
                    for rD in rowList:
                        # logger.info("%r" % colSep.join([str(rD[aId]) for aId in schemaAttributeIdList]))
                        ofh.write("%s%s" % (colSep.join(
                            [str(rD[aId])
                             for aId in schemaAttributeIdList]), rowSep))
                exportList.append((tableId, fn))
        return exportList

    def loadBatchFiles(self,
                       loadList=None,
                       containerNameList=None,
                       deleteOpt=None):
        """Load data for each table defined in the current schema definition object using

        Data source options:

          loadList = [(tableId, <full path of load file), ....]
          containerNameList = [ data namecontainer, ...]

        deleteOpt = 'selected' | 'all','truncate'

        Loading is performed using the current database server connection.

        Returns True for success or False otherwise.

        """
        #
        startTime = time.time()
        ok = True
        loadList = loadList if loadList is not None else []
        for tableId, loadPath in loadList:
            ok = self.__batchFileImport(tableId,
                                        loadPath,
                                        sqlFilePath=None,
                                        containerNameList=containerNameList,
                                        deleteOpt=deleteOpt)
            if not ok:
                break
            if self.__cleanUp:
                self.__cleanUpFile(loadPath)
        #
        endTime = time.time()
        logger.debug("Completed with status %r at %s (%.3f seconds)\n", ok,
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - startTime)
        return ok

    def delete(self, tableId, containerNameList=None, deleteOpt="all"):
        #
        startTime = time.time()
        sqlCommandList = self.__getSqlDeleteList(
            tableId, containerNameList=containerNameList, deleteOpt=deleteOpt)

        myQ = MyDbQuery(dbcon=self.__dbCon, verbose=self.__verbose)
        myQ.setWarning(self.__warningAction)
        ret = myQ.sqlCommand(sqlCommandList=sqlCommandList)
        #
        #
        endTime = time.time()

        logger.debug("Delete table %s server returns %r\n", tableId, ret)
        logger.debug("Completed at %s (%.3f seconds)\n",
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - startTime)
        return ret

    def __getSqlDeleteList(self,
                           tableId,
                           containerNameList=None,
                           deleteOpt="all"):
        """Return the SQL delete commands for the input table and container name list."""
        databaseName = self.__sD.getDatabaseName()
        sqlGen = SqlGenAdmin(self.__verbose)
        tableDefObj = self.__sD.getSchemaObject(tableId)
        tableName = tableDefObj.getName()

        sqlDeleteList = []
        if deleteOpt in ["selected", "delete"] and containerNameList is not None:
            deleteAttributeName = tableDefObj.getDeleteAttributeName()
            sqlDeleteList = sqlGen.deleteFromListSQL(databaseName,
                                                     tableName,
                                                     deleteAttributeName,
                                                     containerNameList,
                                                     chunkSize=50)
        elif deleteOpt in ["all", "truncate"]:
            sqlDeleteList = [sqlGen.truncateTableSQL(databaseName, tableName)]

        logger.debug("Delete SQL for %s : %r\n", tableId, sqlDeleteList)
        return sqlDeleteList
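        # Illustrative output (exact SQL comes from SqlGenAdmin): deleteOpt="all"
        # yields a single TRUNCATE statement for the table, while deleteOpt="selected"
        # yields chunked DELETE ... WHERE <deleteAttribute> IN (...) statements
        # covering the input container name list.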

    def __batchFileImport(self,
                          tableId,
                          tableLoadPath,
                          sqlFilePath=None,
                          containerNameList=None,
                          deleteOpt="all"):
        """Batch load the input table using data in the input loadable data file.

        if sqlFilePath is provided then any generated SQL commands are preserved in this file.

        deleteOpt None|'selected'| 'all' or 'truncate'
        """
        startTime = time.time()
        databaseName = self.__sD.getDatabaseName()
        sqlGen = SqlGenAdmin(self.__verbose)
        tableDefObj = self.__sD.getSchemaObject(tableId)
        # tableName = tableDefObj.getName()

        #
        if deleteOpt:
            sqlCommandList = self.__getSqlDeleteList(
                tableId,
                containerNameList=containerNameList,
                deleteOpt=deleteOpt)
        else:
            sqlCommandList = []

        if os.access(tableLoadPath, os.R_OK):
            sqlCommandList.append(
                sqlGen.importTable(databaseName,
                                   tableDefObj,
                                   importPath=tableLoadPath))

            if self.__verbose:
                logger.debug("SQL import command\n%s\n", sqlCommandList)
            #

        if sqlFilePath is not None:
            try:
                with open(sqlFilePath, "w", encoding="utf-8") as ofh:
                    ofh.write("%s" % "\n".join(sqlCommandList))
            except Exception:
                pass
        #
        myQ = MyDbQuery(dbcon=self.__dbCon, verbose=self.__verbose)
        myQ.setWarning(self.__warningAction)
        ret = myQ.sqlCommand(sqlCommandList=sqlCommandList)
        #
        #
        endTime = time.time()
        logger.debug("Table %s server returns %r\n", tableId, ret)
        logger.debug("Completed at %s (%.3f seconds)\n",
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - startTime)
        return ret

    def loadBatchData(self,
                      tableId,
                      rowList=None,
                      containerNameList=None,
                      deleteOpt="selected"):
        return self.__batchInsertImport(tableId,
                                        rowList=rowList,
                                        containerNameList=containerNameList,
                                        deleteOpt=deleteOpt)

    def __batchInsertImport(self,
                            tableId,
                            rowList=None,
                            containerNameList=None,
                            deleteOpt="selected"):
        """Load the input table using batch inserts of the input list of dictionaries (i.e. d[attributeId]=value).

        The containerNameList corresponding to the data in rowList can be provided
        if 'selected' deletions are to be performed prior to the batch data inserts.

        deleteOpt = ['selected','all'] where 'selected' deletes rows corresponding to the input container
                    list before insert.   The 'all' option truncates the table prior to insert.

                    Deletions are performed in the absence of loadable data.

        """
        startTime = time.time()

        myQ = MyDbQuery(dbcon=self.__dbCon, verbose=self.__verbose)
        myQ.setWarning(self.__warningAction)
        sqlGen = SqlGenAdmin(self.__verbose)
        #
        databaseName = self.__sD.getDatabaseName()
        tableDefObj = self.__sD.getSchemaObject(tableId)
        tableName = tableDefObj.getName()
        tableAttributeIdList = tableDefObj.getAttributeIdList()
        tableAttributeNameList = tableDefObj.getAttributeNameList()
        #
        sqlDeleteList = None
        if deleteOpt in ["selected", "delete"] and containerNameList is not None:
            deleteAttributeName = tableDefObj.getDeleteAttributeName()
            sqlDeleteList = sqlGen.deleteFromListSQL(databaseName,
                                                     tableName,
                                                     deleteAttributeName,
                                                     containerNameList,
                                                     chunkSize=10)
            if self.__verbose:
                logger.debug("Delete SQL for %s : %r\n", tableId,
                             sqlDeleteList)
        elif deleteOpt in ["all", "truncate"]:
            sqlDeleteList = [sqlGen.truncateTableSQL(databaseName, tableName)]

        sqlInsertList = []
        rowList = rowList if rowList is not None else []
        for row in rowList:
            vList = []
            aList = []
            for tid, nm in zip(tableAttributeIdList, tableAttributeNameList):
                # if len(row[id]) > 0 and row[id] != r'\N':
                if row[tid] is not None and row[tid] != r"\N":
                    vList.append(row[tid])
                    aList.append(nm)
            sqlInsertList.append(
                (sqlGen.insertTemplateSQL(databaseName, tableName,
                                          aList), vList))

        ret = myQ.sqlBatchTemplateCommand(sqlInsertList,
                                          prependSqlList=sqlDeleteList)
        if ret:
            logger.debug("Batch insert completed for table %s rows %d\n",
                         tableName, len(sqlInsertList))
        else:
            logger.error("Batch insert fails for table %s length %d\n",
                         tableName, len(sqlInsertList))

        endTime = time.time()
        if self.__verbose:
            logger.debug("Completed at %s (%.3f seconds)\n",
                         time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                         endTime - startTime)

        return ret

    def __deleteFromTable(self, tableIdList, deleteValue):
        """Delete data from the input table list where the schema table delete attribute
        has the input value "deleteValue".

        """
        databaseName = self.__sD.getDatabaseName()
        sqlList = []
        sqlGen = SqlGenAdmin(self.__verbose)
        for tableId in tableIdList:
            tableName = self.__sD.getSchemaName(tableId)
            tableDefObj = self.__sD.getSchemaObject(tableId)
            atName = tableDefObj.getDeleteAttributeName()
            sqlTemp = sqlGen.deleteTemplateSQL(databaseName, tableName,
                                               [atName])
            sqlList.append(sqlTemp % deleteValue)
        #
        return sqlList
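A minimal usage sketch of the SchemaDefLoader defined above; it assumes a configuration object, schema definition, and open database connection built as in the surrounding test fixtures (all values here are illustrative):

# Assumed context (illustrative): cfgOb, sd, and client come from fixtures
# like those shown elsewhere in this listing.
sdl = SchemaDefLoader(cfgOb, schemaDefObj=sd, dbCon=client, workPath=".", cleanUp=True)
ok = sdl.load(inputPathList=["./1abc.cif"], loadType="batch-file", deleteOpt="all")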
Example #14
class SchemaDefLoadercrateDbMultiTests(unittest.TestCase):
    def __init__(self, methodName="runTest"):
        super(SchemaDefLoadercrateDbMultiTests, self).__init__(methodName)
        self.__verbose = True
        self.__createFlag = True

    def setUp(self):
        self.__verbose = True
        self.__numProc = 2
        self.__fileLimit = 100
        self.__chunkSize = 0
        self.__workPath = os.path.join(HERE, "test-output")
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName)
        self.__resourceName = "CRATE_DB"
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__workPath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)
        #
        #
        self.__tableIdSkipD = {
            "ATOM_SITE": True,
            "ATOM_SITE_ANISOTROP": True,
            "__LOAD_STATUS__": True
        }
        self.__ioObj = IoAdapter(verbose=self.__verbose)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testConnection(self):
        try:
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                self.assertNotEqual(client, None)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaCreate(self):
        """Create table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaCreate(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testSchemaRemove(self):
        """Remove table schema (live) for BIRD, chemical component, and PDBx data."""
        try:
            sd, _, _, _ = self.__schP.getSchemaInfo("bird")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("chem_comp")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
            sd, _, _, _ = self.__schP.getSchemaInfo("pdbx")
            ret = self.__schemaRemove(schemaDefObj=sd)
            self.assertEqual(ret, True)
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testLoadChemCompMulti(self):
        self.__testLoadFilesMulti("chem_comp")

    def testLoadBirdMulti(self):
        self.__testLoadFilesMulti("bird")

    def testLoadPdbxMulti(self):
        self.__testLoadFilesMulti("pdbx")

    def __getPathList(self, fType):
        pathList = []
        if fType == "chem_comp":
            pathList = self.__rpP.getLocatorObjList("chem_comp")
        elif fType == "bird":
            pathList = self.__rpP.getLocatorObjList("bird")
            pathList.extend(self.__rpP.getLocatorObjList("bird_family"))
        elif fType == "pdbx":
            pathList = self.__rpP.getLocatorObjList("pdbx")
        return pathList

    def loadInsertMany(self, dataList, procName, optionsD, workingDir):

        try:
            _ = workingDir
            ret = None
            sd = optionsD["sd"]
            skipD = optionsD["skip"]
            ioObj = IoAdapter(verbose=self.__verbose)
            logger.debug("%s pathlist %r", procName, dataList)
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                sdl = CrateDbLoader(schemaDefObj=sd,
                                    ioObj=ioObj,
                                    dbCon=client,
                                    workPath=self.__workPath,
                                    cleanUp=False,
                                    warnings="default",
                                    verbose=self.__verbose)
                ret = sdl.load(inputPathList=dataList,
                               loadType="crate-insert-many",
                               deleteOpt="selected",
                               tableIdSkipD=skipD)
            # all or nothing here
            if ret:
                return dataList, dataList, []
            else:
                return [], [], []
        except Exception as e:
            logger.info("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __testLoadFilesMulti(self, contentType):
        """Test case - create load w/insert-many all chemical component definition data files - (multiproc test)"""
        numProc = self.__numProc
        chunkSize = self.__chunkSize
        try:
            #
            sd, _, _, _ = self.__schP.getSchemaInfo(contentType)
            if self.__createFlag:
                self.__schemaCreate(schemaDefObj=sd)

            optD = {}
            optD["sd"] = sd
            if contentType == "pdbx":
                optD["skip"] = self.__tableIdSkipD
            else:
                optD["skip"] = {}

            #
            pathList = self.__getPathList(fType=contentType)
            logger.debug("Input path list %r", pathList)
            mpu = MultiProcUtil(verbose=True)
            mpu.setOptions(optionsD=optD)
            mpu.set(workerObj=self, workerMethod="loadInsertMany")
            ok, _, _, _ = mpu.runMulti(dataList=pathList,
                                       numProc=numProc,
                                       numResults=1,
                                       chunkSize=chunkSize)
            self.assertEqual(ok, True)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
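        # Note on the worker contract as used above: MultiProcUtil calls the
        # registered workerMethod with the chunked dataList, and loadInsertMany
        # returns (inputsProcessed, successList, diagList) -- the full chunk on
        # success and empty lists on failure (all-or-nothing per chunk).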

    def __schemaCreate(self, schemaDefObj):
        """Test case -  create table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb")
            sqlL = []
            for tableId in tableIdList:
                if tableId in self.__tableIdSkipD:
                    continue
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(
                    sqlGen.createTableSQL(
                        databaseName=schemaDefObj.getVersionedDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema creation SQL string\n %s\n\n",
                         "\n".join(sqlL))
            logger.info("Creating schema using database %s",
                        schemaDefObj.getVersionedDatabaseName())
            #
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                logger.debug("Schema create command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def __schemaRemove(self, schemaDefObj):
        """Test case -  remove table schema using schema definition"""
        ret = 0
        try:
            tableIdList = schemaDefObj.getTableIdList()
            sqlGen = SqlGenAdmin(self.__verbose, serverType="cratedb")
            sqlL = []
            for tableId in tableIdList:
                if tableId in self.__tableIdSkipD:
                    continue
                tableDefObj = schemaDefObj.getTable(tableId)
                sqlL.extend(
                    sqlGen.dropTableSQL(
                        databaseName=schemaDefObj.getVersionedDatabaseName(),
                        tableDefObj=tableDefObj))
                sqlL.extend(
                    sqlGen.dropTableSQL(
                        databaseName=schemaDefObj.getDatabaseName(),
                        tableDefObj=tableDefObj))

            logger.debug("Schema Remove SQL string\n %s", "\n".join(sqlL))
            with Connection(cfgOb=self.__cfgOb,
                            resourceName=self.__resourceName) as client:
                crQ = CrateDbQuery(dbcon=client, verbose=self.__verbose)
                ret = crQ.sqlCommandList(sqlCommandList=sqlL)
                logger.debug("Schema remove command returns %r\n", ret)
            return ret
            #
        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #15
class RepositoryProviderTests(unittest.TestCase):
    def setUp(self):
        #
        #
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        self.__configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=self.__configName,
                                  mockTopPath=mockTopPath)
        self.__cachePath = os.path.join(TOPDIR, "CACHE")

        self.__numProc = 2
        self.__chunkSize = 20
        self.__fileLimit = None
        #
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)\n", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testRepoUtils(self):
        """Test case - repository locator path utilities"""
        for contentType in ["bird_chem_comp_core", "pdbx_core", "ihm_dev"]:
            mergeContentTypes = None
            if contentType in ["pdbx_core"]:
                mergeContentTypes = ["vrpt"]
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            pathList = self.__rpP.getLocatorPaths(locatorObjList)
            locatorObjList2 = self.__rpP.getLocatorsFromPaths(
                locatorObjList, pathList)
            logger.info("%s pathList length %d", contentType, len(pathList))
            self.assertEqual(len(locatorObjList), len(pathList))
            self.assertEqual(len(locatorObjList), len(locatorObjList2))
            #
        for contentType in ["bird_chem_comp_core", "pdbx_core", "ihm_dev"]:
            mergeContentTypes = None
            if contentType in ["pdbx_core"]:
                mergeContentTypes = ["vrpt"]
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContentTypes)
            pathList = self.__rpP.getLocatorPaths(locatorObjList)
            self.assertEqual(len(locatorObjList), len(pathList))
            #
            lCount = len(pathList)
            idCodes = self.__rpP.getLocatorIdcodes(contentType, locatorObjList)
            self.assertEqual(len(locatorObjList), len(idCodes))
            excludeList = idCodes[:int(len(idCodes) / 2)]
            logger.debug("excludeList (%d) %r", len(excludeList), excludeList)
            fL = self.__rpP.getLocatorObjList(
                contentType=contentType,
                mergeContentTypes=mergeContentTypes,
                excludeIds=excludeList)
            logger.debug("fL (%d)", len(fL))
            self.assertEqual(lCount, len(fL) + len(excludeList))
Example #16
class DictMethodRunnerTests(unittest.TestCase):
    def setUp(self):
        self.__export = True
        self.__numProc = 2
        self.__fileLimit = 200
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        configPath = os.path.join(mockTopPath, "config",
                                  "dbload-setup-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        self.__testCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": 50,
                "mergeContent": ["vrpt"]
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": 17,
                "mergeContent": None
            },
        ]
        #
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def __runContentType(self, contentType, mockLength, mergeContent):
        """Read and process test fixture data files from the input content type."""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName(contentType)
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContent)
            containerList = self.__rpP.getContainerList(locatorObjList)
            #
            logger.debug("Length of locator list %d\n", len(locatorObjList))
            self.assertGreaterEqual(len(locatorObjList), mockLength)
            for container in containerList:
                cName = container.getName()
                #
                # if cName not in ["1B5F"]:
                #    continue
                logger.debug("Processing container %s", cName)
                dmh.apply(container)
                if self.__export:
                    savePath = os.path.join(HERE, "test-output",
                                            cName + "-with-method.cif")
                    self.__mU.doExport(savePath, [container], fmt="mmcif")

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMethodRunner(self):
        """Test method runner for multiple content types."""
        for tD in self.__testCaseList:
            self.__runContentType(tD["contentType"], tD["mockLength"],
                                  tD["mergeContent"])

    def testMethodRunnerSetup(self):
        """Test the setup methods for method runner class"""
        try:
            dP = DictionaryApiProviderWrapper(self.__cfgOb,
                                              self.__cachePath,
                                              useCache=True)
            dictApi = dP.getApiByName("pdbx")
            rP = DictMethodResourceProvider(self.__cfgOb,
                                            configName=self.__configName,
                                            cachePath=self.__cachePath,
                                            siftsAbbreviated="TEST")
            dmh = DictMethodRunner(dictApi,
                                   modulePathMap=self.__modulePathMap,
                                   resourceProvider=rP)
            ok = dmh is not None
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #17
class NeighborInteractionProvider(object):
    """Generators and accessors for non-polymer instance target interactions."""
    def __init__(self, cfgOb, configName, cachePath, **kwargs):
        #
        self.__version = __version__
        self.__cfgOb = cfgOb
        self.__configName = configName
        self.__cachePath = cachePath
        self.__fileLimit = kwargs.get("fileLimit", None)
        self.__dirPath = os.path.join(cachePath, "neighbor-interactions")
        self.__numProc = kwargs.get("numProc", 2)
        self.__chunkSize = kwargs.get("chunkSize", 10)
        useCache = kwargs.get("useCache", True)
        #
        #  - Configuration for stash services -
        #    Local target directory name to be stashed.  (subdir of dirPath)
        #
        self.__stashDir = "ligand-target-neighbors"
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        self.__neighborD = self.__reload(fmt="pickle", useCache=useCache)
        #

    def testCache(self, minCount=0):
        try:
            if minCount == 0:
                return True
            if self.__neighborD and minCount and len(
                    self.__neighborD["entries"]) >= minCount:
                logger.info(
                    "Target neighbor data for (%d) entries created %r version %r",
                    len(self.__neighborD["entries"]),
                    self.__neighborD["created"], self.__neighborD["version"])
                return True
        except Exception:
            pass
        return False

    def getLigandNeighborIndex(self, entryId):
        """Return the target neighbors for the non-polymer instances for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: {(targetAsymId, targetAuthSeqId): nnIndex1, ...}, ...}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getTargetNeighborIndex(self, entryId):
        """Return the ligand neighbors for the polymer or branched entity instances in the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {(targetAsymId, targetAuthSeqId): {ligandAsymId: nnIndex1, ...}, ...}

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["targetNeighborIndexD"]
        except Exception:
            pass
        return {}

    def getNearestNeighborList(self, entryId):
        """Return the list of neares neighbors for the entry.

        Args:
            entryId (str): entry identifier

        Returns:
            list: [LigandTargetInstance(), ...]

        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["nearestNeighbors"]
        except Exception:
            pass
        return []

    def getLigandNeighborBoundState(self, entryId):
        """Return the dicitonary of ligand instances with isBound boolean status.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {ligandAsymId: True if the ligand is bound, ...}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandIsBoundD"]
        except Exception:
            pass
        return {}

    def getAtomCounts(self, entryId):
        """Return the non-polymer instance atom counts for the input entry (all reported atoms).

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandAtomCountD"]
        except Exception:
            pass
        return {}

    def getHydrogenAtomCounts(self, entryId):
        """Return the non-polymer instance hydrogen atom counts for the input entry.

        Args:
            entryId (str): entry identifier

        Returns:
            (dict): {asymId: {'FL': count, 'altA': count, 'altB': count, ... }}
        """
        try:
            return self.__neighborD["entries"][
                entryId.upper()]["ligandHydrogenAtomCountD"]
        except Exception:
            pass
        return {}

    def hasEntry(self, entryId):
        """Return if the input entry is stored in the cache of non-polymer instance target interactions.

        Args:
            entryId (str): entry identifier

        Returns:
            (bool): True if entry is in the cache or False otherwise
        """
        try:
            return entryId in self.__neighborD["entries"]
        except Exception:
            pass
        return False

    def getEntries(self):
        """Return a list of entry identifier for which non-polymer instance target interactions are stored.

        Returns:
            (list): [entryId, entryId, ... ]
        """
        try:
            return list(self.__neighborD["entries"].keys())
        except Exception:
            pass
        return []

    def generate(self,
                 distLimit=5.0,
                 updateOnly=False,
                 fmt="pickle",
                 indent=0):
        """Generate and export non-polymer target interactions for all of the structures in the repository.

        Args:
            distLimit (float, optional): interaction distance. Defaults to 5.0.
            updateOnly (bool):  only calculate interactions for new entries.  Defaults to False.
            fmt (str, optional): export file format. Defaults to "pickle".
            indent (int, optional): json format indent. Defaults to 0.

        Returns:
            bool: True for success or False otherwise
        """
        ok = False
        try:
            tS = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
            tD = self.__calculateNeighbors(distLimit=distLimit,
                                           numProc=self.__numProc,
                                           chunkSize=self.__chunkSize,
                                           updateOnly=updateOnly)
            self.__neighborD = {
                "version": self.__version,
                "created": tS,
                "entries": tD
            }
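            # JSON exports honor the indent option; pickle exports pin protocol 4 for portability.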
            kwargs = {
                "indent": indent
            } if fmt == "json" else {
                "pickleProtocol": 4
            }
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            ok = self.__mU.doExport(targetFilePath,
                                    self.__neighborD,
                                    fmt=fmt,
                                    **kwargs)
            logger.info("Wrote %r status %r", targetFilePath, ok)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def reload(self, fmt="pickle"):
        self.__neighborD = self.__reload(fmt=fmt, useCache=True)
        return self.__neighborD is not None

    def __reload(self, fmt="pickle", useCache=True):
        """Reload from the current cache file."""
        # Start from an empty skeleton so a failed import still returns a usable object.
        neighborD = {
            "version": self.__version,
            "created": time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            "entries": {}
        }
        try:
            targetFilePath = self.__getTargetFilePath(fmt=fmt)
            logger.debug("useCache %r targetFilePath %r", useCache,
                         targetFilePath)
            #
            if useCache and self.__mU.exists(targetFilePath):
                neighborD = self.__mU.doImport(targetFilePath, fmt=fmt)
                if fmt != "pickle":
                    for _, nD in neighborD["entries"].items():
                        nD["nearestNeighbors"] = [
                            LigandTargetInstance(*neighbor)
                            for neighbor in nD["nearestNeighbors"]
                        ]
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return neighborD

    def __getTargetFilePath(self, fmt="pickle"):
        ext = "pic" if fmt == "pickle" else "json"
        pth = os.path.join(self.__dirPath, "ligand-target-neighbors",
                           "neighbor-data." + ext)
        return pth

    def __calculateNeighbors(self,
                             distLimit=5.0,
                             numProc=2,
                             chunkSize=10,
                             updateOnly=False):
        """Calculate non-polymer target interactions for all repository structure files.

        Args:
            distLimit (float, optional): interaction distance limit. Defaults to 5.0.
            numProc (int, optional): number of processes to use. Defaults to 2.
            chunkSize (int, optional): chunk size used to distribute work across worker processes. Defaults to 10.
            updateOnly (bool, optional): only calculate interactions for entries not already cached. Defaults to False.

        Returns:
            (dict): {entryId: {asymId: [TargetLigandInteraction()], ...}, ...}
        """
        contentType = "pdbx"
        mergeContent = None
        rD = {}
        exD = {}
        #
        # updateOnly - will reuse any existing data loaded when this is instantiated
        #              otherwise the cache context is cleared before the calculation.
        if updateOnly:
            exD = {k: True for k in self.getEntries()}
            rD = self.__neighborD[
                "entries"] if "entries" in self.__neighborD else {}
        #
        locatorObjList = self.__rpP.getLocatorObjList(
            contentType=contentType,
            mergeContentTypes=mergeContent,
            excludeIds=exD)
        logger.info("Starting with %d numProc %d updateOnly (%r)",
                    len(locatorObjList), self.__numProc, updateOnly)
        #
        rWorker = TargetInteractionWorker(self.__rpP)
        mpu = MultiProcUtil(verbose=True)
        optD = {"distLimit": distLimit}
        mpu.setOptions(optD)
        mpu.set(workerObj=rWorker, workerMethod="build")
        ok, failList, resultList, _ = mpu.runMulti(dataList=locatorObjList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        if failList:
            logger.info("Target interaction build failures (%d): %r",
                        len(failList), failList)
        #
        for (entryId, nD) in resultList[0]:
            rD[entryId] = nD
        #
        logger.info(
            "Completed with multi-proc status %r failures %r total entries with data (%d)",
            ok, len(failList), len(rD))
        return rD

    def toStash(self):
        ok = False
        try:
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                           sectionName=self.__configName)
            # Push to both the primary and fallback endpoints; succeed only if both stores succeed.
            ok1 = self.__toStash(url,
                                 basePath,
                                 userName=userName,
                                 password=password)
            ok2 = self.__toStash(urlFallBack,
                                 basePath,
                                 userName=userName,
                                 password=password)
            ok = ok1 and ok2
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return ok

    def __toStash(self,
                  url,
                  stashRemoteDirPath,
                  userName=None,
                  password=None,
                  remoteStashPrefix=None):
        """Copy tar and gzipped bundled cache data to remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.makeBundle(self.__dirPath, [self.__stashDir])
            if ok:
                ok = stU.storeBundle(url,
                                     stashRemoteDirPath,
                                     remoteStashPrefix=remoteStashPrefix,
                                     userName=userName,
                                     password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def fromStash(self):
        try:
            minCount = 10
            userName = self.__cfgOb.get("_STASH_AUTH_USERNAME",
                                        sectionName=self.__configName)
            password = self.__cfgOb.get("_STASH_AUTH_PASSWORD",
                                        sectionName=self.__configName)
            basePath = self.__cfgOb.get("_STASH_SERVER_BASE_PATH",
                                        sectionName=self.__configName)
            url = self.__cfgOb.get("STASH_SERVER_URL",
                                   sectionName=self.__configName)
            #
            ok = self.__fromStash(url,
                                  basePath,
                                  userName=userName,
                                  password=password)
            ok = ok and self.reload()
            ok = ok and self.testCache(minCount=minCount)
            if not ok:
                urlFallBack = self.__cfgOb.get("STASH_SERVER_FALLBACK_URL",
                                               sectionName=self.__configName)
                ok = self.__fromStash(urlFallBack,
                                      basePath,
                                      userName=userName,
                                      password=password)
                ok = ok and self.reload()
                ok = ok and self.testCache(minCount=minCount)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return ok

    def __fromStash(self,
                    url,
                    stashRemoteDirPath,
                    userName=None,
                    password=None,
                    remoteStashPrefix=None):
        """Restore local cache from a tar and gzipped bundle to fetched from a remote server/location.

        Args:
            url (str): server URL (e.g. sftp://hostname.domain) None for local host
            stashRemoteDirPath (str): path to target directory on remote server
            userName (str, optional): server username. Defaults to None.
            password (str, optional): server password. Defaults to None.
            remoteStashPrefix (str, optional): channel prefix. Defaults to None.

        Returns:
            (bool): True for success or False otherwise
        """
        ok = False
        try:
            stU = StashUtil(os.path.join(self.__dirPath, "stash"),
                            "ligand-target-neighbors")
            ok = stU.fetchBundle(self.__dirPath,
                                 url,
                                 stashRemoteDirPath,
                                 remoteStashPrefix=remoteStashPrefix,
                                 userName=userName,
                                 password=password)
        except Exception as e:
            logger.error("Failing with url %r stashDirPath %r: %s", url,
                         stashRemoteDirPath, str(e))
        return ok

    def convert(self, fmt1="json", fmt2="pickle"):
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt1)
        self.__neighborD = self.__mU.doImport(targetFilePath, fmt=fmt1)
        #
        targetFilePath = self.__getTargetFilePath(fmt=fmt2)
        ok = self.__mU.doExport(targetFilePath,
                                self.__neighborD,
                                fmt=fmt2,
                                pickleProtocol=4)
        return ok
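A usage sketch for the provider above (illustrative only; cfgOb, configName, and cachePath are assumed to follow the ConfigUtil conventions of the surrounding examples):

# Build, persist, and query the neighbor cache.
niP = NeighborInteractionProvider(cfgOb, configName, cachePath, useCache=False, numProc=2)
if niP.generate(distLimit=5.0, fmt="pickle") and niP.testCache(minCount=1):
    for entryId in niP.getEntries():
        boundD = niP.getLigandNeighborBoundState(entryId)  # {ligandAsymId: bool}
        nnL = niP.getNearestNeighborList(entryId)          # [LigandTargetInstance(), ...]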
Example #18
    def setUp(self):
        self.__isMac = platform.system() == "Darwin"
        self.__excludeType = None if self.__isMac else "optional"
        self.__numProc = 2
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__outputPath = os.path.join(HERE, "test-output")
        self.__savedOutputPath = os.path.join(HERE, "test-saved-output")

        configPath = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__configName = configName
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        self.__discoveryMode = self.__cfgOb.get("DISCOVERY_MODE",
                                                sectionName=configName,
                                                default="local")
        self.__fileLimit = 100 if self.__discoveryMode == "local" else 10
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__rpP = RepositoryProvider(cfgOb=self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__cachePath)
        #
        #
        self.__fTypeRow = "drop-empty-attributes|drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__fTypeCol = "drop-empty-tables|skip-max-width|convert-iterables|normalize-enums|translateXMLCharRefs"
        self.__chemCompMockLen = 24
        self.__pdbxMockLen = 30
        # Remove timestamped data items to allow reproducible diffs.
        excludeExtras = ["rcsb_load_status"]
        # excludeExtras = []
        #
        self.__verbose = True
        self.__modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=configName)
        #
        self.__exportFlag = True
        self.__diffFlag = False
        #
        self.__simpleTestCaseList = [
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_no_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeCol,
                "styleType": "columnwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "chem_comp",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 2,
            },
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name",
                "mergeContentTypes": None,
                "rejectLength": 0,
            },
        ]
        #
        self.__fullTestCaseList = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
            {
                "contentType": "bird_chem_comp_core",
                "mockLength": self.__chemCompMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": None,
                "rejectLength": 2,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__fullTestCaseListA = [
            {
                "contentType": "pdbx_core",
                "mockLength": self.__pdbxMockLen,
                "filterType": self.__fTypeRow,
                "styleType": "rowwise_by_name_with_cardinality",
                "mergeContentTypes": ["vrpt"],
                "rejectLength": 0,
                "excludeExtras": excludeExtras,
            },
        ]
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
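A hedged sketch of how a test method might consume the case lists defined in this setUp (the helper __runPrepCase is hypothetical; the real suite's test names and helpers may differ):

    def testSimpleCases(self):
        # Each case dict supplies the content type, filter, style, and expected lengths.
        for tcD in self.__simpleTestCaseList:
            # __runPrepCase is a hypothetical helper; substitute the suite's real prep/compare routine.
            self.__runPrepCase(contentType=tcD["contentType"],
                               filterType=tcD["filterType"],
                               styleType=tcD["styleType"],
                               mockLength=tcD["mockLength"])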
Example #19
class ScanRepoUtil(object):
    """Tools for for scanning repositories and collecting coverage and type data information."""
    def __init__(self,
                 cfgOb,
                 attributeDataTypeD=None,
                 numProc=4,
                 chunkSize=15,
                 fileLimit=None,
                 maxStepLength=2000,
                 workPath=None):
        """
        Args:
            cfgOb (object): Configuration object (rcsb.utils.config.ConfigUtil)

            attributeDataTypeD (dict, optional): map of category name -> {attribute: data type}; attributes typed
                "float" additionally get precision scanning

            numProc (int, optional): number of parallel worker processes used
            chunkSize (int, optional): number of files processed in a single multi-proc task
            fileLimit (int, optional): maximum number of files scanned, or None for no limit
            maxStepLength (int, optional): maximum number of files handled in a single outer multi-proc pass
            workPath (str, optional): working/cache directory path
        """
        #
        self.__attributeDataTypeD = attributeDataTypeD if attributeDataTypeD else {}
        # Limit the number of files of each type processed during testing - set fileLimit to None to remove the limit.
        self.__fileLimit = fileLimit
        self.__maxStepLength = maxStepLength
        #
        # Controls for multiprocessing execution -
        self.__numProc = numProc
        self.__chunkSize = chunkSize
        #
        self.__cfgOb = cfgOb
        #
        self.__mpFormat = "[%(levelname)s] %(asctime)s %(processName)s-%(module)s.%(funcName)s: %(message)s"

        self.__workPath = workPath
        self.__mU = MarshalUtil(workPath=self.__workPath)
        self.__rpP = RepositoryProvider(self.__cfgOb,
                                        numProc=self.__numProc,
                                        fileLimit=self.__fileLimit,
                                        cachePath=self.__workPath)

    def scanContentType(self,
                        contentType,
                        mergeContentTypes=None,
                        scanType="full",
                        inputPathList=None,
                        scanDataFilePath=None,
                        failedFilePath=None,
                        saveInputFileListPath=None):
        """Driver method for repository scan operation

        Args:
            contentType (str): one of 'bird', 'bird_family', 'bird_chem_comp', 'chem_comp', 'pdbx'
            mergeContentTypes (list, optional): additional content types merged into the scan (e.g. ['vrpt'])
            scanType (str, optional): 'full' [or 'incr' to be supported]
            inputPathList (list, optional): list of input file paths to scan
            scanDataFilePath (str, optional): file path for serialized scan data (pickle format)
            failedFilePath (str, optional): file path for the list of files that fail the scanning operation
            saveInputFileListPath (str, optional): path to store the file path list that is scanned

        Returns:
            bool: True for success or False otherwise

        """
        try:
            startTime = self.__begin(message="scanning operation")
            #
            locatorObjList = self.__rpP.getLocatorObjList(
                contentType=contentType,
                inputPathList=inputPathList,
                mergeContentTypes=mergeContentTypes)
            #
            if saveInputFileListPath:
                self.__mU.doExport(saveInputFileListPath,
                                   self.__rpP.getLocatorPaths(locatorObjList),
                                   fmt="list")
                logger.debug("Saving %d paths in %s", len(locatorObjList),
                             saveInputFileListPath)
            #
            optD = {}
            optD["contentType"] = contentType
            optD["logSize"] = True
            optD["scanType"] = scanType
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            #
            numProc = self.__numProc
            chunkSize = self.__chunkSize if locatorObjList and self.__chunkSize < len(
                locatorObjList) else 0
            #
            # ---------------- - ---------------- - ---------------- - ---------------- - ---------------- -
            numPaths = len(locatorObjList)
            logger.debug("Processing %d total paths", numPaths)
            numProc = min(numProc, numPaths)
            maxStepLength = self.__maxStepLength
            if numPaths > maxStepLength:
                # Ceiling division so that no outer subtask exceeds maxStepLength files.
                numLists = -(-numPaths // maxStepLength)
                subLists = [
                    locatorObjList[i::numLists] for i in range(numLists)
                ]
            else:
                subLists = [locatorObjList]
            #
            if subLists:
                logger.debug(
                    "Starting with numProc %d outer subtask count %d subtask length ~ %d",
                    numProc, len(subLists), len(subLists[0]))
            #
            numResults = 1
            failList = []
            retLists = [[] for ii in range(numResults)]
            diagList = []
            for ii, subList in enumerate(subLists):
                logger.info("Running outer subtask %d of %d length %d", ii + 1,
                            len(subLists), len(subList))
                #
                mpu = MultiProcUtil(verbose=True)
                mpu.setOptions(optionsD=optD)
                mpu.set(workerObj=self, workerMethod="scanWorker")
                ok, failListT, retListsT, diagListT = mpu.runMulti(
                    dataList=subList,
                    numProc=numProc,
                    numResults=numResults,
                    chunkSize=chunkSize)
                failList.extend(failListT)
                # retLists is a list of lists -
                logger.debug("status %r fail len %r ret len %r", ok,
                             len(failListT), len(retListsT))
                for jj in range(numResults):
                    retLists[jj].extend(retListsT[jj])
                diagList.extend(diagListT)
            logger.debug("Scan failed path list %r", failList)
            logger.debug(
                "Scan path list length %d failed path list length %d",
                len(locatorObjList), len(failList))
            logger.debug("Returned metadata length %r", len(retLists[0]))
            #
            if failedFilePath and failList:
                wOk = self.__mU.doExport(failedFilePath,
                                         self.__rpP.getLocatorPaths(failList),
                                         fmt="list")
                logger.debug("Writing scan failure path list to %s status %r",
                             failedFilePath, wOk)
            #
            if scanType == "incr":
                scanDataD = self.__mU.doImport(scanDataFilePath,
                                               fmt="pickle",
                                               default=None) or {}
                logger.debug("Imported scan data with keys %r",
                             list(scanDataD.keys()))
            else:
                scanDataD = {}
            #
            if scanDataFilePath and retLists[0]:
                for ssTup in retLists[0]:
                    cId = ssTup.containerId
                    if scanType == "full" and cId in scanDataD:
                        logger.error("Duplicate container id %s in %r and %r",
                                     cId, ssTup.fromPath,
                                     scanDataD[cId].fromPath)
                    #
                    scanDataD[cId] = ssTup

                ok = self.__mU.doExport(scanDataFilePath,
                                        scanDataD,
                                        fmt="pickle")
                tscanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
                ok = tscanDataD == scanDataD

            self.__end(startTime, "scanning operation with status " + str(ok))

            #
            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        return False

    def evalScan(self,
                 scanDataFilePath,
                 evalJsonFilePath,
                 evalType="data_type"):

        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        if evalType in ["data_type"]:
            rD = self.__evalScanDataType(scanDataD)
        elif evalType in ["data_coverage"]:
            rD, _ = self.__evalScanDataCoverage(scanDataD)
        else:
            logger.debug("Unknown evalType %r", evalType)
        ok = self.__mU.doExport(evalJsonFilePath, rD, fmt="json")

        return ok

    def evalScanItem(self, scanDataFilePath, evalFilePath):
        scanDataD = self.__mU.doImport(scanDataFilePath, fmt="pickle")
        _, cL = self.__evalScanDataCoverage(scanDataD)
        ok = self.__mU.doExport(evalFilePath, cL, fmt="list")
        return ok

    def __evalScanDataType(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """
        # Populated as sD[catName][atName] -> {minWidth, maxWidth, minPrec, maxPrec, count}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "minWidth": svTup.minWidth,
                            "maxWidth": svTup.maxWidth,
                            "minPrec": svTup.minPrec,
                            "maxPrec": svTup.maxPrec,
                            "count": 1
                        }
                        continue
                    sD[catName][svTup.atName]["minWidth"] = min(
                        sD[catName][svTup.atName]["minWidth"], svTup.minWidth)
                    sD[catName][svTup.atName]["maxWidth"] = max(
                        sD[catName][svTup.atName]["maxWidth"], svTup.maxWidth)
                    sD[catName][svTup.atName]["minPrec"] = min(
                        sD[catName][svTup.atName]["minPrec"], svTup.minPrec)
                    sD[catName][svTup.atName]["maxPrec"] = max(
                        sD[catName][svTup.atName]["maxPrec"], svTup.maxPrec)
                    sD[catName][svTup.atName]["count"] += 1
        return sD

    def __evalScanDataCoverage(self, scanDataD):
        """
        ScanValue = collections.namedtuple('ScanValue', 'containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec')
        ScanSummary = collections.namedtuple('ScanSummary', 'containerId, fromPath, scanDate, scanCategoryDict')

        """

        # Populated as sD[catName][atName] -> {"count": int, "instances": [containerId, ...]}
        sD = {}
        for cId in scanDataD:
            ssTup = scanDataD[cId]
            dD = ssTup.scanCategoryDict
            for catName in dD:
                if catName not in sD:
                    sD[catName] = {}
                for svTup in dD[catName]:
                    if svTup.atName not in sD[catName]:
                        sD[catName][svTup.atName] = {
                            "count": 0,
                            "instances": []
                        }
                    sD[catName][svTup.atName]["instances"].append(
                        svTup.containerId)
                    sD[catName][svTup.atName]["count"] += 1
        cL = []
        for catName, aD in sD.items():
            for atName, tD in aD.items():
                cL.append("%s\t%s" %
                          ("_" + catName + "." + atName, tD["count"]))
        return sD, cL

    def scanWorker(self, dataList, procName, optionsD, workingDir):
        """Multi-proc worker method for scanning repository data files-"""
        try:
            _ = workingDir
            startTime = self.__begin(message=procName)
            # Recover common options

            scanType = optionsD["scanType"]
            contentType = optionsD["contentType"]
            #
            successList = []
            retList = []

            containerList = self.__getContainerList(dataList)
            for container in containerList:
                ret = self.__scanContainer(container)
                successList.append(ret.fromPath)
                retList.append(ret)
            #

            logger.debug(
                "%s scanType %s contentType %spathlist length %d containerList length %d",
                procName, scanType, contentType, len(dataList),
                len(containerList))

            ok = len(successList) == len(dataList)
            #
            self.__end(startTime, procName + " with status " + str(ok))
            return successList, retList, []

        except Exception as e:
            logger.error("Failing with dataList %r", dataList)
            logger.exception("Failing with %s", str(e))

        return [], [], []

    def __getContainerList(self, locatorObjList):
        """"""
        utcnow = datetime.datetime.utcnow()
        ts = utcnow.strftime("%Y-%m-%d:%H:%M:%S")
        cL = []
        myContainerList = self.__rpP.getContainerList(locatorObjList)
        for loc in locatorObjList:
            myContainerList = self.__rpP.getContainerList([loc])
            lPathL = self.__rpP.getLocatorPaths([loc])
            for cA in myContainerList:
                dc = DataCategory("rcsb_load_status",
                                  ["name", "load_date", "locator"],
                                  [[cA.getName(), ts, lPathL[0]]])
                logger.debug("data category %r", dc)
                cA.append(dc)
                cL.append(cA)
        return cL

    def __scanContainer(self, container):
        """Scan the input container for

        Get the file name -
        """
        cName = container.getName()
        loadStatusObj = container.getObj("rcsb_load_status")
        lName = loadStatusObj.getValue(attributeName="name", rowIndex=0)
        lFilePath = loadStatusObj.getValue(attributeName="locator", rowIndex=0)
        lDate = loadStatusObj.getValue(attributeName="load_date", rowIndex=0)
        #
        oD = {}
        for objName in container.getObjNameList():
            if objName == "rcsb_load_status":
                continue
            obj = container.getObj(objName)
            afD = self.__attributeDataTypeD[
                objName] if objName in self.__attributeDataTypeD else {}
            atNameList = obj.getAttributeList()
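            # Sentinel bounds: widths/precisions start at (100000, -1) and tighten as values are scanned.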
            wMin = {atName: 100000 for atName in atNameList}
            wMax = {atName: -1 for atName in atNameList}
            pMin = {atName: 100000 for atName in atNameList}
            pMax = {atName: -1 for atName in atNameList}
            for row in obj.getRowList():
                for ii, val in enumerate(row):
                    valLen = len(val)
                    if (valLen == 0) or (val == "?") or (val == "."):
                        continue
                    atName = atNameList[ii]
                    wMin[atName] = min(wMin[atName], valLen)
                    wMax[atName] = max(wMax[atName], valLen)
                    if atName in afD and afD[atName] == "float":
                        vPrec = 0
                        try:
                            fields = val.split(".")
                            vPrec = len(fields[1])
                            pMin[atName] = min(pMin[atName], vPrec)
                            pMax[atName] = max(pMax[atName], vPrec)
                        except Exception as e:
                            logger.debug("Failed to process float %s %r %r %s",
                                         atName, val, vPrec, str(e))
                            pMin[atName] = 0
                            pMax[atName] = 0
                        logger.debug("Got float for %s %r %r", atName, val,
                                     vPrec)
                    else:
                        pMin[atName] = 0
                        pMax[atName] = 0

            # ScanValue - containerId, catName, atName, minWidth, maxWidth, minPrec, maxPrec
            oD[objName] = [
                ScanValue(cName, objName, atN, wMin[atN], wMax[atN], pMin[atN],
                          pMax[atN]) for atN in wMax if wMax[atN] != -1
            ]
        # ScanSummary - containerId, fromPath, scanDate, scanCategoryDict
        #
        ret = ScanSummary(lName, lFilePath, lDate, oD)
        #
        return ret

    def __begin(self, message=""):
        startTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", message, ts)
        return startTime

    def __end(self, startTime, message=""):
        endTime = time.time()
        ts = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        delta = endTime - startTime
        logger.debug("Completed %s at %s (%.4f seconds)", message, ts, delta)