Ejemplo n.º 1
0
    def setUp(self):
        """Prepare configuration, dictionary locators and the dictionary API provider.

        Loads the example configuration relative to TOPDIR, resolves the PDBx,
        RCSB and validation-report dictionary locators from the
        "site_info_configuration" section, creates the dictionary API provider
        wrapper backed by TOPDIR/CACHE, and records the test start time.
        """
        self.__verbose = True
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                  "exdb-config-example.yml")
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=mockTopPath)
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        #
        self.__pathPdbxDictionaryFile = self.__cfgOb.getPath(
            "PDBX_DICT_LOCATOR", sectionName=configName)
        self.__pathRcsbDictionaryFile = self.__cfgOb.getPath(
            "RCSB_DICT_LOCATOR", sectionName=configName)
        self.__pathVrptDictionaryFile = self.__cfgOb.getPath(
            "VRPT_DICT_LOCATOR", sectionName=configName)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        # The keyword is spelled "cfgOb" (capital O), matching the other
        # keyword-style constructions of this wrapper; the misspelled "cfgob"
        # would not reach the configuration parameter.
        self.__dP = DictionaryApiProviderWrapper(self.__cachePath,
                                                 cfgOb=self.__cfgOb,
                                                 configName=configName,
                                                 useCache=True)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    def __runContentType(self, contentType, mockLength, mergeContent):
        """Apply the dictionary methods to each container of the input content type.

        Args:
            contentType (str): content type used to select the dictionary API
                and the repository locators
            mockLength (int): minimum number of locators that must be found
            mergeContent: value forwarded as mergeContentTypes to the
                repository provider

        Any exception (including a failed assertion) is logged and converted
        into a test failure.
        """
        try:
            apiProvider = DictionaryApiProviderWrapper(
                self.__cfgOb, self.__cachePath, useCache=True)
            dictionaryApi = apiProvider.getApiByName(contentType)
            resourceProvider = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                siftsAbbreviated="TEST",
            )
            methodRunner = DictMethodRunner(
                dictionaryApi,
                modulePathMap=self.__modulePathMap,
                resourceProvider=resourceProvider,
            )
            locators = self.__rpP.getLocatorObjList(
                contentType=contentType, mergeContentTypes=mergeContent)
            containers = self.__rpP.getContainerList(locators)
            logger.debug("Length of locator list %d\n", len(locators))
            self.assertGreaterEqual(len(locators), mockLength)
            for dataContainer in containers:
                containerName = dataContainer.getName()
                logger.debug("Processing container %s", containerName)
                methodRunner.apply(dataContainer)
                if not self.__export:
                    continue
                # Optionally save the container after the methods have run.
                outputPath = os.path.join(
                    HERE, "test-output", containerName + "-with-method.cif")
                self.__mU.doExport(outputPath, [dataContainer], fmt="mmcif")
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()
Ejemplo n.º 3
0
 def __fetchDictionaryApi(self,
                          cfgOb,
                          configName,
                          cachePath,
                          useCache=None,
                          **kwargs):
     """Return the dictionary API for a schema name.

     A new DictionaryApiProviderWrapper is created on every call and kept
     on the instance. The schema is taken from the "schemaName" keyword
     argument and defaults to "pdbx_core".
     """
     logger.debug("configName %s cachePath %s kwargs %r", configName,
                  cachePath, kwargs)
     targetSchema = kwargs.get("schemaName", "pdbx_core")
     self.__dApiW = DictionaryApiProviderWrapper(
         cfgOb, cachePath, useCache=useCache)
     return self.__dApiW.getApiByName(targetSchema)
Ejemplo n.º 4
0
def scanRepo(
    cfgOb,
    contentType,
    scanDataFilePath,
    numProc,
    chunkSize,
    fileLimit,
    scanType="full",
    inputPathList=None,
    pathListFilePath=None,
    dataCoverageFilePath=None,
    dataCoverageItemFilePath=None,
    dataTypeFilePath=None,
    failedFilePath=None,
    cachePath=None,
):
    """Scan the data repository of the input content type and store type and coverage details.

    Args:
        cfgOb (object): configuration object passed to the dictionary API
            provider and to ScanRepoUtil
        contentType (str): content type naming both the dictionary API and
            the repository content to scan
        scanDataFilePath (str): path where the scan results are stored
        numProc (int): forwarded to ScanRepoUtil
        chunkSize (int): forwarded to ScanRepoUtil
        fileLimit (int): forwarded to ScanRepoUtil
        scanType (str, optional): forwarded to scanContentType. Defaults to "full".
        inputPathList (list, optional): forwarded to scanContentType
        pathListFilePath (str, optional): forwarded as saveInputFileListPath
        dataCoverageFilePath (str, optional): if set, run the "data_coverage" evaluation
        dataCoverageItemFilePath (str, optional): if set, run the item-level evaluation
        dataTypeFilePath (str, optional): if set, run the "data_type" evaluation
        failedFilePath (str, optional): forwarded to scanContentType
        cachePath (str, optional): cache path for the dictionary API provider
            and work path for ScanRepoUtil

    Returns:
        Combined status of the scan and of every requested evaluation step
        (falsy if any step fails), or False if an exception is raised.
    """
    try:
        dP = DictionaryApiProviderWrapper(cachePath, cfgOb=cfgOb, useCache=True)
        dictApi = dP.getApiByName(contentType)
        # Map each category (sorted) to its {attribute name: type code} dictionary.
        attributeDataTypeD = OrderedDict()
        for catName in sorted(dictApi.getCategoryList()):
            attributeDataTypeD[catName] = {
                atName: dictApi.getTypeCode(catName, atName)
                for atName in sorted(dictApi.getAttributeNameList(catName))
            }
        #
        sr = ScanRepoUtil(
            cfgOb,
            attributeDataTypeD=attributeDataTypeD,
            numProc=numProc,
            chunkSize=chunkSize,
            fileLimit=fileLimit,
            workPath=cachePath,
        )
        ok = sr.scanContentType(
            contentType,
            scanType=scanType,
            inputPathList=inputPathList,
            scanDataFilePath=scanDataFilePath,
            failedFilePath=failedFilePath,
            saveInputFileListPath=pathListFilePath,
        )
        # Every requested evaluation still runs, but an earlier failure is
        # no longer masked by the status of a later step.
        if dataTypeFilePath:
            ok = sr.evalScan(scanDataFilePath, dataTypeFilePath, evalType="data_type") and ok
        if dataCoverageFilePath:
            ok = sr.evalScan(scanDataFilePath, dataCoverageFilePath, evalType="data_coverage") and ok
        if dataCoverageItemFilePath:
            ok = sr.evalScanItem(scanDataFilePath, dataCoverageItemFilePath) and ok

        return ok
    except Exception as e:
        logger.exception("Failing with %s", str(e))
    # Explicit failure status instead of an implicit None.
    return False
    def testMethodRunnerSetup(self):
        """Verify that a DictMethodRunner can be constructed for the "pdbx" dictionary API."""
        try:
            apiProvider = DictionaryApiProviderWrapper(
                self.__cfgOb, self.__cachePath, useCache=True)
            pdbxApi = apiProvider.getApiByName("pdbx")
            resourceProvider = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                siftsAbbreviated="TEST",
            )
            methodRunner = DictMethodRunner(
                pdbxApi,
                modulePathMap=self.__modulePathMap,
                resourceProvider=resourceProvider,
            )
            self.assertTrue(methodRunner is not None)

        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()
Ejemplo n.º 6
0
 def testWrapperByLocators(self):
     """Fetch the dictionary API by locator list and check its title and revision count."""
     try:
         wrapper = DictionaryApiProviderWrapper(
             self.__cfgOb, self.__cachePath, useCache=False)
         api = wrapper.getApiByLocators(self.__dictLocators)
         cacheOk = api.testCache()
         self.assertTrue(cacheOk)
         dictionaryTitle = api.getDictionaryTitle()
         logger.debug("Title %r", dictionaryTitle)
         expectedTitle = "mmcif_pdbx.dic,rcsb_mmcif_ext.dic,vrpt_mmcif_ext.dic"
         self.assertEqual(dictionaryTitle, expectedTitle)
         revisionCount = api.getDictionaryRevisionCount()
         logger.debug("Number of dictionary revisions (numRev) %r",
                      revisionCount)
         self.assertGreater(revisionCount, 220)
     except Exception as err:
         logger.exception("Failing with %s", str(err))
         self.fail()
Ejemplo n.º 7
0
    def __testPrepDocumentsFromContainers(
            self,
            inputPathList,
            databaseName,
            collectionName,
            styleType="rowwise_by_name_with_cardinality",
            mergeContentTypes=None):
        """Build loadable documents for a collection from repository containers.

        Args:
            inputPathList (list): locators passed to the repository provider
                to obtain the data containers
            databaseName (str): name used to look up both the schema
                definition and the dictionary API
            collectionName (str): collection whose include/exclude lists and
                slice filter drive document preparation
            styleType (str, optional): forwarded to processDocuments
            mergeContentTypes (list, optional): used only to label the
                exported JSON file name

        Returns:
            tuple: (docList, containerNameList)

        Any exception is logged and converted into a test failure.
        """
        try:
            schemaDef, _, _, _ = self.__schP.getSchemaInfo(databaseName)
            #
            apiProvider = DictionaryApiProviderWrapper(
                self.__cachePath,
                cfgOb=self.__cfgOb,
                configName=self.__configName,
                useCache=True,
            )
            dictionaryApi = apiProvider.getApiByName(databaseName)
            resourceProvider = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            methodRunner = DictMethodRunner(
                dictionaryApi,
                modulePathMap=self.__modulePathMap,
                resourceProvider=resourceProvider,
            )
            #
            transformFactory = DataTransformFactory(
                schemaDefAccessObj=schemaDef, filterType=self.__fTypeRow)
            dataPrep = SchemaDefDataPrep(
                schemaDefAccessObj=schemaDef,
                dtObj=transformFactory,
                workPath=self.__cachePath,
                verbose=self.__verbose,
            )
            containers = self.__rpP.getContainerList(inputPathList)
            # Run the dictionary methods on each container before document preparation.
            for dataContainer in containers:
                containerName = dataContainer.getName()
                logger.debug("Processing container %s", containerName)
                methodRunner.apply(dataContainer)
                if not self.__export:
                    continue
                outputPath = os.path.join(
                    HERE, "test-output", containerName + "-with-method.cif")
                self.__mU.doExport(outputPath, [dataContainer], fmt="mmcif")
            #
            excludedIds = schemaDef.getCollectionExcluded(collectionName)
            selectedIds = schemaDef.getCollectionSelected(collectionName)
            collectionSlice = schemaDef.getCollectionSliceFilter(collectionName)
            dataPrep.setSchemaIdExcludeList(excludedIds)
            dataPrep.setSchemaIdIncludeList(selectedIds)
            #
            logger.debug("%s (%r) exclude list %r", collectionName,
                         collectionSlice, excludedIds)
            logger.debug("%s (%r) include list %r", collectionName,
                         collectionSlice, selectedIds)
            docList, containerNameList, _ = dataPrep.processDocuments(
                containers,
                styleType=styleType,
                filterType=self.__fTypeRow,
                dataSelectors=["PUBLIC_RELEASE"],
                sliceFilter=collectionSlice,
                collectionName=collectionName,
            )

            docList = dataPrep.addDocumentPrivateAttributes(
                docList, collectionName)
            docList = dataPrep.addDocumentSubCategoryAggregates(
                docList, collectionName)
            #
            if mergeContentTypes:
                mergeLabel = "-".join(mergeContentTypes)
            else:
                mergeLabel = ""
            if self.__export and docList:
                exportName = "prep-%s-%s-%s.json" % (
                    databaseName, collectionName, mergeLabel)
                exportPath = os.path.join(HERE, "test-output", exportName)
                self.__mU.doExport(exportPath, docList, fmt="json", indent=3)
                logger.debug("Exported %r", exportPath)
            #
            return docList, containerNameList

        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()
Ejemplo n.º 8
0
    def __init__(
        self,
        cfgOb,
        schemaDefObj,
        cfgSectionName="site_info_configuration",
        dbCon=None,
        cachePath=".",
        workPath=".",
        cleanUp=False,
        warnings="default",
        verbose=True,
        restoreUseStash=True,
        restoreUseGit=True,
        providerTypeExclude=True,
    ):
        """Record the loader settings and build the data preparation and method runner helpers.

        Args:
            cfgOb (object): configuration object
            schemaDefObj (object): schema definition access object
            cfgSectionName (str, optional): configuration section used to
                read the method helper module path map and passed to the
                dictionary API provider
            dbCon (object, optional): stored on the instance
            cachePath (str, optional): cache path; also used as the work path
                of the data preparation helper
            workPath (str, optional): stored on the instance
            cleanUp (bool, optional): stored on the instance
            warnings (str, optional): stored as the warning action
            verbose (bool, optional): verbosity flag
            restoreUseStash (bool, optional): forwarded to DictMethodResourceProvider
            restoreUseGit (bool, optional): forwarded to DictMethodResourceProvider
            providerTypeExclude (optional): forwarded to DictMethodResourceProvider
        """
        # Settings taken directly from the arguments.
        self.__cfgOb = cfgOb
        self.__sD = schemaDefObj
        self.__dbCon = dbCon
        self.__cachePath = cachePath
        self.__workPath = workPath
        self.__cleanUp = cleanUp
        self.__warningAction = warnings
        self.__verbose = verbose
        # Fixed internal settings.
        self.__debug = False
        self.__pathList = []
        self.__colSep = "&##&\t"
        self.__rowSep = "$##$\n"
        self.__fTypeRow = "skip-max-width"
        self.__fTypeCol = "skip-max-width"
        #
        transformFactory = DataTransformFactory(
            schemaDefAccessObj=self.__sD, filterType=self.__fTypeRow)
        self.__sdp = SchemaDefDataPrep(
            schemaDefAccessObj=self.__sD,
            dtObj=transformFactory,
            workPath=self.__cachePath,
            verbose=self.__verbose,
        )
        self.__rpP = RepositoryProvider(
            cfgOb=self.__cfgOb, cachePath=self.__cachePath)
        #
        schemaName = self.__sD.getName()
        modulePathMap = self.__cfgOb.get(
            "DICT_METHOD_HELPER_MODULE_PATH_MAP", sectionName=cfgSectionName)
        apiProvider = DictionaryApiProviderWrapper(
            self.__cachePath,
            cfgOb=self.__cfgOb,
            configName=cfgSectionName,
            useCache=True,
        )
        dictionaryApi = apiProvider.getApiByName(schemaName)
        resourceProvider = DictMethodResourceProvider(
            self.__cfgOb,
            cachePath=self.__cachePath,
            restoreUseStash=restoreUseStash,
            restoreUseGit=restoreUseGit,
            providerTypeExclude=providerTypeExclude,
        )
        self.__dmh = DictMethodRunner(
            dictionaryApi,
            modulePathMap=modulePathMap,
            resourceProvider=resourceProvider,
        )
Ejemplo n.º 9
0
class DictMethodResourceProvider(SingletonClass):
    """Resource provider for DictMethodHelper tools."""
    def __init__(self, cfgOb, **kwargs):
        """Record configuration details and register the resource fetch methods.

        Arguments:
            cfgOb {object} -- configuration object

        Keyword arguments:
            configName {str} -- configuration section name (default: default section name of cfgOb)
            cachePath {str} -- cache path handed to the fetch methods (default: '.')
            siftsAbbreviated {str} -- value stored for the SIFTS summary provider (default: 'TEST')

        """
        self.__cfgOb = cfgOb
        defaultSectionName = self.__cfgOb.getDefaultSectionName()
        self.__configName = kwargs.get("configName", defaultSectionName)
        self.__cachePath = kwargs.get("cachePath", ".")
        self.__siftsAbbreviated = kwargs.get("siftsAbbreviated", "TEST")
        #
        # Slots for provider instances; all start out empty.
        self.__taxU = self.__ecU = self.__scopU = self.__cathU = None
        self.__dbU = self.__residU = self.__psimodU = None
        self.__ccU = self.__ccmU = self.__commonU = None
        self.__dApiW = self.__atcP = self.__ssP = self.__vrptP = None
        self.__crP = self.__jtaP = self.__pcP = self.__phP = None
        self.__rlsP = self.__niP = self.__glyP = self.__ggP = None
        self.__pfP = self.__birdP = None
        #
        # Resource name -> fetch method; cacheResources() walks this table in order.
        self.__resourcesD = {
            "SiftsSummaryProvider instance": self.__fetchSiftsSummaryProvider,
            "Dictionary API instance (pdbx_core)": self.__fetchDictionaryApi,
            "TaxonomyProvider instance": self.__fetchTaxonomyProvider,
            "ScopProvider instance": self.__fetchScopProvider,
            "CathProvider instance": self.__fetchCathProvider,
            "EnzymeProvider instance": self.__fetchEnzymeProvider,
            "DrugBankProvider instance": self.__fetchDrugBankProvider,
            "ResidProvider instance": self.__fetchResidProvider,
            "PsiModProvider instance": self.__fetchPsiModProvider,
            "ChemCompModelProvider instance": self.__fetchChemCompModelProvider,
            "ChemCompProvider instance": self.__fetchChemCompProvider,
            "AtcProvider instance": self.__fetchAtcProvider,
            "DictMethodCommonUtils instance": self.__fetchCommonUtils,
            "ValidationProvider instance": self.__fetchValidationProvider,
            "CitationReferenceProvider instance": self.__fetchCitationReferenceProvider,
            "JournalTitleAbbreviationProvider instance": self.__fetchJournalTitleAbbreviationProvider,
            "PubChemProvider instance": self.__fetchPubChemProvider,
            "PharosProvider instance": self.__fetchPharosProvider,
            "RcsbLigandScoreProvider instance": self.__fetchRcsbLigandScoreProvider,
            "NeighborInteractionProvider instance": self.__fetchNeighborInteractionProvider,
            "GlycanProvider instance": self.__fetchGlycanProvider,
            "GlyGenProvider instance": self.__fetchGlyGenProvider,
            "PfamProvider instance": self.__fetchPfamProvider,
            "BirdProvider instance": self.__fetchBirdProvider,
        }
        logger.debug("Dictionary resource provider init completed")
        #

    def echo(self, msg):
        """Write the input message to the module logger at INFO level."""
        logger.info(msg)

    def getReferenceSequenceAlignmentOpt(self):
        """Return the REFERENCE_SEQUENCE_ALIGNMENTS configuration option (default "SIFTS")."""
        optionName = "REFERENCE_SEQUENCE_ALIGNMENTS"
        return self.__cfgOb.get(optionName,
                                sectionName=self.__configName,
                                default="SIFTS")

    def getResource(self, resourceName, default=None, useCache=True, **kwargs):
        """Return the named resource, or the default value for an unknown name.

        Arguments:
            resourceName {str} -- resource name (a key of the registered fetch methods)
            useCache (bool, optional): forwarded to the fetch method. Defaults to True.

        Keyword Arguments:
            default {obj} -- value returned for unsupported resource names (default: {None})

        Returns:
            [obj] -- resource object
        """
        logger.debug("Requesting resource %r", resourceName)
        if resourceName not in self.__resourcesD:
            logger.error("Request for unsupported resource %r returning %r",
                         resourceName, default)
            return default
        fetchMethod = self.__resourcesD[resourceName]
        return fetchMethod(self.__cfgOb,
                           self.__configName,
                           self.__cachePath,
                           useCache=useCache,
                           **kwargs)

    def cacheResources(self, useCache=False, **kwargs):
        """Invoke every registered fetch method and test the cache of each result.

        Args:
            useCache (bool, optional): forwarded to each fetch method; also
                selects the "CHECKING" (True) or "REBUILDING" (False) label
                used in log messages. Defaults to False.

        Returns:
            bool: True if every resource passes testCache() or False otherwise
        """
        allOk = True
        modeLabel = "CHECKING" if useCache else "REBUILDING"
        logger.info("Begin %s cache for %d resources", modeLabel,
                    len(self.__resourcesD))
        #
        for resourceName, fetchMethod in self.__resourcesD.items():
            startTime = time.time()
            logger.debug("Caching resources for %r", resourceName)
            provider = fetchMethod(self.__cfgOb,
                                   self.__configName,
                                   self.__cachePath,
                                   useCache=useCache,
                                   **kwargs)
            stepOk = provider.testCache()
            if not stepOk:
                logger.error("%s %s fails", modeLabel, resourceName)
            allOk = allOk and stepOk
            if not allOk:
                # Once any step has failed, report every subsequent step as well.
                logger.info(
                    "%s resource %r step status %r cumulative status %r",
                    modeLabel, resourceName, stepOk, allOk)
            self.__resourceUsageReport(resourceName, startTime)
        #
        logger.info("Completed %s %d resources with status %r", modeLabel,
                    len(self.__resourcesD), allOk)
        return allOk

    def __resourceUsageReport(self, resourceName, startTime):
        """Log the elapsed time and peak resident memory for a completed step.

        Arguments:
            resourceName {str} -- step label written to the log
            startTime {float} -- time.time() value taken when the step began
        """
        # ru_maxrss / 10**6 is labelled MB on Darwin and GB elsewhere
        # (presumably bytes on macOS versus kilobytes on Linux -- confirm).
        memoryUnit = "MB" if platform.system() == "Darwin" else "GB"
        peakRss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        elapsedSecs = time.time() - startTime
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.info(
            "Step %s completed at %s (%.4f secs/%.3f %s)",
            resourceName,
            timeStamp,
            elapsedSecs,
            peakRss / 10**6,
            memoryUnit,
        )

    def __fetchCitationReferenceProvider(self,
                                         cfgOb,
                                         configName,
                                         cachePath,
                                         useCache=True,
                                         **kwargs):
        """Return the CitationReferenceProvider, creating it if none is held yet.

        A new provider is rooted at the CITATION_REFERENCE_CACHE_DIR option
        joined onto cachePath; useCache and kwargs only take effect when the
        provider is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__crP:
            return self.__crP
        cacheDirName = cfgOb.get("CITATION_REFERENCE_CACHE_DIR",
                                 sectionName=configName)
        providerPath = os.path.join(cachePath, cacheDirName)
        self.__crP = CitationReferenceProvider(cachePath=providerPath,
                                               useCache=useCache,
                                               **kwargs)
        return self.__crP

    def __fetchJournalTitleAbbreviationProvider(self,
                                                cfgOb,
                                                configName,
                                                cachePath,
                                                useCache=True,
                                                **kwargs):
        """Return the JournalTitleAbbreviationProvider, creating it if none is held yet.

        A new provider is rooted at the CITATION_REFERENCE_CACHE_DIR option
        joined onto cachePath; useCache and kwargs only take effect when the
        provider is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__jtaP:
            return self.__jtaP
        cacheDirName = cfgOb.get("CITATION_REFERENCE_CACHE_DIR",
                                 sectionName=configName)
        providerPath = os.path.join(cachePath, cacheDirName)
        self.__jtaP = JournalTitleAbbreviationProvider(cachePath=providerPath,
                                                       useCache=useCache,
                                                       **kwargs)
        return self.__jtaP

    def __fetchTaxonomyProvider(self,
                                cfgOb,
                                configName,
                                cachePath,
                                useCache=True,
                                **kwargs):
        """Return the TaxonomyProvider, creating it if none is held yet.

        A new provider uses the NCBI_TAXONOMY_CACHE_DIR option joined onto
        cachePath as its taxonomy directory; useCache and kwargs only take
        effect when the provider is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__taxU:
            return self.__taxU
        cacheDirName = cfgOb.get("NCBI_TAXONOMY_CACHE_DIR",
                                 sectionName=configName)
        taxDirPath = os.path.join(cachePath, cacheDirName)
        self.__taxU = TaxonomyProvider(taxDirPath=taxDirPath,
                                       useCache=useCache,
                                       **kwargs)
        return self.__taxU

    def __fetchScopProvider(self,
                            cfgOb,
                            configName,
                            cachePath,
                            useCache=True,
                            **kwargs):
        """Return the ScopClassificationProvider, creating it if none is held yet.

        A new provider uses the STRUCT_DOMAIN_CLASSIFICATION_CACHE_DIR option
        joined onto cachePath as its directory; useCache and kwargs only take
        effect when the provider is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__scopU:
            return self.__scopU
        cacheDirName = cfgOb.get("STRUCT_DOMAIN_CLASSIFICATION_CACHE_DIR",
                                 sectionName=configName)
        scopDirPath = os.path.join(cachePath, cacheDirName)
        self.__scopU = ScopClassificationProvider(scopDirPath=scopDirPath,
                                                  useCache=useCache,
                                                  **kwargs)
        return self.__scopU

    def __fetchCathProvider(self,
                            cfgOb,
                            configName,
                            cachePath,
                            useCache=True,
                            **kwargs):
        """Return the CathClassificationProvider, creating it if none is held yet.

        A new provider uses the STRUCT_DOMAIN_CLASSIFICATION_CACHE_DIR option
        joined onto cachePath as its directory; useCache and kwargs only take
        effect when the provider is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__cathU:
            return self.__cathU
        cacheDirName = cfgOb.get("STRUCT_DOMAIN_CLASSIFICATION_CACHE_DIR",
                                 sectionName=configName)
        cathDirPath = os.path.join(cachePath, cacheDirName)
        self.__cathU = CathClassificationProvider(cathDirPath=cathDirPath,
                                                  useCache=useCache,
                                                  **kwargs)
        return self.__cathU

    def __fetchEnzymeProvider(self,
                              cfgOb,
                              configName,
                              cachePath,
                              useCache=True,
                              **kwargs):
        """Return the EnzymeDatabaseProvider, creating it if none is held yet.

        A new provider uses the ENZYME_CLASSIFICATION_CACHE_DIR option joined
        onto cachePath as its directory; useCache and kwargs only take effect
        when the provider is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__ecU:
            return self.__ecU
        cacheDirName = cfgOb.get("ENZYME_CLASSIFICATION_CACHE_DIR",
                                 sectionName=configName)
        enzymeDirPath = os.path.join(cachePath, cacheDirName)
        self.__ecU = EnzymeDatabaseProvider(enzymeDirPath=enzymeDirPath,
                                            useCache=useCache,
                                            **kwargs)
        return self.__ecU

    #
    def __fetchDrugBankProvider(self,
                                cfgOb,
                                configName,
                                cachePath,
                                useCache=True,
                                **kwargs):
        """Return the DrugBankProvider, creating it if none is held yet.

        When the DRUGBANK_MOCK_URL_TARGET option resolves to a value, it is
        passed to the new provider as urlTarget. The credentials are read from
        the _DRUGBANK_AUTH_USERNAME and _DRUGBANK_AUTH_PASSWORD options.
        useCache and kwargs only take effect when the provider is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if not self.__dbU:
            # If a mock DrugBank URL is provided add this as an argument.
            mockUrlTarget = cfgOb.getPath("DRUGBANK_MOCK_URL_TARGET",
                                          sectionName=configName)
            if mockUrlTarget:
                kwargs["urlTarget"] = mockUrlTarget
                # Reported only when a mock source is really in use.
                logger.info("Using mock DrugBank source file %r",
                            mockUrlTarget)
            un = cfgOb.get("_DRUGBANK_AUTH_USERNAME", sectionName=configName)
            pw = cfgOb.get("_DRUGBANK_AUTH_PASSWORD", sectionName=configName)
            self.__dbU = DrugBankProvider(cachePath=cachePath,
                                          useCache=useCache,
                                          username=un,
                                          password=pw,
                                          **kwargs)
        return self.__dbU

    #
    def __fetchResidProvider(self,
                             cfgOb,
                             configName,
                             cachePath,
                             useCache=True,
                             **kwargs):
        """Return the ResidProvider, creating it under cachePath if none is held yet.

        useCache and kwargs only take effect when the provider is created.
        """
        _ = cfgOb  # not needed here; accepted for the common fetch-method signature
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__residU:
            return self.__residU
        self.__residU = ResidProvider(cachePath=cachePath,
                                      useCache=useCache,
                                      **kwargs)
        return self.__residU

    def __fetchPsiModProvider(self,
                              cfgOb,
                              configName,
                              cachePath,
                              useCache=True,
                              **kwargs):
        """Return the PsiModProvider, creating it under cachePath if none is held yet.

        useCache and kwargs only take effect when the provider is created.
        """
        _ = cfgOb  # not needed here; accepted for the common fetch-method signature
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__psimodU:
            return self.__psimodU
        self.__psimodU = PsiModProvider(cachePath=cachePath,
                                        useCache=useCache,
                                        **kwargs)
        return self.__psimodU

    def __fetchChemCompModelProvider(self,
                                     cfgOb,
                                     configName,
                                     cachePath,
                                     useCache=True,
                                     **kwargs):
        """Return the ChemCompModelProvider, creating it under cachePath if none is held yet.

        useCache and kwargs only take effect when the provider is created.
        """
        _ = cfgOb  # not needed here; accepted for the common fetch-method signature
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__ccmU:
            return self.__ccmU
        self.__ccmU = ChemCompModelProvider(cachePath=cachePath,
                                            useCache=useCache,
                                            **kwargs)
        return self.__ccmU

    def __fetchChemCompProvider(self,
                                cfgOb,
                                configName,
                                cachePath,
                                useCache=True,
                                **kwargs):
        """Return the ChemCompProvider, creating it under cachePath if none is held yet.

        useCache and kwargs only take effect when the provider is created.
        """
        _ = cfgOb  # not needed here; accepted for the common fetch-method signature
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__ccU:
            return self.__ccU
        self.__ccU = ChemCompProvider(cachePath=cachePath,
                                      useCache=useCache,
                                      **kwargs)
        return self.__ccU

    def __fetchAtcProvider(self,
                           cfgOb,
                           configName,
                           cachePath,
                           useCache=True,
                           **kwargs):
        """Return the AtcProvider, creating it under cachePath if none is held yet.

        useCache and kwargs only take effect when the provider is created.
        """
        _ = cfgOb  # not needed here; accepted for the common fetch-method signature
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__atcP:
            return self.__atcP
        self.__atcP = AtcProvider(cachePath=cachePath,
                                  useCache=useCache,
                                  **kwargs)
        return self.__atcP

    def __fetchSiftsSummaryProvider(self,
                                    cfgOb,
                                    configName,
                                    cachePath,
                                    useCache=True,
                                    **kwargs):
        """Return the SiftsSummaryProvider, creating it if none is held yet.

        For a new provider the source directory comes from the
        SIFTS_SUMMARY_DATA_PATH option and the cache directory from the
        SIFTS_SUMMARY_CACHE_DIR option, each joined onto cachePath. The
        siftsAbbreviated setting stored at construction is passed as
        'abbreviated'. useCache and kwargs only take effect when the provider
        is created.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__ssP:
            return self.__ssP
        sourcePath = cfgOb.getPath("SIFTS_SUMMARY_DATA_PATH",
                                   sectionName=configName)
        srcDirPath = os.path.join(cachePath, sourcePath)
        cacheDirName = cfgOb.get("SIFTS_SUMMARY_CACHE_DIR",
                                 sectionName=configName)
        cacheDirPath = os.path.join(cachePath, cacheDirName)
        logger.debug("ssP %r %r", srcDirPath, cacheDirPath)
        self.__ssP = SiftsSummaryProvider(srcDirPath=srcDirPath,
                                          cacheDirPath=cacheDirPath,
                                          useCache=useCache,
                                          abbreviated=self.__siftsAbbreviated,
                                          **kwargs)
        logger.debug("ssP entry count %d", self.__ssP.getEntryCount())
        return self.__ssP

    def __fetchValidationProvider(self,
                                  cfgOb,
                                  configName,
                                  cachePath,
                                  useCache=True,
                                  **kwargs):
        """Return the ValidationReportProvider, creating it if none is held yet.

        A new provider takes its URL target from the VRPT_DICT_MAPPING_LOCATOR
        option and its directory from the DICTIONARY_CACHE_DIR option joined
        onto cachePath. kwargs are logged but not forwarded to the provider.
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName,
                     cachePath, kwargs)
        if self.__vrptP:
            return self.__vrptP
        urlTarget = cfgOb.get("VRPT_DICT_MAPPING_LOCATOR",
                              sectionName=configName)
        cacheDirName = cfgOb.get("DICTIONARY_CACHE_DIR",
                                 sectionName=configName)
        self.__vrptP = ValidationReportProvider(
            dirPath=os.path.join(cachePath, cacheDirName),
            urlTarget=urlTarget,
            useCache=useCache)
        return self.__vrptP

    def __fetchCommonUtils(
        self, cfgOb, configName, cachePath, useCache=None, **kwargs
    ):
        """Return the DictMethodCommonUtils held by this object, creating it on first use.

        Args:
            cfgOb (object): configuration object (unused here)
            configName (str): configuration section name (only logged)
            cachePath (str): cache path (only logged)
            useCache (bool, optional): unused here. Defaults to None.
            **kwargs: keyword arguments forwarded to DictMethodCommonUtils

        Returns:
            (object): the DictMethodCommonUtils instance stored on this object
        """
        logger.debug("configName %s cachePath %r kwargs %r", configName, cachePath, kwargs)
        _ = (cfgOb, useCache)
        if self.__commonU:
            return self.__commonU
        self.__commonU = DictMethodCommonUtils(**kwargs)
        return self.__commonU

    def __fetchDictionaryApi(
        self, cfgOb, configName, cachePath, useCache=None, **kwargs
    ):
        """Return the dictionary API for the requested schema name.

        A new DictionaryApiProviderWrapper is created and stored on this object
        on every call; the API is then looked up by name on that wrapper.

        Args:
            cfgOb (object): configuration object passed to the wrapper
            configName (str): configuration section name (only logged)
            cachePath (str): cache path passed to the wrapper
            useCache (bool, optional): passed to the wrapper. Defaults to None.
            **kwargs: may carry "schemaName"; "pdbx_core" is used when absent

        Returns:
            (object): result of getApiByName() for the schema name
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        schemaName = kwargs["schemaName"] if "schemaName" in kwargs else "pdbx_core"
        wrapper = DictionaryApiProviderWrapper(cfgOb, cachePath, useCache=useCache)
        self.__dApiW = wrapper
        return wrapper.getApiByName(schemaName)

    def __fetchPubChemProvider(
        self, cfgOb, configName, cachePath, useCache=True, **kwargs
    ):
        """Return the PubChemProvider held by this object, setting it up on first use.

        On first use a provider is created and its fromStash() method is called
        with the configured STASH_SERVER_URL, followed by reload() and a
        testCache() with a minimum count of 10.  If that test fails, fromStash()
        is repeated with STASH_SERVER_FALLBACK_URL and the cache is tested again
        with a minimum count of 0.  Any exception raised along the way is logged
        and suppressed.

        Args:
            cfgOb (object): configuration object supplying the stash options
            configName (str): configuration section name
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to True.
            **kwargs: logged only

        Returns:
            (object): the current value of the stored provider attribute
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        if self.__pcP:
            return self.__pcP
        #
        try:
            fallbackMinCount = 0
            stashUser = cfgOb.get("_STASH_AUTH_USERNAME", sectionName=configName)
            stashPassword = cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=configName)
            stashBasePath = cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=configName)
            primaryUrl = cfgOb.get("STASH_SERVER_URL", sectionName=configName)
            fallbackUrl = cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=configName)
            #
            provider = PubChemProvider(cachePath=cachePath, useCache=useCache)
            provider.fromStash(primaryUrl, stashBasePath, userName=stashUser, password=stashPassword)
            provider.reload()
            if not provider.testCache(minCount=10):
                # Primary source did not pass the cache test - try the fallback server.
                provider.fromStash(fallbackUrl, stashBasePath, userName=stashUser, password=stashPassword)
                provider.testCache(minCount=fallbackMinCount)
            #
            if provider:
                self.__pcP = provider
                identifiers = provider.getIdentifiers()
                logger.info("Fetched PubChem mapping dictionary (%d)", len(identifiers))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__pcP

    def __fetchPharosProvider(
        self, cfgOb, configName, cachePath, useCache=True, **kwargs
    ):
        """Return the PharosProvider held by this object, setting it up on first use.

        On first use a provider is created and its fromStash() method is called
        with the configured STASH_SERVER_URL, followed by reload() and a
        testCache() with a minimum count of 10.  If that test fails, fromStash()
        is repeated with STASH_SERVER_FALLBACK_URL and the cache is tested again
        with a minimum count of 0.  Any exception raised along the way is logged
        as a warning and suppressed.

        Args:
            cfgOb (object): configuration object supplying the stash options
            configName (str): configuration section name
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to True.
            **kwargs: logged only

        Returns:
            (object): the current value of the stored provider attribute
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        if self.__phP:
            return self.__phP
        # --
        try:
            fallbackMinCount = 0
            stashUser = cfgOb.get("_STASH_AUTH_USERNAME", sectionName=configName)
            stashPassword = cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=configName)
            stashBasePath = cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=configName)
            primaryUrl = cfgOb.get("STASH_SERVER_URL", sectionName=configName)
            fallbackUrl = cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=configName)
            #
            provider = PharosProvider(cachePath=cachePath, useCache=useCache)
            provider.fromStash(primaryUrl, stashBasePath, userName=stashUser, password=stashPassword)
            provider.reload()
            if not provider.testCache(minCount=10):
                # Primary source did not pass the cache test - try the fallback server.
                provider.fromStash(fallbackUrl, stashBasePath, userName=stashUser, password=stashPassword)
                provider.testCache(minCount=fallbackMinCount)
            #
            if provider:
                self.__phP = provider
                identifiers = provider.getIdentifiers()
                logger.info("Fetched Pharos ChEMBL identifiers (%d)", len(identifiers))
        except Exception as e:
            logger.warning("Failing with %s", str(e))
        #
        return self.__phP

    def __fetchRcsbLigandScoreProvider(
        self, cfgOb, configName, cachePath, useCache=None, **kwargs
    ):
        """Return the RcsbLigandScoreProvider held by this object, creating it on first use.

        Args:
            cfgOb (object): configuration object (unused here)
            configName (str): configuration section name (only logged)
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to None.
            **kwargs: logged only; not forwarded to the provider

        Returns:
            (object): the RcsbLigandScoreProvider instance stored on this object
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        _ = cfgOb
        if self.__rlsP:
            return self.__rlsP
        self.__rlsP = RcsbLigandScoreProvider(cachePath=cachePath, useCache=useCache)
        return self.__rlsP

    def __fetchNeighborInteractionProvider(
        self, cfgOb, configName, cachePath, useCache=True, **kwargs
    ):
        """Return the NeighborInteractionProvider held by this object, setting it up on first use.

        On first use a provider is created, then fromStash() and reload() are
        called on it.  The provider is stored only if its testCache() with a
        minimum count of 10 succeeds.  Any exception is logged as a warning and
        suppressed.

        Args:
            cfgOb (object): configuration object passed to the provider
            configName (str): configuration section name passed to the provider
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to True.
            **kwargs: logged only

        Returns:
            (object): the current value of the stored provider attribute
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        if self.__niP:
            return self.__niP
        # --
        try:
            requiredCount = 10
            provider = NeighborInteractionProvider(cfgOb, configName, cachePath=cachePath, useCache=useCache)
            provider.fromStash()
            provider.reload()
            if provider.testCache(minCount=requiredCount):
                self.__niP = provider
        except Exception as e:
            logger.warning("Failing with %s", str(e))
        #
        return self.__niP

    def __fetchGlycanProvider(
        self, cfgOb, configName, cachePath, useCache=True, **kwargs
    ):
        """Return the GlycanProvider held by this object, setting it up on first use.

        On first use a provider is created and its fromStash() method is called
        with the configured STASH_SERVER_URL, followed by reload() and a
        testCache() with a minimum count of 10.  If that test fails, fromStash()
        is repeated with STASH_SERVER_FALLBACK_URL and the cache is tested again
        with a minimum count of 0.  Any exception raised along the way is logged
        and suppressed.

        Args:
            cfgOb (object): configuration object supplying the stash options
            configName (str): configuration section name
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to True.
            **kwargs: logged only

        Returns:
            (object): the current value of the stored provider attribute
        """
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        if self.__glyP:
            return self.__glyP
        #
        try:
            fallbackMinCount = 0
            stashUser = cfgOb.get("_STASH_AUTH_USERNAME", sectionName=configName)
            stashPassword = cfgOb.get("_STASH_AUTH_PASSWORD", sectionName=configName)
            stashBasePath = cfgOb.get("_STASH_SERVER_BASE_PATH", sectionName=configName)
            primaryUrl = cfgOb.get("STASH_SERVER_URL", sectionName=configName)
            fallbackUrl = cfgOb.get("STASH_SERVER_FALLBACK_URL", sectionName=configName)
            #
            provider = GlycanProvider(cachePath=cachePath, useCache=useCache)
            provider.fromStash(primaryUrl, stashBasePath, userName=stashUser, password=stashPassword)
            provider.reload()
            if not provider.testCache(minCount=10):
                # Primary source did not pass the cache test - try the fallback server.
                provider.fromStash(fallbackUrl, stashBasePath, userName=stashUser, password=stashPassword)
                provider.testCache(minCount=fallbackMinCount)
            #
            if provider:
                self.__glyP = provider
                identifiers = provider.getIdentifiers()
                logger.info("Fetched glycan mapping dictionary (%d)", len(identifiers))
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return self.__glyP

    def __fetchGlyGenProvider(
        self, cfgOb, configName, cachePath, useCache=True, **kwargs
    ):
        """Return the GlyGenProvider held by this object, creating it on first use.

        Args:
            cfgOb (object): configuration object (unused here)
            configName (str): configuration section name (only logged)
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to True.
            **kwargs: extra keyword arguments forwarded to GlyGenProvider

        Returns:
            (object): the GlyGenProvider instance stored on this object
        """
        _ = cfgOb
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        if self.__ggP:
            return self.__ggP
        self.__ggP = GlyGenProvider(cachePath=cachePath, useCache=useCache, **kwargs)
        return self.__ggP

    def __fetchPfamProvider(
        self, cfgOb, configName, cachePath, useCache=True, **kwargs
    ):
        """Return the PfamProvider held by this object, creating it on first use.

        Args:
            cfgOb (object): configuration object (unused here)
            configName (str): configuration section name (only logged)
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to True.
            **kwargs: extra keyword arguments forwarded to PfamProvider

        Returns:
            (object): the PfamProvider instance stored on this object
        """
        _ = cfgOb
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        if self.__pfP:
            return self.__pfP
        self.__pfP = PfamProvider(cachePath=cachePath, useCache=useCache, **kwargs)
        return self.__pfP

    def __fetchBirdProvider(
        self, cfgOb, configName, cachePath, useCache=True, **kwargs
    ):
        """Return the BirdProvider held by this object, creating it on first use.

        Args:
            cfgOb (object): configuration object (unused here)
            configName (str): configuration section name (only logged)
            cachePath (str): cache path passed to the provider
            useCache (bool, optional): passed to the provider. Defaults to True.
            **kwargs: extra keyword arguments forwarded to BirdProvider

        Returns:
            (object): the BirdProvider instance stored on this object
        """
        _ = cfgOb
        logger.debug("configName %s cachePath %s kwargs %r", configName, cachePath, kwargs)
        if self.__birdP:
            return self.__birdP
        self.__birdP = BirdProvider(cachePath=cachePath, useCache=useCache, **kwargs)
        return self.__birdP
Ejemplo n.º 10
0
class SchemaSearchContextsTests(unittest.TestCase):
    """Tests of the search group and search context definitions exposed by DocumentDefinitionHelper."""

    # Condition handed to the @unittest.skipIf decorators in this class; True skips those tests.
    skipFlag = True

    def setUp(self):
        """Create the configuration, document helper and dictionary provider used by the tests.

        Also records the start time that tearDown() uses to report elapsed time.
        """
        self.__verbose = True
        sectionName = "site_info_configuration"
        configPath = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__cfgOb = ConfigUtil(
            configPath=configPath,
            defaultSectionName=sectionName,
            mockTopPath=mockDataPath,
        )
        self.__docHelper = DocumentDefinitionHelper(cfgOb=self.__cfgOb)
        # Dictionary file locators taken from the configuration section
        locate = self.__cfgOb.getPath
        self.__pathPdbxDictionaryFile = locate("PDBX_DICT_LOCATOR", sectionName=sectionName)
        self.__pathRcsbDictionaryFile = locate("RCSB_DICT_LOCATOR", sectionName=sectionName)
        self.__pathVrptDictionaryFile = locate("VRPT_DICT_LOCATOR", sectionName=sectionName)
        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dP = DictionaryApiProviderWrapper(
            self.__cachePath,
            cfgob=self.__cfgOb,
            configName=sectionName,
            useCache=True,
        )
        #
        self.__startTime = time.time()
        startStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), startStamp)

    def tearDown(self):
        """Log the completion time and the seconds elapsed since setUp()."""
        elapsedSeconds = time.time() - self.__startTime
        endStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), endStamp, elapsedSeconds)

    def testSearchGroups(self):
        """Assert that the document helper's search group check succeeds."""
        self.assertTrue(self.__docHelper.checkSearchGroups())

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testUnUsedIndexedItems(self):
        """Enumerate items that are indexed but have no search group assignments.

        Logs every attribute that has a "full-text" search context but is neither
        assigned to a search group nor among the materialized nested searchables.

        collection_attribute_search_contexts
        """
        searchGroupL = self.__docHelper.getSearchGroups()
        logger.info("Search groups (%d)", len(searchGroupL))
        # Searchable items materialized from nested category and subcategory contexts
        nestedD = self.__assembleNestedCategorySearchables()
        nestedD.update(self.__assembleNestedSubCategorySearchables())
        # First element of every search context entry, keyed by (category, attribute)
        allContextD = self.__docHelper.getAllAttributeSearchContexts()
        contextNamesD = {
            (catName, atName): [entry[0] for entry in contextEntryL]
            for (catName, atName), contextEntryL in allContextD.items()
        }
        logger.info("search context attribContextD %d", len(contextNamesD))
        # Search group names keyed by (category, attribute)
        groupsByAttribD = {}
        for groupName in searchGroupL:
            for catName, atName in self.__docHelper.getSearchGroupAttributes(groupName):
                groupsByAttribD.setdefault((catName, atName), []).append(groupName)
        #
        logger.info("Search group lookup len %d", len(groupsByAttribD))
        for attribKey, contextNameL in contextNamesD.items():
            if "full-text" not in contextNameL:
                continue
            if attribKey in groupsByAttribD or attribKey in nestedD:
                continue
            logger.info("%s.%s contexL %r", attribKey[0], attribKey[1], contextNameL)
        #
        return True

    @unittest.skipIf(skipFlag, "Troubleshooting test")
    def testExpandSearchGroups(self):
        """Expand search groups and metadata content as these would be displayed in the RCSB search menu.

        For every attribute of every search group this logs one line per
        description: each materialized nested description when the attribute is
        among the nested searchables, otherwise the attribute's brief description.
        Each line also carries the attribute name, any units and the names of the
        attribute's search contexts.  Missing search contexts and missing brief
        descriptions are reported as warnings; a missing search context list is
        treated as empty.

        Returns:
            (bool): True on completion
        """
        _, afD = self.__getContentFeatures()
        groupNameList = self.__docHelper.getSearchGroups()
        logger.info("Search groups (%d)", len(groupNameList))
        #
        nestedSearchableD = self.__assembleNestedCategorySearchables()
        nestedSearchableD.update(self.__assembleNestedSubCategorySearchables())
        #
        for groupName in groupNameList:
            # get attributes in group
            attributeTupList = self.__docHelper.getSearchGroupAttributes(groupName)
            logger.info("")
            logger.info("%s (%2d):", groupName, len(attributeTupList))
            # Get search context and brief descriptions -
            for catName, atName in attributeTupList:
                searchContextTupL = self.__docHelper.getSearchContexts(catName, atName)
                if not searchContextTupL:
                    logger.warning("Missing search context for %s.%s", catName, atName)
                    # Normalize a missing (e.g. None) result so the join below cannot fail.
                    searchContextTupL = []
                descriptionText = self.__docHelper.getAttributeDescription(catName, atName, contextType="brief")
                if not descriptionText:
                    logger.warning("Missing brief description %s.%s", catName, atName)
                #
                fD = afD.get(catName, {}).get(atName, {})
                logger.debug("%s %s fD %r", catName, atName, fD)
                units = fD.get("UNITS")
                uS = "(units=%s)" % units if units else ""
                nS = "(%s.%s)" % (catName, atName)
                # The context names do not depend on the description, so build them once.
                contextS = ",".join([tup[0] for tup in searchContextTupL])
                # Nested searchables replace the brief description with their own list
                # (an empty list therefore logs nothing for this attribute).
                descriptionL = nestedSearchableD.get((catName, atName), [descriptionText])
                for dS in descriptionL:
                    logger.info("- %-55s: %s %s (%s)", dS, nS, uS, contextS)

        return True

    def __assembleNestedCategorySearchables(self):
        """Assemble dictionary of searchable items in nested categories.

        Categories are taken from the attributes assigned to search groups.  For
        each such category with nested contexts, and for each nested context that
        has a FIRST_CONTEXT_PATH, description strings are materialized as
        "<enum label> <brief description>":

        * If the context defines CONTEXT_ATTRIBUTE_VALUES, one string is produced
          for each SEARCH_PATHS entry and each ATTRIBUTES entry of every context
          value.  The label is the annotated enum "name" (or "detail" when no
          name exists) of the context value.
        * Otherwise every annotated enum of the context path attribute is combined
          with every searchable attribute of the category, using the enum
          "detail" as the label.

        Missing feature data, enum annotations, enum labels, brief descriptions,
        search paths, attributes or examples do not raise: an absent list is
        treated as empty and an absent label or description is left out of the
        materialized string.

        Returns:
            (dict): {(category, atName): ["Materialized brief description", ... ]}
        """

        def joinText(label, description):
            # Plain concatenation when both parts exist; otherwise fall back to
            # whichever part is available instead of failing on None.
            if label is None or description is None:
                return label or description or ""
            return label + " " + description

        def splitSearchPath(searchPath):
            # For a path with more than one ".", replace the last "." by "_";
            # then return the first two "."-separated components and the
            # (possibly rewritten) path.
            if searchPath.count(".") > 1:
                k = searchPath.rfind(".")
                searchPath = searchPath[:k] + "_" + searchPath[k + 1:]
            pathL = searchPath.split(".")
            return pathL[0], pathL[1], searchPath

        # cfD, afD = self.__getContentFeatures()
        _, afD = self.__getContentFeatures()
        logger.info("")
        searchableCategoryD = {}
        groupNameList = self.__docHelper.getSearchGroups()
        logger.debug("Search group count (%d)", len(groupNameList))
        for groupName in groupNameList:
            # get attributes in group
            for catName, atName in self.__docHelper.getSearchGroupAttributes(groupName):
                searchableCategoryD.setdefault(catName, []).append(atName)
        logger.debug("Searchable category count (%d)", len(searchableCategoryD))
        #
        retD = {}
        for catName, searchableAtNameL in searchableCategoryD.items():
            nestedContextDL = self.__docHelper.getNestedContexts(catName)
            if not nestedContextDL:
                # not nested skip
                continue
            if len(nestedContextDL) > 1:
                logger.warning("Multiple nested contexts for category %s", catName)
            #
            for nestedContextD in nestedContextDL:
                contextPath = nestedContextD.get("FIRST_CONTEXT_PATH")
                if not contextPath:
                    logger.warning("Missing context path for nested category %s", catName)
                    continue
                #
                contextName = nestedContextD["CONTEXT_NAME"]
                cpPathL = contextPath.split(".")
                cpCatName, cpAtName = cpPathL[0], cpPathL[1]
                nestedPathSearchContext = self.__docHelper.getSearchContexts(cpCatName, cpAtName)
                logger.debug("Nested (%r) context path for %r %r", contextName, cpCatName, cpAtName)
                if not nestedPathSearchContext:
                    logger.warning("Missing nested (%r) search context for %r %r", contextName, cpCatName, cpAtName)
                #
                nfD = afD.get(cpCatName, {}).get(cpAtName, {})
                logger.debug("FeatureD %r", nfD)
                # --
                # An absent feature dictionary or enum annotation is reported below
                # rather than raising KeyError here.
                enumDL = nfD.get("ENUMS_ANNOTATED") or []
                enumMapD = {}
                if not enumDL:
                    logger.warning("Missing nested enums %s.%s", cpCatName, cpAtName)
                else:
                    logger.debug("All context enums count %d", len(enumDL))
                    for enumD in enumDL:
                        logger.info("%s.%s enumD %r", cpCatName, cpAtName, enumD)
                        if "name" not in enumD:
                            logger.warning("Missing nested enum (name) for %s.%s", cpCatName, cpAtName)
                    #
                    enumMapD = {
                        enumD["value"]: enumD["name"] if "name" in enumD else enumD.get("detail")
                        for enumD in enumDL
                    }
                # --
                nestedDescriptionText = self.__docHelper.getAttributeDescription(cpCatName, cpAtName, contextType="brief")
                if not nestedDescriptionText:
                    logger.warning("Missing brief nested description %s.%s", cpCatName, cpAtName)
                else:
                    logger.debug("Nested context description: %r", nestedDescriptionText)
                # --
                cvDL = nestedContextD.get("CONTEXT_ATTRIBUTE_VALUES", [])
                if not cvDL:
                    logger.warning("Missing context attribute values for %s", catName)
                    # if no context values defined then use: all enums x searchable attributes in this category
                    #
                    # Template:  enum detail + search attribute brief description text
                    for enumD in enumDL:
                        for atName in searchableAtNameL:
                            briefDescr = self.__docHelper.getAttributeDescription(catName, atName, contextType="brief")
                            tS = joinText(enumD.get("detail"), briefDescr)
                            retD.setdefault((catName, atName), []).append(tS)
                    continue
                #
                # Only use context values from the full enum list with specified search paths.
                #
                # Template:  context value (enum label) + search path attribute (brief description text)
                for cvD in cvDL:
                    enumV = cvD["CONTEXT_VALUE"]
                    enumDetail = enumMapD.get(enumV)
                    if not enumDetail:
                        logger.warning("%s %s missing detail for enum value %s", catName, cpAtName, enumV)
                    for searchPath in cvD.get("SEARCH_PATHS") or []:
                        cnS, anS, _ = splitSearchPath(searchPath)
                        briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                        tS = joinText(enumDetail, briefDescr)
                        logger.debug("%s,%s tS %r", cnS, anS, tS)
                        retD.setdefault((cnS, anS), []).append(tS)
                    for aD in cvD.get("ATTRIBUTES") or []:
                        cnS, anS, sp = splitSearchPath(aD["PATH"])
                        briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                        tS = joinText(enumDetail, briefDescr)
                        logger.debug("%s,%s tS %r", cnS, anS, tS)
                        retD.setdefault((cnS, anS), []).append(tS)
                        # Examples are only reported; a missing entry logs as None.
                        logger.info("%s,%s sp %r examplesL %r", cnS, anS, sp, aD.get("EXAMPLES"))
        #
        for k, vL in retD.items():
            for v in vL:
                logger.debug("%s : %r", k, v)
        #
        return retD

    def __assembleNestedSubCategorySearchables(self):
        """Assemble dictionary of searchable items in nested subcategories.

        Categories are taken from the attributes assigned to search groups.  Only
        categories with a nested subcategory context that has a
        FIRST_CONTEXT_PATH are processed; a single (subcategory, context) pair is
        kept per category, so a later specification for the same category
        replaces an earlier one and a warning is logged.

        Description strings are materialized as "<enum label> <brief description>":

        * If the context defines CONTEXT_ATTRIBUTE_VALUES, one string is produced
          for each SEARCH_PATHS entry and each ATTRIBUTES entry of every context
          value.  The label is the annotated enum "name" (or "detail" when no
          name exists) of the context value.
        * Otherwise every annotated enum of the context path attribute is combined
          with every searchable attribute of the category whose SUB_CATEGORIES
          feature includes the nested subcategory, using the enum "detail" as
          the label.

        Missing feature data, enum annotations, enum labels, brief descriptions,
        search paths, attributes or examples do not raise: an absent list is
        treated as empty and an absent label or description is left out of the
        materialized string.

        Returns:
            (dict): {(category, atName): ["Materialized brief description", ... ]}
        """

        def joinText(label, description):
            # Plain concatenation when both parts exist; otherwise fall back to
            # whichever part is available instead of failing on None.
            if label is None or description is None:
                return label or description or ""
            return label + " " + description

        def splitSearchPath(searchPath):
            # For a path with more than one ".", replace the last "." by "_";
            # then return the first two "."-separated components and the
            # (possibly rewritten) path.
            if searchPath.count(".") > 1:
                k = searchPath.rfind(".")
                searchPath = searchPath[:k] + "_" + searchPath[k + 1:]
            pathL = searchPath.split(".")
            return pathL[0], pathL[1], searchPath

        _, afD = self.__getContentFeatures()
        # logger.info("")
        searchableCategoryD = {}
        groupNameList = self.__docHelper.getSearchGroups()
        logger.debug("Search group count (%d)", len(groupNameList))
        for groupName in groupNameList:
            # get attributes in group
            for catName, atName in self.__docHelper.getSearchGroupAttributes(groupName):
                searchableCategoryD.setdefault(catName, []).append(atName)
        logger.debug("Searchable category count (%d)", len(searchableCategoryD))
        #
        #  cat = (subcat, {nested context dict})
        subcatNestedD = {}
        tD = self.__docHelper.getAllSubCategoryNestedContexts()
        for k, v in tD.items():
            for kk, vv in v.items():
                # The lookup is keyed by kk[0], so the duplicate test must use
                # that same key to ever match.
                if kk[0] in subcatNestedD:
                    logger.warning("Duplicate nested subcategory specifications in %r %r", k, kk)
                # only take cases with a context path ...
                if "FIRST_CONTEXT_PATH" in vv:
                    subcatNestedD[kk[0]] = (kk[1], vv)
        #
        retD = {}
        for catName, searchableAtNameL in searchableCategoryD.items():
            if catName not in subcatNestedD:
                continue
            subCatName, nestedContextD = subcatNestedD[catName]
            #
            contextPath = nestedContextD.get("FIRST_CONTEXT_PATH")
            if not contextPath:
                logger.warning("Missing context path for nested category %s", catName)
                continue
            #
            cpCatName, cpAtName, contextPath = splitSearchPath(contextPath)
            logger.debug("%s subcategory %s context path %r", catName, subCatName, contextPath)
            contextName = nestedContextD["CONTEXT_NAME"]
            nestedPathSearchContext = self.__docHelper.getSearchContexts(cpCatName, cpAtName)
            logger.debug("Nested (%r) context path for %r %r", contextName, cpCatName, cpAtName)
            if not nestedPathSearchContext:
                logger.warning("Missing nested (%r) search context for %r %r", contextName, cpCatName, cpAtName)
            #
            nfD = afD.get(cpCatName, {}).get(cpAtName, {})
            logger.debug("FeatureD %r", nfD)
            # --
            # An absent feature dictionary or enum annotation is reported below
            # rather than raising KeyError here.
            enumDL = nfD.get("ENUMS_ANNOTATED") or []
            enumMapD = {}
            if not enumDL:
                logger.warning("Missing nested enums %s.%s", cpCatName, cpAtName)
            else:
                logger.debug("All context enums count %d", len(enumDL))
                for enumD in enumDL:
                    if "name" not in enumD:
                        logger.warning("Missing nested enum (name) for %s.%s", cpCatName, cpAtName)
                #
                enumMapD = {
                    enumD["value"]: enumD["name"] if "name" in enumD else enumD.get("detail")
                    for enumD in enumDL
                }
            # --
            nestedDescriptionText = self.__docHelper.getAttributeDescription(cpCatName, cpAtName, contextType="brief")
            if not nestedDescriptionText:
                logger.warning("Missing brief nested description %s.%s", cpCatName, cpAtName)
            else:
                logger.debug("Nested context description: %r", nestedDescriptionText)
            # --
            cvDL = nestedContextD.get("CONTEXT_ATTRIBUTE_VALUES", [])
            #
            if not cvDL:
                logger.warning("Missing context attribute values for %s", catName)
                # if no context values defined then use: all enums x searchable attributes in this category
                #
                # Template:  enum detail + search attribute brief description text
                for enumD in enumDL:
                    for atName in searchableAtNameL:
                        nnfD = afD.get(catName, {}).get(atName, {})
                        subCatL = [d["id"] for d in nnfD["SUB_CATEGORIES"]] if "SUB_CATEGORIES" in nnfD else None
                        logger.debug("%s.%s %s subCatL %r", catName, atName, subCatName, subCatL)
                        if subCatL and subCatName in subCatL:
                            briefDescr = self.__docHelper.getAttributeDescription(catName, atName, contextType="brief")
                            tS = joinText(enumD.get("detail"), briefDescr)
                            retD.setdefault((catName, atName), []).append(tS)
                continue
            #
            # Only use context values from the full enum list with specified search paths.
            #
            # Template:  context value (enum label) + search path attribute (brief description text)
            for cvD in cvDL:
                enumV = cvD["CONTEXT_VALUE"]
                enumDetail = enumMapD.get(enumV)
                if not enumDetail:
                    logger.warning("%s %s missing detail for enum value %s", catName, cpAtName, enumV)
                for searchPath in cvD.get("SEARCH_PATHS") or []:
                    cnS, anS, _ = splitSearchPath(searchPath)
                    briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                    retD.setdefault((cnS, anS), []).append(joinText(enumDetail, briefDescr))
                for aD in cvD.get("ATTRIBUTES") or []:
                    cnS, anS, sp = splitSearchPath(aD["PATH"])
                    briefDescr = self.__docHelper.getAttributeDescription(cnS, anS, contextType="brief")
                    retD.setdefault((cnS, anS), []).append(joinText(enumDetail, briefDescr))
                    # Examples are only reported; a missing entry logs as None.
                    logger.debug("%s,%s sp %r exL %r", cnS, anS, sp, aD.get("EXAMPLES"))
        #
        for k, vL in retD.items():
            for v in vL:
                logger.debug("%s : %r", k, v)
        #
        return retD

    def __getContentFeatures(self):
        """Collect category and attribute features for the "pdbx_core" content definition.

        The dictionary API is obtained from the PDBx and RCSB dictionary locators and
        wrapped in a ContentDefinition configured with a ContentDefinitionHelper.

        Returns:
            tuple: (category features, attribute features), each a dict keyed by category
                   name, or (None, None) when any step raises (the exception is logged).
        """
        categoryFeatures = {}
        attributeFeatures = {}
        try:
            helper = ContentDefinitionHelper(cfgOb=self.__cfgOb)
            locators = [self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile]
            api = self.__dP.getApiByLocators(dictLocators=locators)
            contentDef = ContentDefinition(api, databaseName="pdbx_core", contentDefHelper=helper)
            # Query both feature sets category by category, in the order reported by the definition.
            for name in contentDef.getCategories():
                categoryFeatures[name] = contentDef.getCategoryFeatures(name)
                attributeFeatures[name] = contentDef.getAttributeFeatures(name)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            return None, None
        return categoryFeatures, attributeFeatures
Ejemplo n.º 11
0
    def __fullSchemaDataPrep(self,
                             contentType,
                             filterType,
                             styleType,
                             mockLength,
                             rejectLength=0,
                             dataSelectors=None,
                             mergeContentTypes=None,
                             excludeExtras=None):
        """Prepare documents for every collection of a content type, then optionally export and diff them.

        The input containers are first run through the dictionary methods, then converted to
        documents per collection. Depending on the export/diff flags held by the test case, the
        documents are written to the output path and/or compared with previously saved output.

        Args:
            contentType (str): content type name (also used as the schema database name)
            filterType (str): data processing options handed to the transform factory and document processor
            styleType (str): organization of the output documents (also part of the output file name)
            mockLength (int): expected length of the test data for the content type (not used in this method)
            rejectLength (int, optional): number of rejected input data sets (not used in this method). Defaults to 0.
            dataSelectors (list of str, optional): data selection criteria passed to the document processor. Defaults to None.
            mergeContentTypes (list of str, optional): content types merged with the input data set; joined with "-"
                                                       to form part of the output file name. Defaults to None.
            excludeExtras (list, optional): exclusions passed to the document filter applied before
                                            export/diff. Defaults to None (treated as an empty list).

        Any exception (including a failed assertion) is logged and reported through self.fail().
        """
        try:
            excludeExtras = excludeExtras or []
            # These two arguments are accepted but intentionally unused here.
            _ = (mockLength, rejectLength)
            schemaDefD = self.__schP.makeSchemaDef(contentType, dataTyping="ANY", saveSchema=True)
            # The access object is constructed from the fresh schema definition and then discarded.
            SchemaDefAccess(schemaDefD)
            locatorObjList = self.__rpP.getLocatorObjList(contentType=contentType, mergeContentTypes=mergeContentTypes)
            sd, _, collectionNameList, _ = self.__schP.getSchemaInfo(databaseName=contentType, dataTyping="ANY")
            #
            apiProvider = DictionaryApiProviderWrapper(
                self.__cachePath,
                cfgOb=self.__cfgOb,
                configName=self.__configName,
                useCache=True,
            )
            dictApi = apiProvider.getApiByName(contentType)
            #
            resourceProvider = DictMethodResourceProvider(
                self.__cfgOb,
                configName=self.__configName,
                cachePath=self.__cachePath,
                restoreUseStash=False,
                restoreUseGit=True,
                providerTypeExclude=self.__excludeType,
            )
            methodRunner = DictMethodRunner(dictApi, modulePathMap=self.__modulePathMap, resourceProvider=resourceProvider)
            #
            transformFactory = DataTransformFactory(schemaDefAccessObj=sd, filterType=filterType)
            dataPrep = SchemaDefDataPrep(
                schemaDefAccessObj=sd,
                dtObj=transformFactory,
                workPath=self.__cachePath,
                verbose=self.__verbose,
            )
            containerList = self.__rpP.getContainerList(locatorObjList)
            # Apply the dictionary methods to each container before any documents are built.
            for container in containerList:
                logger.debug("Processing container %s", container.getName())
                methodRunner.apply(container)
            #
            for collectionName in collectionNameList:
                excludedIds = sd.getCollectionExcluded(collectionName)
                selectedIds = sd.getCollectionSelected(collectionName)
                sliceFilter = sd.getCollectionSliceFilter(collectionName)
                dataPrep.setSchemaIdExcludeList(excludedIds)
                dataPrep.setSchemaIdIncludeList(selectedIds)
                #
                docList, _, _ = dataPrep.processDocuments(
                    containerList,
                    styleType=styleType,
                    sliceFilter=sliceFilter,
                    filterType=filterType,
                    dataSelectors=dataSelectors,
                    collectionName=collectionName,
                )
                docList = dataPrep.addDocumentPrivateAttributes(docList, collectionName)
                docList = dataPrep.addDocumentSubCategoryAggregates(docList, collectionName)

                # Special exclusions for the test harness. (removes timestamped data items to allow diffs.)
                self.__filterDocuments(docList, excludeExtras)
                if mergeContentTypes:
                    mergeLabel = "-".join(mergeContentTypes)
                else:
                    mergeLabel = ""
                fName = "full-prep-%s-%s-%s-%s.json" % (contentType, collectionName, mergeLabel, styleType)
                if self.__exportFlag:
                    self.__logDocumentOrder(docList)
                    exportPath = os.path.join(self.__outputPath, fName)
                    self.__mU.doExport(exportPath, docList, fmt="json", indent=3)
                    logger.debug("Exported %r", exportPath)
                #
                if not self.__diffFlag:
                    continue
                # Compare against the saved reference output stored under the same file name.
                refPath = os.path.join(self.__savedOutputPath, fName)
                refDocList = self.__mU.doImport(refPath, fmt="json")
                self.assertEqual(len(refDocList), len(docList))
                logger.debug("For %s %s len refDocList %d", contentType, collectionName, len(refDocList))
                logger.debug("For %s %s len docList %d", contentType, collectionName, len(docList))
                delta = diff(refDocList, docList, syntax="explicit", marshal=True)
                if delta:
                    # Save the differences next to the regular output as <reference base name>-diff.json.
                    baseName = os.path.splitext(os.path.basename(refPath))[0]
                    diffPath = os.path.join(self.__outputPath, baseName + "-diff.json")
                    logger.debug("jsondiff for %s %s = \n%s", contentType, collectionName, pprint.pformat(delta, indent=3, width=100))
                    self.__mU.doExport(diffPath, delta, fmt="json", indent=3)
                self.assertEqual(len(delta), 0)

        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()
Ejemplo n.º 12
0
class ContentDefinitionTests(unittest.TestCase):
    """Tests of ContentDefinition built from the PDBX, RCSB and VRPT dictionary locators."""

    def setUp(self):
        """Read the example configuration, resolve the dictionary locators and create the API provider."""
        self.__verbose = True
        self.__configName = "site_info_configuration"
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config", "exdb-config-example.yml")
        self.__cfgOb = ConfigUtil(
            configPath=self.__pathConfig,
            defaultSectionName=self.__configName,
            mockTopPath=self.__mockTopPath,
        )
        # Dictionary locators taken from the configuration section named above.
        self.__pathPdbxDictionaryFile = self.__cfgOb.getPath(
            "PDBX_DICT_LOCATOR", sectionName=self.__configName
        )
        self.__pathRcsbDictionaryFile = self.__cfgOb.getPath(
            "RCSB_DICT_LOCATOR", sectionName=self.__configName
        )
        self.__pathVrptDictionaryFile = self.__cfgOb.getPath(
            "VRPT_DICT_LOCATOR", sectionName=self.__configName
        )

        self.__mU = MarshalUtil()
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__dP = DictionaryApiProviderWrapper(
            self.__cachePath,
            cfgOb=self.__cfgOb,
            configName=self.__configName,
            useCache=True,
        )
        self.__startTime = time.time()
        logger.debug(
            "Starting %s at %s",
            self.id(),
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
        )

    def tearDown(self):
        """Log the completion time and the elapsed time since setUp."""
        elapsed = time.time() - self.__startTime
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), timeStamp, elapsed)

    def __collectFeatures(self, dictLocators, databaseName):
        """Build a helper-backed content definition and fetch the features of every category.

        Args:
            dictLocators (list): dictionary locators handed to the dictionary API provider
            databaseName (str): database name handed to ContentDefinition

        Returns:
            tuple: (category name list, category features dict, attribute features dict)
        """
        helper = ContentDefinitionHelper(cfgOb=self.__cfgOb)
        api = self.__dP.getApiByLocators(dictLocators=dictLocators)
        contentDef = ContentDefinition(api, databaseName=databaseName, contentDefHelper=helper)
        names = contentDef.getCategories()
        categoryFeatures = {}
        attributeFeatures = {}
        for name in names:
            categoryFeatures[name] = contentDef.getCategoryFeatures(name)
            attributeFeatures[name] = contentDef.getAttributeFeatures(name)
        return names, categoryFeatures, attributeFeatures

    def testDefaults(self):
        """Test the default case of using only dictionary content (PDBX dictionary, no helper)."""
        try:
            api = self.__dP.getApiByLocators(dictLocators=[self.__pathPdbxDictionaryFile])
            self.assertTrue(api.testCache())
            schemaNames = ContentDefinition(api).getSchemaNames()
            logger.debug("schema name length %d", len(schemaNames))
            self.assertGreaterEqual(len(schemaNames), 600)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()

    def testHelper(self):
        """Test PDBX dictionary content supplemented by the helper for the "chem_comp" database."""
        try:
            catNameL, _, _ = self.__collectFeatures([self.__pathPdbxDictionaryFile], "chem_comp")
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 600)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()

    def testExtensionWithHelper(self):
        """Test PDBX + RCSB dictionary content supplemented by the helper for the "pdbx_core" database."""
        try:
            locators = [self.__pathPdbxDictionaryFile, self.__pathRcsbDictionaryFile]
            catNameL, _, _ = self.__collectFeatures(locators, "pdbx_core")
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 650)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()

    def testRepoWithHelper(self):
        """Test PDBX + RCSB + VRPT dictionary content with the helper for the "repository_holdings" auxiliary schema."""
        try:
            locators = [
                self.__pathPdbxDictionaryFile,
                self.__pathRcsbDictionaryFile,
                self.__pathVrptDictionaryFile,
            ]
            catNameL, _, _ = self.__collectFeatures(locators, "repository_holdings")
            logger.debug("Dictionary category name length %d", len(catNameL))
            self.assertGreaterEqual(len(catNameL), 680)
        except Exception as err:
            logger.exception("Failing with %s", str(err))
            self.fail()