Example #1
    def updateDefaultDataTypeMap(self, filePath, mapD, dataTyping="ANY"):
        """Update data file containing application default data type mapping with any
         updates from the input type mapping dictionary

        mapD['cif_type_code'] -> ['application_name', 'app_type_code', 'app_precision_default', 'app_width_default', 'type_code']

               data_rcsb_data_type_map
                 loop_
                 _pdbx_data_type_application_map.application_name
                 _pdbx_data_type_application_map.type_code
                 _pdbx_data_type_application_map.app_type_code
                 _pdbx_data_type_application_map.app_precision_default
                 _pdbx_data_type_application_map.app_width_default
                 # .... type mapping data ...
        """
        try:
            #
            # Start from a deep copy of the input mapping; matching rows already in the file are merged in below
            mD = copy.deepcopy(mapD)
            mU = MarshalUtil(workPath=self.__workPath)
            containerList = mU.doImport(filePath,
                                        fmt="mmcif",
                                        enforceAscii=True,
                                        useCharRefs=True,
                                        raiseExceptions=True)
            for container in containerList:
                if container.getName() == "rcsb_data_type_map":
                    catObj = container.getObj("pdbx_data_type_application_map")
                    rIL = []
                    for ii in range(catObj.getRowCount()):
                        dD = catObj.getRowAttributeDict(ii)
                        if dD["application_name"] == dataTyping:
                            rIL.append(ii)
                            mD[dD["type_code"]] = {
                                k: dD[k]
                                for k in [
                                    "application_name", "app_type_code",
                                    "app_precision_default",
                                    "app_width_default", "type_code"
                                ]
                            }
                    # Remove the matching rows captured above, then re-append rows from the merged mapping
                    catObj.removeRows(rIL)
                    atNameL = catObj.getAttributeList()
                    for ky in mD:
                        row = [mD[ky][atN] for atN in atNameL]
                        catObj.append(row)
            #
            # Write the updated data file
            ok = mU.doExport(filePath,
                             containerList,
                             fmt="mmcif",
                             enforceAscii=True,
                             useCharRefs=True,
                             raiseExceptions=True)

            return ok
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return False
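A runnable sketch of the merge order implemented above, using plain dicts in place of the mmCIF category rows (row values are illustrative): rows already present in the file replace entries from the input mapping before the merged set is re-appended.

import copy

# Stand-ins for the pdbx_data_type_application_map rows read from the file
fileRows = [
    {"application_name": "ANY", "type_code": "int", "app_type_code": "INT",
     "app_precision_default": "0", "app_width_default": "10"},
]
# Input type mapping dictionary, keyed by cif_type_code
mapD = {
    "float": {"application_name": "ANY", "type_code": "float", "app_type_code": "FLOAT",
              "app_precision_default": "6", "app_width_default": "10"},
}
mD = copy.deepcopy(mapD)
for dD in fileRows:
    if dD["application_name"] == "ANY":
        mD[dD["type_code"]] = dD  # a row from the file wins over the input mapping
print(sorted(mD))  # ['float', 'int']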
Example #2
    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.and

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

        Returns:
            dict: Schema object

        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj
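A hedged usage sketch for getJsonSchema(); the SchemaProvider construction mirrors a setUp() example later in this section, and cfgOb/cachePath must be supplied by the caller.

schP = SchemaProvider(cfgOb, cachePath, useCache=True)
sObj = schP.getJsonSchema("pdbx_core", "pdbx_core_polymer_entity", encodingType="BSON", level="full")
if sObj:
    print(sorted(sObj.keys()))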
 def exportProteinSequenceDetails(self, filePath, fmt="json", minSeqLen=0):
     """Export protein sequence and taxonomy data (required to build protein sequence fasta file)"""
     rD, missingSrcD = self.getProteinSequenceDetails(minSeqLen=minSeqLen)
     # ----
     mU = MarshalUtil()
     ok1 = mU.doExport(filePath, rD, fmt=fmt, indent=3)
     #
     pth, _ = os.path.split(filePath)
     ok2 = mU.doExport(os.path.join(pth, "missingSrcNames.json"),
                       missingSrcD,
                       fmt="json")
     logger.info(
         "Exporting (%d) protein sequence records with missing source count (%d) status %r",
         len(rD), len(missingSrcD), ok1 and ok2)
 def __init__(
     self,
     cfgOb,
     databaseName="pdbx_core",
     collectionName="pdbx_core_polymer_entity",
     polymerType="Protein",
     referenceDatabaseName="UniProt",
     provSource="PDB",
     maxChunkSize=100,
     fetchLimit=None,
     **kwargs
 ):
     self.__cfgOb = cfgOb
     self.__polymerType = polymerType
     self.__mU = MarshalUtil()
     #
     self.__maxChunkSize = maxChunkSize
     self.__statusList = []
     #
     self.__pfP = self.__fetchPfamProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__ipP = self.__fetchInterProProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__ssP = self.__fetchSiftsSummaryProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__goP = self.__fetchGoProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__ecP = self.__fetchEcProvider(self.__cfgOb, self.__cfgOb.getDefaultSectionName(), **kwargs)
     self.__refIdMapD, self.__matchD, self.__refD = self.__reload(databaseName, collectionName, polymerType, referenceDatabaseName, provSource, fetchLimit, **kwargs)
 def __rebuildCache(self, **kwargs):
     mU = MarshalUtil()
     # source directory path
     srcDirPath = kwargs.get("srcDirPath", None)
     # cache details
     cacheKwargs = kwargs.get("cacheKwargs", {"fmt": "pickle"})
     useCache = kwargs.get("useCache", True)
     entrySaveLimit = kwargs.get("entrySaveLimit", None)
     abbreviated = str(kwargs.get("abbreviated", "TEST")).upper()
     #
     # cacheDirPath = kwargs.get("cacheDirPath", None)
     cacheDirPath = self.__cacheDirPath
     pyVersion = sys.version_info[0]
     ext = "pic" if cacheKwargs["fmt"] == "pickle" else "json"
     saveFilePath = os.path.join(cacheDirPath, "sifts-summary-py%s.%s" % (str(pyVersion), ext))
     #
     ssD = {}
     try:
         if useCache and os.access(saveFilePath, os.R_OK):
             ssD = mU.doImport(saveFilePath, **cacheKwargs)
         else:
             if not srcDirPath:
                 logger.error("Missing SIFTS source path details")
                 return ssD
             ssD = self.__getSummaryMapping(srcDirPath, abbreviated=abbreviated)
             if entrySaveLimit:
                 ssD = {k: ssD[k] for k in list(ssD.keys())[:entrySaveLimit]}
             mU.mkdir(cacheDirPath)
             ok = mU.doExport(saveFilePath, ssD, **cacheKwargs)
             logger.debug("Saving SIFTS summary serialized data file %s (%d) status %r", saveFilePath, len(ssD), ok)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     return ssD
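The cache-or-rebuild pattern above reduces to this minimal, runnable sketch; the file name and payload are illustrative, and MarshalUtil is imported from rcsb.utils.io as elsewhere in these examples.

import os
from rcsb.utils.io.MarshalUtil import MarshalUtil

mU = MarshalUtil()
cacheDirPath = "./CACHE"
saveFilePath = os.path.join(cacheDirPath, "example-summary.json")
if os.access(saveFilePath, os.R_OK):
    dataD = mU.doImport(saveFilePath, fmt="json")   # fast path: reuse the cached file
else:
    dataD = {"1abc": {"UNP": ["P12345"]}}           # stand-in for the rebuilt mapping
    mU.mkdir(cacheDirPath)
    ok = mU.doExport(saveFilePath, dataD, fmt="json", indent=3)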
 def __getApi(self, dictLocators, **kwargs):
     """ Return an instance of a dictionary API instance for the input dictionary locator list.
     """
     consolidate = kwargs.get("consolidate", True)
      replaceDefinition = kwargs.get("replaceDefinition", True)
     verbose = kwargs.get("verbose", True)
     #
     ok = self.__reload(dictLocators,
                        self.__dirPath,
                        useCache=self.__useCache)
     #
     dApi = None
     if ok:
         mU = MarshalUtil()
         containerList = []
         for dictLocator in dictLocators:
             cacheFilePath = os.path.join(
                 self.__dirPath, self.__fileU.getFileName(dictLocator))
             containerList.extend(
                 mU.doImport(cacheFilePath, fmt="mmcif-dict"))
         #
         dApi = DictionaryApi(containerList=containerList,
                              consolidate=consolidate,
                              replaceDefinition=replaceDefinition,
                              verbose=verbose)
     return dApi
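A minimal sketch of the import-and-assemble pattern in __getApi(), assuming a locally cached dictionary file (the locator list is illustrative; DictionaryApi comes from the mmcif package).

from mmcif.api.DictionaryApi import DictionaryApi
from rcsb.utils.io.MarshalUtil import MarshalUtil

mU = MarshalUtil()
containerList = []
for dictLocator in ["./mmcif_pdbx_v5_next.dic"]:  # illustrative locator list
    containerList.extend(mU.doImport(dictLocator, fmt="mmcif-dict"))
dApi = DictionaryApi(containerList=containerList, consolidate=True, verbose=False)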
    def readDefaultDataTypeMap(self, locator, dataTyping="ANY"):
        """Read data file containing application default data type mapping

              data_rcsb_data_type_map
                loop_
                _pdbx_data_type_application_map.application_name
                _pdbx_data_type_application_map.type_code
                _pdbx_data_type_application_map.app_type_code
                _pdbx_data_type_application_map.app_precision_default
                _pdbx_data_type_application_map.app_width_default
                # .... type mapping data ...

        Return (dict):  map[cifType] -> appType, width, precision
                    mapD['cif_type_code'] -> ['application_name', 'app_type_code', 'app_precision_default', 'app_width_default', 'type_code']
        """
        try:
            #
            mapD = {}
            mU = MarshalUtil(workPath=self.__workPath)
            containerList = mU.doImport(locator, fmt="mmcif", enforceAscii=True, useCharRefs=True, raiseExceptions=True)

            for container in containerList:
                if container.getName() == "rcsb_data_type_map":
                    catObj = container.getObj("pdbx_data_type_application_map")
                    for ii in range(catObj.getRowCount()):
                        dD = catObj.getRowAttributeDict(ii)
                        if dD["application_name"] == dataTyping:
                            mapD[dD["type_code"]] = {k: dD[k] for k in ["app_type_code", "application_name", "type_code"]}
                            mapD[dD["type_code"]].update({k: int(dD[k]) for k in ["app_precision_default", "app_width_default"]})
            return mapD
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return {}
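For reference, the mapping returned by readDefaultDataTypeMap() has the following shape (values illustrative); note that the precision and width fields are cast to int by the reader.

mapD = {
    "float": {
        "application_name": "ANY",
        "type_code": "float",
        "app_type_code": "FLOAT",
        "app_precision_default": 6,
        "app_width_default": 10,
    }
}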
 def setUp(self):
     self.__startTime = time.time()
     self.__workPath = os.path.join(HERE, "test-output")
     self.__dataPath = os.path.join(HERE, "test-data")
     self.__cachePath = os.path.join(HERE, "test-output", "CACHE")
     self.__buildTypeList = [
         "oe-iso-smiles", "oe-smiles", "cactvs-iso-smiles", "cactvs-smiles",
         "inchi"
     ]
     #
     self.__mU = MarshalUtil(workPath=self.__cachePath)
      # Set the external environment for the wrapper class
     self.__testFlagFull = False
     self.__testStash = False
      os.environ["CHEM_SEARCH_CACHE_PATH"] = self.__cachePath
      os.environ["CHEM_SEARCH_CC_PREFIX"] = "cc-full" if self.__testFlagFull else "cc-abbrev"
     #
     self.__numMolsTest = 20
     logger.debug("Running tests on version %s", __version__)
     logger.info("Starting %s at %s", self.id(),
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #9
    def __init__(self, **kwargs):
        urlTargetPfam = kwargs.get(
            "urlTargetPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz"
        )
        urlTargetPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/Pfam-A.clans.tsv.gz"
        self.__version = "34.0"
        dirName = "pfam"
        cachePath = kwargs.get("cachePath", ".")
        dirPath = os.path.join(cachePath, dirName)
        super(PfamProvider, self).__init__(cachePath, [dirName])
        useCache = kwargs.get("useCache", True)
        #
        self.__mU = MarshalUtil(workPath=dirPath)
        self.__pfamD = self.__rebuildCache(urlTargetPfam, urlTargetPfamFB,
                                           dirPath, useCache)

        urlTargetMapPfam = kwargs.get(
            "urlTargetMapPfam",
            "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/database_files/pdb_pfamA_reg.txt.gz"
        )
        urlTargetMapPfamFB = "https://github.com/rcsb/py-rcsb_exdb_assets/raw/master/fall_back/Pfam/pdb_pfamA_reg.txt.gz"
        self.__pfamMapD = self.__rebuildMappingCache(urlTargetMapPfam,
                                                     urlTargetMapPfamFB,
                                                     dirPath, useCache)
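A hedged construction sketch for PfamProvider based on the kwargs handled above (the cachePath value is illustrative; downstream accessor methods are not shown in this excerpt).

pfP = PfamProvider(cachePath="./CACHE", useCache=True)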
Example #10
    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        """Create the schema definition file for a given database (i.e., the files under 'schema_definitions')

        Args:
            databaseName (str): database name (e.g., 'pdbx_comp_model_core')
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)
            saveSchema (bool, optional): whether to save the schema to schemaCachePath or not (default False)

        Returns:
            dict: schema definition dictionary

        """
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName,
                                 self.__cfgOb,
                                 cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(
                    databaseName, dataTyping=dataTyping)
                localPath = os.path.join(
                    self.__schemaCachePath,
                    self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath,
                            schemaDef,
                            fmt="json",
                            indent=3,
                            enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s",
                             databaseName, str(e))
        return schemaDef
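A companion sketch for makeSchemaDef(), with schP constructed as in the getJsonSchema() sketch above.

schemaDef = schP.makeSchemaDef("pdbx_core", dataTyping="ANY", saveSchema=True)
if schemaDef:
    print(len(schemaDef))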
Example #11
 def __parseFasta(self,
                  fastaPath,
                  taxonPath,
                  cachePath,
                  dirPath,
                  addTaxonomy=False):
     # input paths
     chemblTargetRawPath = os.path.join(dirPath, "chembl_targets_raw.fa.gz")
     mU = MarshalUtil(workPath=cachePath)
     oD = {}
     uD = {}
     missTax = 0
     taxonL = []
     try:
         if addTaxonomy:
             umP = UniProtIdMappingProvider(cachePath)
             umP.reload(useCache=True)
         #
         fD = mU.doImport(chemblTargetRawPath,
                          fmt="fasta",
                          commentStyle="default")
         #
         for seqId, sD in fD.items():
             chemblId = seqId.strip().split(" ")[0].strip()
             unpId = seqId[seqId.find("[") + 1:seqId.find("]")]
             seq = sD["sequence"]
             cD = {
                 "sequence": seq,
                 "uniprotId": unpId,
                 "chemblId": chemblId
             }
             if addTaxonomy:
                 taxId = umP.getMappedId(unpId, mapName="NCBI-taxon")
                 cD["taxId"] = taxId if taxId else -1
                 if not taxId:
                     missTax += 1
             #
             seqId = ""
             cL = []
             for k, v in cD.items():
                 if k in ["sequence"]:
                     continue
                 cL.append(str(v))
                 cL.append(str(k))
             seqId = "|".join(cL)
             oD[seqId] = cD
             if addTaxonomy:
                 taxonL.append("%s\t%s" % (seqId, taxId))
             #
             uD.setdefault(unpId, []).append(chemblId)
         #
         ok1 = mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
          ok2 = True
          if addTaxonomy:
              ok2 = mU.doExport(taxonPath, taxonL, fmt="list")
          return ok1 and ok2
     except Exception as e:
         logger.exception("Failing with %s", str(e))
     #
     return False
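The sequence-label construction in the loop above reduces to this runnable fragment (field values are illustrative); the label interleaves each value before its key and excludes the sequence itself.

cD = {"sequence": "MKT...", "uniprotId": "P12345", "chemblId": "CHEMBL2094", "taxId": 9606}
cL = []
for k, v in cD.items():
    if k in ["sequence"]:
        continue
    cL.append(str(v))
    cL.append(str(k))
seqId = "|".join(cL)
print(seqId)  # P12345|uniprotId|CHEMBL2094|chemblId|9606|taxId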
Example #12
 def testExtractAndSerialize(self):
     """ Test extraction on an example sequence cluster data set.
     """
     try:
         cdp = ClusterDataPrep(workPath=self.__workPath)
         cifD, docBySequenceD, docByClusterD = cdp.extract(
             self.__dataSetId,
             clusterSetLocator=self.__pathClusterData,
             levels=self.__levels,
             clusterType="entity")
         mU = MarshalUtil(workPath=self.__workPath)
         ok = mU.doExport(self.__pathSaveStyleCif,
                          cifD,
                          fmt="json",
                          indent=3)
         self.assertTrue(ok)
         ok = mU.doExport(self.__pathSaveStyleDocSequence,
                          docBySequenceD,
                          fmt="json",
                          indent=3)
         self.assertTrue(ok)
         ok = mU.doExport(self.__pathSaveStyleDocCluster,
                          docByClusterD,
                          fmt="json",
                          indent=3)
         self.assertTrue(ok)
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         self.fail()
Example #13
    def setUp(self):
        #
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                                  "dbload-setup-example.yml")
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=configPath,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)

        #
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        #
        self.__cacheKwargs = {"fmt": "json", "indent": 3}
        self.__exdbDirPath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
        #
        self.__mU = MarshalUtil()
        self.__entryLimitTest = 20
        #
        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #14
 def setUp(self):
     #
     self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                               "dbload-setup-example.yml")
     #
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName,
                               mockTopPath=self.__mockTopPath)
     #
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     #
     self.__cacheKwargs = {"fmt": "json", "indent": 3}
     self.__exdbCacheDirPath = os.path.join(
         self.__cachePath,
         self.__cfgOb.get("EXDB_CACHE_DIR", sectionName=configName))
     #
     # Reference sequence test data cache -
     #
     self.__refDbCachePath = os.path.join(HERE, "test-output",
                                          "unp-data-test-cache.json")
     self.__cacheKwargs = {"fmt": "json", "indent": 3}
     self.__useCache = False
     self.__fetchLimit = None
     #
     # Entity polymer extracted data ...
     #
     self.__entryLimit = 500
     #
     self.__mU = MarshalUtil()
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    def __init__(self, **kwargs):
        """Utilities build and deliver OE molecules for search applications. Source molecular
           definitions are taken from SMILES descriptors generated by ChemCompSearchIndexProvider()

        Args:
            cachePath (str, optional): path to the directory containing cache files (default: '.')
            ccFileNamePrefix (str, optional): file name prefix for the chemical component search index (default: "cc")
            oeFileNamePrefix (str, optional): file name prefix for all generated databases (default: "oe")

        """
        # Database file names will be prefixed with the base prefix plus the molecular build type and perception options
        oeFileNamePrefixBase = kwargs.get("oeFileNamePrefix", "oe")
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")  # default "cc" per the docstring
        limitPerceptions = kwargs.get("limitPerceptions", False)
        if limitPerceptions:
            self.__oeFileNamePrefix = oeFileNamePrefixBase + "-limit"
        else:
            self.__oeFileNamePrefix = oeFileNamePrefixBase
        #
        cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(cachePath, "oe_mol")
        #
        self.__fpDbD = {}
        self.__ssDb = None
        self.__oeMolD = {}
        self.__oeMolDb = None
        self.__oeMolDbTitleD = None
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__reload(**kwargs)
 def __init__(self, **kwargs):
     self.__dirPath = kwargs.get("holdingsDirPath", ".")
     useCache = kwargs.get("useCache", True)
     baseUrl = kwargs.get(
         "baseUrl",
         "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/development/fall_back/holdings/"
     )
      # Distinct kwarg keys are assumed below; the original fetched all four
      # URLs from the single key "currentTargetUrl" (a copy-paste slip).
      urlTargetContent = kwargs.get(
          "currentTargetUrl",
          os.path.join(baseUrl, "current_holdings.json.gz"))
      urlFallbackTargetContent = kwargs.get(
          "currentFallbackTargetUrl",
          os.path.join(baseUrl, "current_holdings.json.gz"))
      #
      urlTargetIds = kwargs.get(
          "currentIdsTargetUrl",
          os.path.join(baseUrl, "current_pdb_ids.json.gz"))
      urlFallbackTargetIds = kwargs.get(
          "currentIdsFallbackTargetUrl",
          os.path.join(baseUrl, "current_pdb_ids.json.gz"))
     #
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     self.__invD = self.__reloadEntryContent(urlTargetContent,
                                             urlFallbackTargetContent,
                                             self.__dirPath,
                                             useCache=useCache)
     self.__idD = self.__reloadEntryIds(urlTargetIds,
                                        urlFallbackTargetIds,
                                        self.__dirPath,
                                        useCache=useCache)
Example #17
 def __init__(self, **kwargs):
     #
     dirName = "glygen"
     cachePath = kwargs.get("cachePath", ".")
     self.__dirPath = os.path.join(cachePath, dirName)
     super(GlyGenProvider, self).__init__(cachePath, [dirName])
     useCache = kwargs.get("useCache", True)
     #
     baseUrl = kwargs.get(
         "glygenBasetUrl",
         "https://data.glygen.org/ln2data/releases/data/v-1.12.3/reviewed/")
     fallbackUrl = kwargs.get(
         "glygenFallbackUrl",
         "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/glygen/"
     )
     #
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     self.__glycanD = self.__reloadGlycans(baseUrl,
                                           fallbackUrl,
                                           self.__dirPath,
                                           useCache=useCache)
     self.__glycoproteinD = self.__reloadGlycoproteins(baseUrl,
                                                       fallbackUrl,
                                                       self.__dirPath,
                                                       useCache=useCache)
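A hedged construction sketch for GlyGenProvider per the kwargs above (the cachePath value is illustrative).

gP = GlyGenProvider(cachePath="./CACHE", useCache=True)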
 def setUp(self):
     self.__isMac = platform.system() == "Darwin"
      self.__doLoad = self.__isMac
     #
     #
     self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                               "dbload-setup-example.yml")
     #
     configName = "site_info_configuration"
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName,
                               mockTopPath=self.__mockTopPath)
     #
     self.__cachePath = os.path.join(TOPDIR, "CACHE")
     #
     self.__mU = MarshalUtil()
     #
     self.__readBackCheck = True
     self.__numProc = 2
     self.__chunkSize = 10
     self.__documentLimit = None
     self.__debugFlag = False
     self.__loadType = "full"
     self.__useCache = True
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    def __init__(self, **kwargs):
        """Utilities build and deliver OE molecule databases from PDB chemical component definition data
        Args:
            cachePath (str, optional): path to the directory containing cache files (default: '.')
            molBuildType (str,optional): data source for building OE molecules (default: "model-xyz")
            oeFileNamePrefix (str, optional) file name prefix for all generated databases (default: "oe")

        """
        # Database file names with be prefixed with base prefix plus the molecular build type and perception options
        oeFileNamePrefixBase = kwargs.get("oeFileNamePrefix", "oe")
        limitPerceptions = kwargs.get("limitPerceptions", False)
        molBuildType = kwargs.get("molBuildType", "model-xyz")
        if limitPerceptions and molBuildType in [
                "oe-smiles", "oe-iso-smiles", "inchi"
        ]:
            self.__oeFileNamePrefix = oeFileNamePrefixBase + "-" + molBuildType + "-limit"
        else:
            self.__oeFileNamePrefix = oeFileNamePrefixBase + "-" + molBuildType
        #
        cachePath = kwargs.get("cachePath", ".")
        self.__dirPath = os.path.join(cachePath, "oe_mol")
        #
        self.__fpDbD = {}
        self.__ssDb = None
        self.__oeMolD = {}
        self.__oeMolDb = None
        self.__oeMolDbTitleD = None
        #
        self.__mU = MarshalUtil(workPath=self.__dirPath)
        self.__molCount = self.__reload(**kwargs)
 def __init__(self, filePath, **kwargs):
     self.__filePath = filePath
     # Turn off warnings for missing values
     self.__verbose = kwargs.get("verbose", False)
     self.__tD = {}
     self.__mU = MarshalUtil()
     self.__byPassMode = not self.__setup(self.__filePath)
    def __init__(self, **kwargs):
        """Wrapper class for batch chemical search/depiction operations.

        Path and prefix data for the wrapper class may be set as keyword arguments
        or as environment variables.

        Args:
            ccUrlTarget (str, optional): path to concatenated chemical component definition file. Defaults to public data file.
            birdUrlTarget (str, optional): path to the concatenated BIRD definition file.  Defaults to public data file.
            cachePath (str): path to top-level cache directory used to store search index file dependencies
                             (default environment variable CHEM_SEARCH_CACHE_PATH or ".")
            numProc (int): multi-process cores to reserve. Defaults to 6.
            chunkSize (int): multi-process batch size.  Defaults to 50.
        """
        self.__startTime = time.time()
        #
        self.__useCache = kwargs.get("useCache", True)
        self.__numProc = kwargs.get("numProc", 6)
        self.__chunkSize = kwargs.get("chunkSize", 50)
        #
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", os.environ.get("CHEM_SEARCH_CC_PREFIX", "cc-full"))
        #
        self.__cachePath = kwargs.get("cachePath", os.environ.get("CHEM_SEARCH_CACHE_PATH", "."))
        # ---
        self.__mU = MarshalUtil(workPath=self.__cachePath)
        # ---
        self.__ccsw = self.__reload()
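Per the docstring, the cache path and prefix may also come from the environment; a sketch follows (the wrapper class name is assumed from context, and the values mirror the earlier setUp() example).

import os

os.environ["CHEM_SEARCH_CACHE_PATH"] = "./CACHE"
os.environ["CHEM_SEARCH_CC_PREFIX"] = "cc-abbrev"
ccsw = ChemCompSearchWrapper()  # class name assumed from context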
 def setUp(self):
     #
     #
     self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     configPath = os.path.join(TOPDIR, "rcsb", "mock-data", "config",
                               "dbload-setup-example.yml")
     #
      # Caution: this is a very site-specific setting!
     configName = "site_info_remote"
     self.__cfgOb = ConfigUtil(configPath=configPath,
                               defaultSectionName=configName,
                               mockTopPath=self.__mockTopPath)
     if configName != "site_info_configuration":
         self.__cfgOb.replaceSectionName("site_info_configuration",
                                         configName)
     #
     self.__workPath = os.path.join(HERE, "test-cache-preserve")
     #
     self.__entityPolymerCachePath = os.path.join(
         self.__workPath, "entity-polymer-data-cache.pic")
     self.__entityPolymerCacheKwargs = {"fmt": "pickle"}
     self.__useEntityPolymerCache = True
     #
     self.__refDbCachePath = os.path.join(self.__workPath,
                                          "unp-data-test-cache.json")
     self.__refDbCacheKwargs = {"fmt": "json", "indent": 3}
     #
     self.__refDbUseCache = True
     self.__fetchLimit = 500
     #
     self.__mU = MarshalUtil()
     #
     self.__startTime = time.time()
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    def setUp(self):
        self.__verbose = True
        #
        self.__mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        self.__pathConfig = os.path.join(TOPDIR, "rcsb", "db", "config",
                                         "exdb-config-example.yml")
        self.__cachePath = os.path.join(TOPDIR, "CACHE")
        self.__updateId = "2018_25"
        self.__export = False
        #
        configName = "site_info_configuration"
        self.__cfgOb = ConfigUtil(configPath=self.__pathConfig,
                                  defaultSectionName=configName,
                                  mockTopPath=self.__mockTopPath)
        self.__schP = SchemaProvider(self.__cfgOb,
                                     self.__cachePath,
                                     useCache=True)
        self.__sandboxPath = self.__cfgOb.getPath("RCSB_EXCHANGE_SANDBOX_PATH",
                                                  sectionName=configName)
        #
        self.__mU = MarshalUtil(workPath=self.__cachePath)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #24
 def testProviderReadValidationReport(self):
     mU = MarshalUtil()
      vpr = ValidationReportAdapter(dirPath=os.path.join(self.__workPath, "vprt"),
                                    useCache=False,
                                    clearCache=True)
     vrd = vpr.getReader()
     cL = mU.doImport(self.__exampleFileXray,
                      fmt="xml",
                      marshalHelper=vrd.toCif)
     ok = mU.doExport(self.__cifFileXray, cL, fmt="mmcif")
     self.assertTrue(ok)
     #
      vpr = ValidationReportAdapter(dirPath=os.path.join(self.__workPath, "vprt"),
                                    useCache=True,
                                    clearCache=False)
     vrd = vpr.getReader()
     xrt = mU.doImport(self.__exampleFileNmr, fmt="xml")
     cL = vrd.toCif(xrt)
     ok = mU.doExport(self.__cifFileNmr, cL, fmt="mmcif")
     self.assertTrue(ok)
     #
      vpr = ValidationReportAdapter(dirPath=os.path.join(self.__workPath, "vprt"),
                                    useCache=True,
                                    clearCache=False)
     vrd = vpr.getReader()
      xrt = mU.doImport(self.__exampleFileEm, fmt="xml")
     cL = vrd.toCif(xrt)
     ok = mU.doExport(self.__cifFileEm, cL, fmt="mmcif")
     self.assertTrue(ok)
 def testSubsetBuildMoleculeCacheFiltered(self):
     """Test construction of a filtered selection of chemical component definitions."""
     mU = MarshalUtil()
     fD = mU.doImport(self.__missedIdsPath, fmt="json")
     filterIdD = {ccId: True for ccId in fD["filteredIdList"]}
     self.__testBuildMoleculeCacheFiles(filterIdD=filterIdD,
                                        ccFileNamePrefix="cc-filtered")
 def __init__(self, **kwargs):
     #
     self.__cachePath = kwargs.get("cachePath", ".")
     self.__dirPath = os.path.join(self.__cachePath, "chem_comp")
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     self.__ccFileNamePrefix = kwargs.get("ccFileNamePrefix", "cc")
     self.__ccIdxD = self.__reload(**kwargs)
Example #27
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "mmcif_pdbx_v5_next.dic")
        self.__pathJsonTestFile = os.path.join(TOPDIR, "rcsb", "mock-data", "dictionaries", "vrpt_dictmap.json")
        self.__pathIndexFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_EXCHANGE_SANDBOX", "update-lists", "all-pdb-list")
        self.__pathCifFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_BIRD_CC_REPO", "0", "PRDCC_000010.cif")
        # self.__locatorCifFile = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/00/100d.cif.gz"
        self.__locatorCifFileBad = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/00/100dx.cif.gz"

        self.__locatorCifFile = "https://ftp.wwpdb.org/pub/pdb/data/structures/divided/mmCIF/hr/6hrg.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__pathSaveDictionaryFile = os.path.join(self.__workPath, "mmcif_pdbx_v5_next.dic")
        self.__pathSaveJsonTestFile = os.path.join(self.__workPath, "json-content.json")
        self.__pathSaveIndexFile = os.path.join(self.__workPath, "all-pdb-list")
        self.__pathSaveCifFile = os.path.join(self.__workPath, "cif-content.cif")
        #
        self.__pathFastaFile = os.path.join(TOPDIR, "rcsb", "mock-data", "MOCK_EXCHANGE_SANDBOX", "sequence", "pdb_seq_prerelease.fasta")
        self.__pathSaveFastaFile = os.path.join(self.__workPath, "test-pre-release.fasta")
        #

        self.__urlTarget = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"
        self.__urlTargetBad = "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump-missing.tar.gz"
        #
        self.__mU = MarshalUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(), time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #28
    def testSearchExec(self):
        """Test case:  search cli"""
        try:
            mL = glob.glob(os.path.join(self.__molFileDirPath, "*"))
            logger.info("search list length %d", len(mL))
            mU = MarshalUtil()
            ok = mU.doExport(self.__queryListFilePath, mL, fmt="list")
            self.assertTrue(ok)
            exU = ExecUtils()
            logger.info("Executing shell for %s", self.__queryListFilePath)
            cmdPath = os.path.join(TOPDIR, "rcsb", "utils", "ccdc",
                                   "CcdcSearchExec.py")

            logger.info("cmdPath %r", cmdPath)
            ok = exU.runShell(
                "%s %s --mol_list_path %s --result_path %s --search_type %s --csdhome %s"
                % (self.__pythonBinPath, cmdPath, self.__queryListFilePath,
                   self.__ssResultPath, "substructure", self.__csdHome),
                outPath=self.__logPath,
                outAppend=False,
                timeOut=60,
                suppressStderr=False,
            )
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
    def reloadDump(self, fmt="json"):
        """Reload PubChem reference data store from saved dump.

        Args:
            fmt (str, optional): format of the backup file (pickle or json). Defaults to "json".

        Returns:
            (int): number of objects restored.
        """
        numUpd = 0
        try:
            # Read from disk backup and update object store -
            if fmt in ["json", "pickle"]:
                fp = self.__getdumpFilePath(fmt="json")
                logger.info("Restoring object store from %s", fp)
                mU = MarshalUtil(workPath=self.__dirPath)
                matchD = mU.doImport(fp, fmt=fmt)
                numUpd = self.__reloadDump(
                    matchD,
                    self.__databaseName,
                    self.__matchIndexCollectionName,
                    indexAttributeNames=["rcsb_id", "rcsb_last_update"])
        except Exception as e:
            logger.exception("Failing for %r with %s", self.__dirPath, str(e))
        # --
        return numUpd
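A usage sketch for reloadDump(); the provider construction is not shown in this excerpt and is assumed.

numUpd = provider.reloadDump(fmt="json")  # provider: the enclosing store object (assumed)
print("restored %d objects" % numUpd)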
Example #30
 def __init__(self, **kwargs):
     #
     self.__cachePath = kwargs.get("cachePath", ".")
     self.__dirPath = os.path.join(self.__cachePath, "CARD-targets")
     #
     self.__mU = MarshalUtil(workPath=self.__dirPath)
     self.__oD, self.__version = self.__reload(self.__dirPath, **kwargs)