def __reload(self, dirPath, useCache, fmt):
        startTime = time.time()
        fD = {}

        ok = False
        cofactorPath = self.__getCofactorDataPath(fmt=fmt)
        #
        logger.info("useCache %r featurePath %r", useCache, cofactorPath)
        if useCache and self.__mU.exists(cofactorPath):
            fD = self.__mU.doImport(cofactorPath, fmt=fmt)
        else:
            fU = FileUtil()
            fU.mkdir(dirPath)
        # ---
        logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                    time.time() - startTime)
        return fD
 def __fetchUrl(self, urlTarget, dirPath, useCache=False):
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     filePath = os.path.join(dirPath, fn)
     if not (useCache and fU.exists(filePath)):
         startTime = time.time()
         ok2 = fU.get(urlTarget, filePath)
         endTime = time.time()
         if ok2:
             logger.info(
                 "Fetched %s for resource file %s (status = %r) (%.4f seconds)",
                 urlTarget, filePath, ok2, endTime - startTime)
         else:
             logger.error(
                 "Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)",
                 urlTarget, filePath, ok2, endTime - startTime)
     #
     return filePath
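The cache-aware fetch above can be used on its own; a minimal sketch of the same pattern, assuming FileUtil is imported from rcsb.utils.io (paths and URL are illustrative):

import logging
import os
import time

from rcsb.utils.io.FileUtil import FileUtil  # import path assumed

logger = logging.getLogger(__name__)

def fetchIfMissing(urlTarget, dirPath, useCache=True):
    """Download urlTarget into dirPath unless a cached copy is already present."""
    fU = FileUtil()
    fU.mkdir(dirPath)
    filePath = os.path.join(dirPath, fU.getFileName(urlTarget))
    if not (useCache and fU.exists(filePath)):
        startTime = time.time()
        ok = fU.get(urlTarget, filePath)
        logger.info("Fetch status %r for %s (%.4f seconds)", ok, urlTarget, time.time() - startTime)
    return filePath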
Example #3
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetMechanismFilePath = self.getTargetMechanismDataPath()
     #
     if useCache and fU.exists(targetMechanismFilePath):
         logger.info("useCache %r using %r", useCache,
                     targetMechanismFilePath)
         qD = self.__mU.doImport(targetMechanismFilePath, fmt="json")
         aD = qD["mechanism"] if "mechanism" in qD else {}
     #
     logger.info("Completed reload of (%d) at %s (%.4f seconds)", len(aD),
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return aD
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     fD = {}
     ok = False
     mappingPath = self.__getMappingDataPath()
     #
     logger.info("useCache %r mappingPath %r", useCache, mappingPath)
     if useCache and self.__mU.exists(mappingPath):
         fD = self.__mU.doImport(mappingPath, fmt="json")
         ok = True
     else:
         fU = FileUtil()
         fU.mkdir(dirPath)
     # ---
     logger.info("Completed reload with status (%r) at %s (%.4f seconds)",
                 ok, time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     return fD
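The reload methods above all follow the same cache idiom: when useCache is set and the cache file exists, import it; otherwise just create the cache directory and return an empty container so a later rebuild can populate it. A condensed sketch (import paths assumed):

import logging

from rcsb.utils.io.FileUtil import FileUtil      # import paths assumed
from rcsb.utils.io.MarshalUtil import MarshalUtil

logger = logging.getLogger(__name__)

def reloadCached(cacheFilePath, dirPath, useCache=True):
    mU = MarshalUtil()
    fD = {}
    if useCache and mU.exists(cacheFilePath):
        fD = mU.doImport(cacheFilePath, fmt="json")
    else:
        FileUtil().mkdir(dirPath)  # prepare the directory for a subsequent rebuild/export
    logger.info("Loaded %d cached entries from %r", len(fD), cacheFilePath)
    return fD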
Example #5
    def put(self, localPath, remotePath):
        """Put a local file on a remote FTP server.

        Arguments:
            localPath (str): local file path
            remotePath (str): remote file path

        Returns:
            bool: True for success or False otherwise
        """
        try:
            # First, make sure the provided localPath represents a file, not a directory
            if not os.path.isfile(localPath):
                logger.error(
                    "put failing for localPath %s - path must be to a specific file, not a directory.",
                    localPath)
                return False

            fileU = FileUtil()
            remotePathDir = fileU.getFilePath(remotePath)
            self.mkdir(remotePathDir)
            # If provided remotePath already exists and is a directory, put the file on the remote server using the local filename
            # to avoid unintentionally overwriting an entire remote directory with a single file
            if (os.path.exists(remotePath) and os.path.isdir(remotePath)):
                localFileName = fileU.getFileName(localPath)
                remoteFilePath = os.path.join(remotePath, localFileName)
            else:
                remoteFilePath = remotePath
            with open(localPath, 'rb') as lFP:
                self.__ftpClient.storbinary('STOR %s' % remoteFilePath, lFP)
            if remoteFilePath in self.listdir(remotePathDir):
                return True
            else:
                logger.error("put failing for localPath %s remoteFilePath %s",
                             localPath, remoteFilePath)
                return False
        except Exception as e:
            if self.__raiseExceptions:
                raise e
            else:
                logger.error(
                    "put failing for localPath %s  remotePath %s with %s",
                    localPath, remotePath, str(e))
                return False
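For reference, the storbinary/retrbinary calls wrapped by put() and get() map directly onto Python's standard ftplib; a minimal, self-contained round trip (host, credentials, and file names are placeholders):

from ftplib import FTP

def ftpRoundTrip(host, user, password, localPath, remoteName):
    with FTP(host) as ftp:
        ftp.login(user, password)
        with open(localPath, "rb") as ifh:
            ftp.storbinary("STOR %s" % remoteName, ifh)        # upload
        with open(localPath + ".copy", "wb") as ofh:
            ftp.retrbinary("RETR %s" % remoteName, ofh.write)  # download it back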
 def __pharosFixture(self):
     try:
         ok = False
         fU = FileUtil()
         srcPath = os.path.join(self.__dataPath, "Pharos")
         dstPath = os.path.join(self.__cachePath, "Pharos-targets")
         for fn in ["drug_activity", "cmpd_activity", "protein"]:
             inpPath = os.path.join(srcPath, fn + ".tdd.gz")
             outPath = os.path.join(dstPath, fn + ".tdd.gz")
             fU.get(inpPath, outPath)
             fU.uncompress(outPath, outputDir=dstPath)
             fU.remove(outPath)
         fU.put(os.path.join(srcPath, "pharos-readme.txt"),
                os.path.join(dstPath, "pharos-readme.txt"))
         ok = True
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         ok = False
     return ok
Example #7
 def __reload(self, dirPath, baseVersion, useCache, **kwargs):
     startTime = time.time()
     mU = MarshalUtil(workPath=dirPath)
     chemblDbUrl = kwargs.get(
         "ChEMBLDbUrl",
         "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
     ok = False
     fU = FileUtil()
     fU.mkdir(dirPath)
     #
     # ChEMBL current version <baseVersion>,...
     # template:  chembl_<baseVersion>.fa.gz
     #
     targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
     mappingFileName = "chembl_uniprot_mapping.txt"
     #
     chemblTargetPath = os.path.join(dirPath, targetFileName)
     chemblMappingPath = os.path.join(dirPath, mappingFileName)
     mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
     #
     mapD = {}
     if useCache and fU.exists(mappingFilePath):
         logger.info("useCache %r using %r and %r and %r", useCache,
                     chemblTargetPath, chemblMappingPath, mappingFilePath)
         mapD = mU.doImport(mappingFilePath, fmt="json")
     else:
         # Get the ChEMBL UniProt mapping file
         url = os.path.join(chemblDbUrl, mappingFileName)
         ok = fU.get(url, chemblMappingPath)
         logger.info("Fetched %r url %s path %s", ok, url,
                     chemblMappingPath)
         logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
         rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
         for row in rowL:
             mapD[row[0]] = (row[1], row[2], row[3])
         ok = mU.doExport(mappingFilePath, mapD, fmt="json")
         logger.info("Processed mapping path %s (%d) %r", mappingFilePath,
                     len(mapD), ok)
         #
         # Get the target FASTA files --
         for vers in range(baseVersion, baseVersion + 10):
             logger.info("Now fetching version %r", vers)
             self.__version = vers
             targetFileName = "chembl_" + str(vers) + ".fa.gz"
             chemblTargetPath = os.path.join(dirPath,
                                             "chembl_targets_raw.fa.gz")
             url = os.path.join(chemblDbUrl, targetFileName)
             ok = fU.get(url, chemblTargetPath)
             logger.info("Fetched %r url %s path %s", ok, url,
                         chemblTargetPath)
             if ok:
                 break
     #
     logger.info("Completed reload at %s (%.4f seconds)",
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return mapD
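The mapping import above keys each tab-delimited row by its first column and keeps the next three fields as the value; a toy illustration of the resulting mapD shape (sample values are made up):

rowL = [
    ["P12345", "CHEMBL0000", "Example target", "SINGLE PROTEIN"],  # made-up row
]
mapD = {}
for row in rowL:
    mapD[row[0]] = (row[1], row[2], row[3])
# mapD == {"P12345": ("CHEMBL0000", "Example target", "SINGLE PROTEIN")}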
Example #8
    def __processAppendedSections(self,
                                  appendConfigOption,
                                  cachePath,
                                  useCache=True):
        """Fetch and append configuration assets assigned to input configuration option.

        Args:
            appendConfigOption (str): reserved configuration option to hold a list of configuration asset locators
            cachePath (str): path to store cached copies of configuration assets
            useCache (bool, optional): use existing cached configuration assets. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        appendLocL = []
        try:
            ret = True
            appendLocL = self.getList(appendConfigOption,
                                      sectionName=self.__defaultSectionName)
            logger.debug("appendLocL is %r", appendLocL)
            if appendLocL:
                cP = os.path.join(cachePath, "config")
                fU = FileUtil(workPath=cP)
                logger.debug("Fetching append sections from %r", appendLocL)
                for appendLoc in appendLocL:
                    fn = fU.getFileName(appendLoc)
                    fp = os.path.join(cP, fn)
                    okF = True
                    if not (useCache and fU.exists(fp)):
                        # get a fresh copy from source
                        okF = fU.get(appendLoc, fp)
                        logger.debug("Fetched %r to %r", appendLoc, fp)
                    ok = self.appendConfig(fp)
                    ret = ret and ok and okF
        except Exception as e:
            logger.exception("Failing for option %r cachePath %r with %s",
                             appendConfigOption, cachePath, str(e))
            ret = False
        #
        if not ret:
            logger.error("Fetching appended sections failing %r", appendLocL)

        return ret
 def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
     invD = {}
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     fp = os.path.join(dirPath, fn)
     self.__mU.mkdir(dirPath)
     #
     if useCache and self.__mU.exists(fp):
         invD = self.__mU.doImport(fp, fmt="json")
         logger.debug("Reading cached inventory (%d)", len(invD))
     else:
         logger.info("Fetch inventory from %s", urlTarget)
         ok = fU.get(urlTarget, fp)
         if not ok:
             ok = fU.get(urlFallbackTarget, fp)
         #
         if ok:
             invD = self.__mU.doImport(fp, fmt="json")
     #
     return invD
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb",
                                                     "mock-data",
                                                     "dictionaries",
                                                     "mmcif_pdbx_v5_next.dic")

        self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                               "NCBI", "names.dmp.gz")
        self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
        #
        self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__inpDirPath = os.path.join(HERE, "test-data")
        self.__fileU = FileUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
Example #11
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Utilities to access and update provenance details.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__useCache = useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")
        self.__provenanceCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR", sectionName=self.__configName))
        self.__provenanceLocator = self.__cfgOb.getPath("PROVENANCE_INFO_LOCATOR", sectionName=self.__configName)
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__fileU.mkdir(self.__provenanceCachePath)
        self.__kwargs = kwargs
    def stashDependencies(self,
                          url,
                          dirPath,
                          bundleLabel="A",
                          userName=None,
                          pw=None):
        """Store a copy of the bundled search dependencies remotely -

        Args:
            url (str): URL string for the destination host (e.g. sftp://myserver.net or None for a local file)
            dirPath (str): directory path on the remote resource
            bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            pw (str, optional): optional access information. Defaults to None.

        Returns:
          bool:  True for success or False otherwise

        """
        try:
            ok = False
            fn = self.__makeBundleFileName(self.__dependFileName,
                                           bundleLabel=bundleLabel)
            if url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                hostName = url[7:]  # strip the "sftp://" prefix
                ok = sftpU.connect(hostName, userName, pw=pw, port=22)
                if ok:
                    remotePath = os.path.join("/", dirPath, fn)
                    ok = sftpU.put(self.__dependTarFilePath, remotePath)
            elif not url:
                fileU = FileUtil()
                remotePath = os.path.join(dirPath, fn)
                ok = fileU.put(self.__dependTarFilePath, remotePath)
            else:
                logger.error("Unsupported stash protocol %r", url)
            return ok
        except Exception as e:
            logger.exception("For %r %r failing with %s", url, dirPath, str(e))
        return False
Example #13
    def __reload(self, urlTarget, dirPath, useCache=True):
        """Reload local cache of mapping resources to support validation report reader and translator.

        Args:
            urlTarget (str): URL for schema mapping file
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (dict): schema mapping data
        """
        mapD = {}
        #
        mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        mappingFilePath = os.path.join(dirPath, fn)
        mU.mkdir(dirPath)
        #
        # if not useCache:
        #     for fp in [mappingFilePath]:
        #         try:
        #             os.remove(fp)
        #         except Exception:
        #             pass
        # #
        logger.debug("Loading validation mapping data in %s (useCache %r)", fn,
                     useCache)
        if useCache and fU.exists(mappingFilePath):
            mapD = mU.doImport(mappingFilePath, fmt="json")
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        mappingFilePath)
            tS = uuid.uuid4().hex
            tP = os.path.join(dirPath, "._" + tS)
            ok = fU.get(urlTarget, tP)
            if ok:
                mapD = mU.doImport(tP, fmt="json")
                os.replace(tP, mappingFilePath)
        return mapD
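The fetch branch above protects an existing cache file from a partial download by writing to a uuid-named temporary file and promoting it with os.replace only on success; the same pattern in isolation (import path assumed):

import os
import uuid

from rcsb.utils.io.FileUtil import FileUtil  # import path assumed

def atomicFetch(urlTarget, destPath):
    fU = FileUtil()
    dirPath = os.path.dirname(destPath)
    fU.mkdir(dirPath)
    tmpPath = os.path.join(dirPath, "._" + uuid.uuid4().hex)
    ok = fU.get(urlTarget, tmpPath)
    if ok:
        os.replace(tmpPath, destPath)  # atomic rename; replaces any existing file
    else:
        try:
            os.remove(tmpPath)
        except OSError:
            pass
    return ok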
Example #14
 def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                    useCache):
     pfamD = {}
     fmt = "json"
     ext = fmt if fmt == "json" else "pic"
     pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
     #
     logger.debug("Using cache data path %s", dirPath)
     self.__mU.mkdir(dirPath)
     #
     if useCache and self.__mU.exists(pfamDataPath):
         pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
         logger.debug("Pfam data length %d", len(pfamD))
     elif not useCache:
         # ------
         fU = FileUtil()
         logger.info("Fetch data from source %s in %s", urlTargetPfam,
                     dirPath)
         fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
         ok = fU.get(urlTargetPfam, fp)
         if not ok:
             fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
             ok = fU.get(urlTargetPfamFB, fp)
             logger.info("Fetch data fallback fetch status is %r", ok)
         pfamD = self.__getPfamIndex(fp)
         ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
         logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                     ok)
         # ------
     #
     return pfamD
Example #15
 def __reload(self, dirPath, **kwargs):
     startTime = time.time()
     fD = {}
     useCache = kwargs.get("useCache", True)
     ok = False
     cofactorPath = self.__getCofactorDataPath()
     #
     logger.info("useCache %r cofactorPath %r", useCache, cofactorPath)
     if useCache and self.__mU.exists(cofactorPath):
         fD = self.__mU.doImport(cofactorPath, fmt="json")
         ok = True
     else:
         fU = FileUtil()
         fU.mkdir(dirPath)
     # ---
     numCofactors = len(fD["cofactors"]) if fD and "cofactors" in fD else 0
     logger.info(
         "Completed reload of (%d) cofactors with status (%r) at %s (%.4f seconds)",
         numCofactors, ok,
         time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
         time.time() - startTime)
     return fD
    def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(interProDataPath):
            rD = self.__mU.doImport(interProDataPath, fmt=fmt)
            interProD = rD["index"]
            interProParentD = rD["parents"]
            logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
            ok = fU.get(urlTargetInterPro, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
                ok = fU.get(urlTargetInterProFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProD = self.__getInterProIndex(fp)

            logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
            # ------
            logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
            ok = fU.get(urlTargetInterProParent, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
                ok = fU.get(urlTargetInterProParentFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProParentD = self.__getInterProParents(fp)
            #
            ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
        #
        return interProD, interProParentD
Example #17
    def fetchBundle(self, localRestoreDirPath, url, remoteDirPath, remoteStashPrefix="A", userName=None, password=None):
        """Restore bundled dependencies from remote storage and unbundle these in the
           current local cache directory.

        Args:
            localRestoreDirPath (str): local restore path
            url (str): remote URL
            remoteDirPath (str): remote directory path on the remote resource
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            password (str, optional): optional access information. Defaults to None.
        """
        try:
            ok = False
            fileU = FileUtil()
            fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            if not url:
                remotePath = os.path.join(remoteDirPath, fn)
                ok = fileU.get(remotePath, self.__localStashTarFilePath)

            elif url and (url.startswith("http://") or url.startswith("https://")):
                remotePath = url + os.path.join("/", remoteDirPath, fn)
                ok = fileU.get(remotePath, self.__localStashTarFilePath)

            elif url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                ok = sftpU.connect(url[7:], userName, pw=password, port=22)
                if ok:
                    remotePath = os.path.join(remoteDirPath, fn)
                    ok = sftpU.get(remotePath, self.__localStashTarFilePath)
            else:
                logger.error("Unsupported protocol %r", url)
            if ok:
                ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
            return ok
        except Exception as e:
            logger.exception("For %r %r Failing with %s", url, remoteDirPath, str(e))
            ok = False
        return ok
Example #18
    def __init__(self,
                 cfgOb,
                 cachePath,
                 useCache=True,
                 rebuildFlag=False,
                 **kwargs):
        """A collection of schema build and caching methods.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
            rebuildFlag (bool, optional): on-the-fly rebuild and cache schema
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = os.path.abspath(cachePath)
        self.__useCache = useCache
        self.__rebuildFlag = rebuildFlag
        self.__useCache = rebuildFlag if rebuildFlag else useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")

        self.__fileU = FileUtil(
            workPath=os.path.join(self.__cachePath, "work"))
        self.__schemaCachePath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR",
                             sectionName=self.__configName))
        self.__jsonSchemaCachePath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR",
                             sectionName=self.__configName))
        self.__fileU.mkdir(self.__schemaCachePath)
        self.__fileU.mkdir(self.__jsonSchemaCachePath)
        self.__kwargs = kwargs
Example #19
 def __reload(self, dirPath, useCache):
     startTime = time.time()
     aD = {}
     allIdD = {}
     fU = FileUtil()
     fU.mkdir(dirPath)
     targetActivityFilePath = self.getTargetActivityDataPath()
     #
     if useCache and fU.exists(targetActivityFilePath):
         logger.info("useCache %r using %r", useCache, targetActivityFilePath)
         qD = self.__mU.doImport(targetActivityFilePath, fmt="json")
         aD = qD["activity"] if "activity" in qD else {}
         idL = qD["all_ids"] if "all_ids" in qD else []
         allIdD = {k: k in aD for k in idL}
     #
     logger.info(
         "Completed reload (%d activities) (%d tried identifiers) at %s (%.4f seconds)",
         len(aD),
         len(allIdD),
         time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
         time.time() - startTime,
     )
     #
     return aD, allIdD
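The allIdD comprehension above records, for every identifier previously tried, whether activity data was actually recovered; for example:

aD = {"ID_A": [{"value": 1.2}]}        # identifiers with recovered activities
idL = ["ID_A", "ID_B"]                 # all identifiers that were tried
allIdD = {k: k in aD for k in idL}
# allIdD == {"ID_A": True, "ID_B": False}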
    def buildDependenices(self, ccUrlTarget, birdUrlTarget, **kwargs):
        """Convenience method to build configuration and static dependencies for the chemical search services.

        Args:
            ccUrlTarget (str): path to source concatenated chemical component definition file
            birdUrlTarget (str): path to the source concatenated BIRD definition file

            Other options are propagated to configurations of the wrapped classes in __bootstrapConfig()

        """
        try:
            okT = False
            ok1 = self.setConfig(ccUrlTarget=ccUrlTarget,
                                 birdUrlTarget=birdUrlTarget,
                                 **kwargs)
            useCache = kwargs.get("useCache", False)
            ok2 = self.updateChemCompIndex(useCache=useCache)
            ok3 = self.updateSearchIndex(useCache=useCache)
            ok4 = self.updateSearchMoleculeProvider(useCache=useCache)
            okBuild = ok1 and ok2 and ok3 and ok4
            if okBuild:
                fileU = FileUtil()
                dirPathList = [
                    os.path.join(self.__cachePath, subDir)
                    for subDir in ["chem_comp", "oe_mol", "config"]
                ]
                okT = fileU.bundleTarfile(self.__dependTarFilePath,
                                          dirPathList,
                                          mode="w:gz",
                                          recursive=True)
            #
            return okT and okBuild
        except Exception as e:
            logger.exception("Failing build with %r and %r with %s",
                             ccUrlTarget, birdUrlTarget, str(e))
        return False
 def __fetchFromSource(self, urlTarget):
     """Fetch the classification names and domain assignments from the ECOD repo."""
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     fp = os.path.join(self.__dirPath, fn)
     if not fU.exists(fp):
         fU.get(urlTarget, fp)
     #
     with open(fp, "r", encoding="utf-8") as ifh:
         # The version string is the last token on the third header line
         line = ifh.readline()
         line = ifh.readline()
         line = ifh.readline()
         ff = line[:-1].split()
         self.__version = ff[-1]
     #
     nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
     fU.remove(fp)
     #
     return nmL
Example #22
    def pushBundle(self, gitRepositoryPath, accessToken, gitHost="github.com", gitBranch="master", remoteStashPrefix="A", maxSizeMB=95):
        """Push bundle to remote stash git repository.

        Args:
            gitRepositoryPath (str): git repository path (e.g., rcsb/py-rcsb_exdb_assets_stash)
            accessToken (str): git repository access token
            gitHost (str, optional): git repository host name. Defaults to github.com.
            gitBranch (str, optional): git branch name. Defaults to master.
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            maxSizeMB (int, optional): maximum stash bundle file size that will be committed. Defaults to 95MB.

        Returns:
          bool:  True for success or False otherwise

        """
        try:
            ok = False
            gU = GitUtil(token=accessToken, repositoryHost=gitHost)
            fU = FileUtil()
            localRepositoryPath = os.path.join(self.__localBundlePath, "stash_repository")
            fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            #
            # Update existing local repository, otherwise clone a new copy
            if fU.exists(localRepositoryPath):
                ok = gU.pull(localRepositoryPath, branch=gitBranch)
                logger.debug("After pull status %r", gU.status(localRepositoryPath))
            else:
                ok = gU.clone(gitRepositoryPath, localRepositoryPath, branch=gitBranch)
            #
            # Split all bundles
            mbSize = float(fU.size(self.__localStashTarFilePath)) / 1000000.0
            logger.info("Splitting bundle %r (%.3f MB/Max %d MB)", fn, mbSize, maxSizeMB)
            sj = SplitJoin()
            splitDirPath = os.path.join(localRepositoryPath, "stash", fn[:-7])
            sj.split(self.__localStashTarFilePath, splitDirPath, maxSizeMB=maxSizeMB)
            fU.remove(self.__localStashTarFilePath)
            # else:
            # fU.put(self.__localStashTarFilePath, os.path.join(localRepositoryPath, "stash", fn))

            ok = gU.addAll(localRepositoryPath, branch=gitBranch)
            ok = gU.commit(localRepositoryPath, branch=gitBranch)
            logger.debug("After commit status %r", gU.status(localRepositoryPath))
            #
            if accessToken:
                ok = gU.push(localRepositoryPath, branch=gitBranch)
                logger.info("After push status %r", gU.status(localRepositoryPath))
            #
            return ok
        except Exception as e:
            logger.exception("For %r %r failing with %s", gitHost, gitRepositoryPath, str(e))
        return False
Example #23
    def setUp(self):
        self.__workPath = os.path.join(HERE, "test-output")
        #
        self.__testLogFileMin = os.path.join(self.__workPath,
                                             "logfile-min.json")
        self.__testLogFileDetailed = os.path.join(self.__workPath,
                                                  "logfile-detailed.json")
        fU = FileUtil()
        fU.remove(self.__testLogFileMin)
        fU.remove(self.__testLogFileDetailed)

        self.__startTime = time.time()
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
    def testAABuildDependenciesAndStash(self):
        """Test case - build, stash and restore dependencies -"""
        try:
            ccsw = ChemCompSearchWrapper()
            ccUrlTarget = os.path.join(
                self.__dataPath,
                "components-abbrev.cif") if not self.__testFlagFull else None
            birdUrlTarget = os.path.join(
                self.__dataPath,
                "prdcc-abbrev.cif") if not self.__testFlagFull else None
            ok = ccsw.buildDependenices(ccUrlTarget=ccUrlTarget,
                                        birdUrlTarget=birdUrlTarget)
            self.assertTrue(ok)
            #
            if self.__testStash:
                url = "sftp://bl-east.rcsb.org"
                userName = ""
                pw = ""
                dirPath = "4-coastal"
                ok = ccsw.stashDependencies(url,
                                            dirPath,
                                            userName=userName,
                                            pw=pw)
                self.assertTrue(ok)
                #
                fileU = FileUtil()
                fileU.remove(self.__cachePath)
                #
                url = "http://bl-east.rcsb.org"
                ok = ccsw.restoreDependencies(url, dirPath)
                #
                fileU.remove(self.__cachePath)
                #
                url = "sftp://bl-east.rcsb.org"
                ok = ccsw.restoreDependencies(url,
                                              dirPath,
                                              userName=userName,
                                              pw=pw)
                self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
Example #25
    def __reload(self, urlTarget, dirPath, useCache=True):
        """ Reload input GO OBO ontology file and return a nx graph object.
'
        Returns:
            dictionary[goId] = {'name_list': ... , 'id_list': ... 'depth_list': ... }
        """
        goGraph = None
        #
        # mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        oboFilePath = os.path.join(dirPath, fn)
        fU.mkdir(dirPath)
        #
        if not useCache:
            for fp in [oboFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and fU.exists(oboFilePath):
            goGraph = obonet.read_obo(oboFilePath)
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        oboFilePath)
            ok = fU.get(urlTarget, oboFilePath)
            if ok:
                goGraph = obonet.read_obo(oboFilePath)
        if goGraph:
            logger.info("Reading %d nodes and %d edges", len(goGraph),
                        goGraph.number_of_edges())
        else:
            logger.info("Go graph construction failing")
        #
        return goGraph
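Once read, goGraph is a networkx graph keyed by GO identifiers; a minimal sketch of querying it (node attribute names follow the usual obonet conventions, and the file path is illustrative):

import obonet

goGraph = obonet.read_obo("go-basic.obo")  # local OBO file; read_obo also accepts URLs
nameD = {goId: data.get("name") for goId, data in goGraph.nodes(data=True)}
print(len(goGraph), goGraph.number_of_edges(), nameD.get("GO:0008150"))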
Example #26
    def get(self, remotePath, localPath):
        """Get a file from a remote FTP server.

        Arguments:
            remotePath (str): remote file path
            localPath (str): local file path

        Returns:
            bool: True for success or False otherwise
        """
        try:
            fileU = FileUtil()
            fileU.mkdirForFile(localPath)
            # If provided localPath already exists and is a directory, retrieve the file using the name on the remote server
            # to avoid unintentionally overwriting an entire local directory with a single retrieved file
            if (os.path.exists(localPath) and os.path.isdir(localPath)):
                remoteFileName = fileU.getFileName(remotePath)
                localFilePath = os.path.join(localPath, remoteFileName)
            else:
                localFilePath = localPath
            with open(localFilePath, 'wb') as lFP:
                self.__ftpClient.retrbinary('RETR %s' % remotePath, lFP.write)
            ok = fileU.exists(localFilePath)
            if ok:
                return True
            else:
                logger.error("get failing for remotePath %s localFilePath %s",
                             remotePath, localFilePath)
                return False
        except Exception as e:
            if self.__raiseExceptions:
                raise e
            else:
                logger.error(
                    "get failing for remotePath %s localPath %s with %s",
                    remotePath, localPath, str(e))
                return False
Example #27
class SchemaProvider(SingletonClass):
    """ A collection of schema build and caching methods.

        Static cache workflow:

            <authoritative source>  <--  <cache dir>  <--  client API

        Compute workflow:

        <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <JSON schema>

    """

    def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs):
        """A collection of schema build and caching methods.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
            rebuildFlag (bool, optional): on-the-fly rebuild and cache schema
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = os.path.abspath(cachePath)
        self.__useCache = useCache
        self.__rebuildFlag = rebuildFlag
        self.__useCache = rebuildFlag if rebuildFlag else useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")

        self.__fileU = FileUtil(workPath=os.path.join(self.__cachePath, "work"))
        self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__fileU.mkdir(self.__schemaCachePath)
        self.__fileU.mkdir(self.__jsonSchemaCachePath)
        self.__kwargs = kwargs

    def getSchemaOptions(self, schemaLevel, extraOpts=None):
        opts = extraOpts + "|" if extraOpts else ""
        if schemaLevel == "full":
            return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb"
        elif schemaLevel in ["min", "minimum"]:
            return opts + "mandatoryKeys|enums|rcsb"
        else:
            return opts

    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name  (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list


        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            if self.__rebuildFlag:
                filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
            else:
                filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)

            if not filePath:
                logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(collectionName)

        except Exception as e:
            logger.exception("Retreiving schema %s for %s failing with %s", databaseName, dataTyping, str(e))

        return sd, dbName, collectionNameList, docIndexD

    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare computed schema defintion with current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath

    def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
        """Compare computed JSON schema defintion with current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)

        return diffPath

    def __getSchemaVersion(self, jsonSchema):
        try:
            comment = jsonSchema["$comment"] if "$comment" in jsonSchema else ""
            ff = comment.split(":")
            version = ff[1].strip()
            return version
        except Exception as e:
            logger.exception("Failing for with %s", str(e))
        return ""

    def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"):
        """Internal method returning schema definition path for the input content type and application.
           Defines schema definition naming convention -

           Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

            Returns:

             str: schema definition file locator

        """
        schemaLocator = None
        try:
            locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
            fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper())
            schemaLocator = os.path.join(locPath, fn)
        except Exception as e:
            logger.exception("Retreiving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e))
        return schemaLocator

    def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"):
        """Internal method returning JSON schema path for the input collection data type convention and level.
           Defines the JSON/BSON schema naming convention -

           Args:
            databaseName (str): database name in the document store
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

            Returns:

            str: schema file locator

        """
        schemaLocator = None
        try:
            sdType = None
            sLevel = None
            schemaLocator = None
            if encodingType.upper() in ["JSON", "BSON"]:
                sdType = encodingType.lower()
            if level.lower() in ["min", "minimun"]:
                sLevel = "min"
            elif level.lower() in ["full"]:
                sLevel = level.lower()
            #
            if sdType and sLevel:
                locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
                fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName)
                schemaLocator = os.path.join(locPath, fn)
            else:
                logger.error("Unsupported schema options:  %s level %r type %r", collectionName, level, encodingType)
                schemaLocator = None
        except Exception as e:
            logger.debug("Retreiving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e))
        #
        return schemaLocator

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Target cache filePath %s", filePath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.info("Fetch data from source %s to %s", locator, filePath)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.and

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

        Returns:
            dict: Schema object

        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj

    def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None):
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            #
            cD = None
            stU = encodingType.upper()
            cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts))
            if cD and saveSchema:
                schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
                localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e))
        return cD

    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef

    def schemaCompare(self, orgD, newD):
        """ Compute the difference of nested dictionaries.

        """
        fOrgD = self.__flatten(orgD)
        fNewD = self.__flatten(newD)
        if len(fOrgD) != len(fNewD):
            logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD))
        #
        addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)}
        removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)}
        changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        chD = {}
        for ky in changedOrgD:
            kyS = ".".join(ky)
            vOrg = changedOrgD[ky]
            vNew = changedNewD[ky]
            if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)):
                # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew)
                dV = list(set(vNew) - set(vOrg))
                if dV:
                    chD[kyS] = {"diff": dV}
            else:
                chD[kyS] = {"from": vOrg, "to": vNew}
        #
        nT = len(addedD) + len(removedD) + len(chD)
        diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD}
        return nT, diffD

    def __flatten(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, (list, tuple)) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenX(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        # separator = "."
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict

    def __flattenOrg(self, inpDict, separator=".", prefix=""):
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flattenOrg(value, separator, prefix + key + separator)
                outDict.update({key2: val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator)
                        outDict.update({key2: val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[prefix + key + separator + str(index)] = value
            else:
                outDict[prefix + key] = value
        return outDict

    def __dictGen(self, indict, pre=None):
        pre = pre[:] if pre else []
        if isinstance(indict, dict):
            for key, value in indict.items():
                if isinstance(value, dict):
                    for dD in self.__dictGen(value, pre + [key]):
                        yield dD
                elif isinstance(value, list) or isinstance(value, tuple):
                    for v in value:
                        for dD in self.__dictGen(v, pre + [key]):
                            yield dD
                else:
                    yield pre + [key, value]
        else:
            yield indict
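schemaCompare above works by flattening both schema dictionaries into path-keyed maps and diffing the key sets; a standalone toy run showing the shape of the result (data are made up, and list handling is simplified relative to the method above):

def flatten(dObj, prefix=()):
    out = {}
    for k, v in dObj.items():
        if isinstance(v, dict) and v:
            out.update(flatten(v, prefix + (k,)))
        else:
            out[prefix + (k,)] = v
    return out

orgD = {"a": {"x": 1, "y": [1, 2]}, "b": 2}
newD = {"a": {"x": 1, "y": [1, 2, 3]}, "c": 9}
fOrg, fNew = flatten(orgD), flatten(newD)
added = [".".join(k) for k in set(fNew) - set(fOrg)]      # ['c']
removed = [".".join(k) for k in set(fOrg) - set(fNew)]    # ['b']
changed = {".".join(k): {"from": fOrg[k], "to": fNew[k]}
           for k in set(fOrg) & set(fNew) if fOrg[k] != fNew[k]}  # {'a.y': {'from': [1, 2], 'to': [1, 2, 3]}}
print(added, removed, changed)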
Example #28
 def __init__(self, **kwargs):
     self.__fileU = FileUtil(**kwargs)
Example #29
class IoUtil(object):
    def __init__(self, **kwargs):
        self.__fileU = FileUtil(**kwargs)

    def serialize(self, filePath, myObj, fmt="pickle", **kwargs):
        """Public method to serialize format appropriate objects

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            fmt (str, optional): one of ['mmcif', 'mmcif-dict', 'json', 'list', 'text-dump', 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods -

        Returns:
            bool: status of serialization operation; True for success or False otherwise

        """
        ret = False
        fmt = str(fmt).lower()
        ret = self.__fileU.mkdirForFile(filePath)
        if not ret:
            return ret
        if fmt in ["mmcif"]:
            ret = self.__serializeMmCif(filePath, myObj, **kwargs)
        elif fmt in ["json"]:
            ret = self.__serializeJson(filePath, myObj, **kwargs)
        elif fmt in ["pickle"]:
            ret = self.__serializePickle(filePath, myObj, **kwargs)
        elif fmt in ["list"]:
            ret = self.__serializeList(filePath,
                                       myObj,
                                       enforceAscii=True,
                                       **kwargs)
        elif fmt in ["mmcif-dict"]:
            ret = self.__serializeMmCifDict(filePath, myObj, **kwargs)
        elif fmt in ["text-dump"]:
            ret = self.__textDump(filePath, myObj, **kwargs)
        elif fmt in ["fasta"]:
            ret = self.__serializeFasta(filePath, myObj, **kwargs)
        elif fmt in ["csv"]:
            ret = self.__serializeCsv(filePath, myObj, **kwargs)
        else:
            logger.error("Unsupported serialization format %r", fmt)
            ret = False

        return ret
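
Assuming the IoUtil class in this example is in scope, a minimal usage sketch might look like the following (the paths and data are hypothetical, not part of the library):

# Hypothetical usage sketch for serialize(); mkdirForFile() creates the output
# directory before the format-specific worker writes the file.
ioU = IoUtil()
data = {"alpha": 1, "beta": [1, 2, 3]}
okJson = ioU.serialize("./test-output/example.json", data, fmt="json", indent=2)
okPic = ioU.serialize("./test-output/example.pic", data, fmt="pickle")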

    def deserialize(self, filePath, fmt="pickle", **kwargs):
        """Public method to deserialize objects in supported formats.

        Args:
            filePath (str): local file path
            fmt (str, optional): one of ['mmcif', 'mmcif-dict', 'json', 'list', 'fasta', 'csv', 'tdd', 'xml', 'pickle' (default)]
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            object: deserialized object data

        """
        fmt = str(fmt).lower()
        if fmt in ["mmcif"]:
            ret = self.__deserializeMmCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["json"]:
            ret = self.__deserializeJson(filePath, **kwargs)  # type: ignore
        elif fmt in ["pickle"]:
            ret = self.__deserializePickle(filePath, **kwargs)  # type: ignore
        elif fmt in ["list"]:
            ret = self.__deserializeList(filePath, enforceAscii=True,
                                         **kwargs)  # type: ignore
        elif fmt in ["mmcif-dict"]:
            ret = self.__deserializeMmCifDict(filePath,
                                              **kwargs)  # type: ignore
        elif fmt in ["fasta"]:
            ret = self.__deserializeFasta(filePath, **kwargs)  # type: ignore
        # elif fmt in ["vrpt-xml-to-cif"]:
        #    ret = self.__deserializeVrptToCif(filePath, **kwargs)  # type: ignore
        elif fmt in ["csv", "tdd"]:
            delimiter = kwargs.get("csvDelimiter",
                                   "," if fmt == "csv" else "\t")
            ret = self.__deserializeCsv(filePath,
                                        delimiter=delimiter,
                                        **kwargs)  # type: ignore
        elif fmt in ["xml"]:
            ret = self.__deserializeXml(filePath, **kwargs)  # type: ignore
        else:
            ret = None  # type: ignore

        return ret
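
Continuing the hypothetical sketch above, deserialize() dispatches on the same fmt value, so the files round-trip back to equivalent objects (JSON objects are returned as OrderedDict instances because of the object_pairs_hook used by the JSON worker below):

# Hypothetical continuation of the serialize() sketch above.
dJson = ioU.deserialize("./test-output/example.json", fmt="json")
dPic = ioU.deserialize("./test-output/example.pic", fmt="pickle")
assert dJson == data and dPic == data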

    def __sliceInChunks(self, myList, numChunks):
        mc = min(len(myList), numChunks)
        chunkSize = int(len(myList) / mc)
        if len(myList) % mc:
            chunkSize += 1
        for i in range(0, len(myList), chunkSize):
            yield myList[i:i + chunkSize]
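
To make the chunking arithmetic concrete: for 10 items split into 3 parts the chunk size works out to 4 (3 plus 1 for the remainder), so the slices have lengths 4, 4 and 2. A standalone illustration:

# Worked illustration of the slicing performed by __sliceInChunks above.
items = list(range(10))
chunks = [items[i:i + 4] for i in range(0, 10, 4)]
# chunks -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]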

    def serializeInParts(self,
                         filePath,
                         myObj,
                         numParts,
                         fmt="json",
                         **kwargs):
        """Public method to serialize format appropriate (json, pickle) objects in multiple parts

        Args:
            filePath (str): local file path
            myObj (object): format appropriate object to be serialized
            numParts (int): divide the data into numParts segments (parts are written as <basename>_part_<n><ext>)
            fmt (str, optional): one of 'json' or 'pickle'. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            bool: True for success or False otherwise
        """
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return False
        pth, fn = os.path.split(filePath)
        self.__fileU.mkdirForFile(pth)
        bn, ext = os.path.splitext(fn)
        ret = True
        if isinstance(myObj, list):
            for ii, subList in enumerate(self.__sliceInChunks(myObj,
                                                              numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp, subList, fmt=fmt, **kwargs)
                ret = ret and ok
        elif isinstance(myObj, dict):
            for ii, keyList in enumerate(
                    self.__sliceInChunks(list(myObj.keys()), numParts)):
                fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
                ok = self.serialize(fp,
                                    OrderedDict([(k, myObj[k])
                                                 for k in keyList]),
                                    fmt=fmt,
                                    **kwargs)
                ret = ret and ok
        else:
            logger.error("Unsupported data type for serialization in parts")
            ret = False
        #
        return ret
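
A hedged sketch of writing in parts (hypothetical path and data, reusing the ioU instance from the earlier sketch): a 30-key dictionary written with numParts=3 produces example_part_1.json, example_part_2.json and example_part_3.json next to the supplied path, each holding a third of the keys.

# Hypothetical sketch for serializeInParts().
bigD = {"k%03d" % i: i for i in range(30)}
okParts = ioU.serializeInParts("./test-output/example.json", bigD, numParts=3, fmt="json")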

    def deserializeInParts(self, filePath, numParts, fmt="json", **kwargs):
        """Public method to deserialize objects in supported formats from multiple parts

        Args:
            filePath (str): local file path
            numParts (int): reconstruct the data object from numParts segments (a falsy value counts the part files via glob)
            fmt (str, optional): one of 'json' or 'pickle'. Defaults to 'json'.
            **kwargs: additional keyword arguments passed to worker methods

        Returns:
            object: deserialized object data
        """
        rObj = None
        if fmt not in ["json", "pickle"]:
            logger.error("Unsupported format for %s", fmt)
            return rObj
        #
        pth, fn = os.path.split(filePath)
        bn, ext = os.path.splitext(fn)
        if not numParts:
            fp = os.path.join(pth, bn + "_part_*" + ext)
            numParts = len(glob.glob(fp))
        #
        for ii in range(numParts):
            fp = os.path.join(pth, bn + "_part_%d" % (ii + 1) + ext)
            tObj = self.deserialize(fp, fmt=fmt, **kwargs)
            if isinstance(tObj, list):
                if not rObj:
                    rObj = []
                rObj.extend(tObj)
            elif isinstance(tObj, dict):
                if not rObj:
                    rObj = OrderedDict()
                rObj.update(tObj)
            else:
                logger.error(
                    "Unsupported data type for deserialization in parts")
        return rObj
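
Reading the parts back (continuing the hypothetical sketch): a falsy numParts lets the method count the example_part_*.json files itself via glob before merging them into a single OrderedDict.

# Hypothetical continuation of the serializeInParts() sketch above.
rebuiltD = ioU.deserializeInParts("./test-output/example.json", numParts=0, fmt="json")
assert rebuiltD == bigD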

    def exists(self, filePath, mode=os.R_OK):
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        return self.__fileU.remove(pth)

    def __deserializeFasta(self, filePath, **kwargs):
        try:
            commentStyle = kwargs.get("commentStyle", "uniprot")
            fau = FastaUtil()
            return fau.readFasta(filePath, commentStyle=commentStyle)
        except Exception as e:
            logger.error("Unable to deserialize %r %r ", filePath, str(e))
        return {}

    def __serializeFasta(self, filePath, myObj, **kwargs):
        try:
            maxLineLength = int(kwargs.get("maxLineLength", 70))
            makeComment = kwargs.get("makeComment", False)
            fau = FastaUtil()
            ok = fau.writeFasta(filePath,
                                myObj,
                                maxLineLength=maxLineLength,
                                makeComment=makeComment)
            return ok
        except Exception as e:
            logger.error("Unable to serialize FASTA file %r  %r", filePath,
                         str(e))
        return False

    def __textDump(self, filePath, myObj, **kwargs):
        try:
            indent = kwargs.get("indent", 1)
            width = kwargs.get("width", 120)
            sOut = pprint.pformat(myObj, indent=indent, width=width)
            with open(filePath, "w") as ofh:
                ofh.write("\n%s\n" % sOut)
            return True
        except Exception as e:
            logger.error("Unable to dump to %r  %r", filePath, str(e))
        return False

    def __serializePickle(self, filePath, myObj, **kwargs):
        try:
            pickleProtocol = kwargs.get("pickleProtocol",
                                        pickle.DEFAULT_PROTOCOL)

            with open(filePath, "wb") as outfile:
                pickle.dump(myObj, outfile, pickleProtocol)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False

    def __deserializePickle(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        try:
            if sys.version_info[0] > 2:
                encoding = kwargs.get("encoding", "ASCII")
                errors = kwargs.get("errors", "strict")
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile,
                                       encoding=encoding,
                                       errors=errors)
            else:
                with open(filePath, "rb") as outfile:
                    return pickle.load(outfile)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __serializeJson(self, filePath, myObj, **kwargs):
        """Internal method to serialize the input object as JSON.  An encoding
        helper class is included to handle selected python data types (e.g., datetime)
        """
        indent = kwargs.get("indent", 0)
        enforceAscii = kwargs.get("enforceAscii", True)
        try:
            if enforceAscii:
                with open(filePath, "w") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            else:
                with io.open(filePath, "w", encoding="utf-8") as outfile:
                    json.dump(myObj,
                              outfile,
                              indent=indent,
                              cls=JsonTypeEncoder,
                              ensure_ascii=enforceAscii)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r  %r", filePath, str(e))
        return False
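
JsonTypeEncoder is imported elsewhere and is not shown in this example; judging from the docstring above, it extends the stock JSON encoder to handle values such as datetime objects. The sketch below is an assumption about the general shape of such a helper, not the actual class:

import datetime
import json

class DateTimeAwareEncoder(json.JSONEncoder):
    # Illustrative stand-in: render datetime/date values as ISO-8601 strings and
    # defer everything else to the base class (which raises TypeError).
    def default(self, o):
        if isinstance(o, (datetime.datetime, datetime.date)):
            return o.isoformat()
        return super(DateTimeAwareEncoder, self).default(o)

# json.dumps({"ts": datetime.datetime(2021, 1, 2, 3, 4, 5)}, cls=DateTimeAwareEncoder)
# -> '{"ts": "2021-01-02T03:04:05"}'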

    def __deserializeJson(self, filePath, **kwargs):
        myDefault = kwargs.get("default", {})
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
                else:
                    # Py2 handling of non-ascii encodings is problematic
                    # with gzip.open(filePath, "rb") as csvFile:
                    #    oL = self.__csvReader(csvFile, rowFormat, delimiter)
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as inpFile:
                        return json.load(inpFile,
                                         object_pairs_hook=OrderedDict)
            else:
                with open(filePath, "r") as inpFile:
                    return json.load(inpFile, object_pairs_hook=OrderedDict)
        except Exception as e:
            logger.warning("Unable to deserialize %r %r", filePath, str(e))
        return myDefault

    def __hasMinSize(self, pth, minSize):
        try:
            return os.path.getsize(pth) >= minSize
        except Exception:
            return False

    def __deserializeMmCif(self, locator, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            minSize = kwargs.get("minSize", 5)
            #
            if self.__fileU.isLocal(locator):
                if minSize >= 0 and not self.__hasMinSize(locator, minSize):
                    logger.warning("Minimum file size not satisfied for: %r",
                                   locator)
                myIo = IoAdapter(raiseExceptions=raiseExceptions,
                                 useCharRefs=useCharRefs)
                containerList = myIo.readFile(
                    locator, enforceAscii=enforceAscii,
                    outDirPath=workPath)  # type: ignore
            else:
                # myIo = IoAdapterPy(raiseExceptions=raiseExceptions, useCharRefs=useCharRefs)
                # containerList = myIo.readFile(locator, enforceAscii=enforceAscii, outDirPath=workPath)
                containerList = self.__deserializeMmCifRemote(
                    locator, useCharRefs, enforceAscii, workPath)

        except Exception as e:
            logger.error("Failing for %s with %s", locator, str(e))
        return containerList

    @retry((requests.exceptions.RequestException),
           maxAttempts=3,
           delaySeconds=1,
           multiplier=2,
           defaultValue=[],
           logger=logger)
    def __deserializeMmCifRemote(self, locator, useCharRefs, enforceAscii,
                                 workPath):
        containerList = []
        try:
            myIo = IoAdapterPy(raiseExceptions=True, useCharRefs=useCharRefs)
            containerList = myIo.readFile(locator,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            raise e
        return containerList

    def __serializeMmCif(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapter(raiseExceptions=raiseExceptions,
                             useCharRefs=useCharRefs)
            if filePath.endswith(".gz") and workPath:
                rfn = "".join(
                    random.choice(string.ascii_uppercase + string.digits)
                    for _ in range(10))
                tPath = os.path.join(workPath, rfn)
                ret = myIo.writeFile(tPath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
                ret = self.__fileU.compress(tPath,
                                            filePath,
                                            compressType="gzip")
            else:
                ret = myIo.writeFile(filePath,
                                     containerList=containerList,
                                     enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __deserializeMmCifDict(self, filePath, **kwargs):
        """ """
        try:
            containerList = []
            workPath = kwargs.get("workPath", None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            containerList = myIo.readFile(filePath,
                                          enforceAscii=enforceAscii,
                                          outDirPath=workPath)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return containerList

    def __serializeMmCifDict(self, filePath, containerList, **kwargs):
        """ """
        try:
            ret = False
            # workPath = kwargs.get('workPath', None)
            enforceAscii = kwargs.get("enforceAscii", True)
            raiseExceptions = kwargs.get("raiseExceptions", True)
            useCharRefs = kwargs.get("useCharRefs", True)
            #
            myIo = IoAdapterPy(raiseExceptions=raiseExceptions,
                               useCharRefs=useCharRefs)
            ret = myIo.writeFile(filePath,
                                 containerList=containerList,
                                 enforceAscii=enforceAscii)
        except Exception as e:
            logger.error("Failing for %s with %s", filePath, str(e))
        return ret

    def __serializeList(self, filePath, aList, enforceAscii=True, **kwargs):
        """ """

        try:
            _ = kwargs
            if enforceAscii:
                encoding = "ascii"
            else:
                encoding = "utf-8"
            #
            if sys.version_info[0] > 2:
                with open(filePath, "w") as ofh:
                    if enforceAscii:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                    else:
                        for st in aList:
                            ofh.write("%s\n" % st)
            else:
                if enforceAscii:
                    with io.open(filePath, "w", encoding=encoding) as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st.encode(
                                "ascii", "xmlcharrefreplace").decode("ascii"))
                else:
                    with open(filePath, "wb") as ofh:
                        for st in aList:
                            ofh.write("%s\n" % st)
            return True
        except Exception as e:
            logger.error("Unable to serialize %r %r", filePath, str(e))
        return False

    def __processList(self, ifh, enforceAscii=True, **kwargs):
        uncomment = kwargs.get("uncomment", True)
        aList = []
        for line in ifh:
            if enforceAscii:
                pth = line[:-1].encode("ascii",
                                       "xmlcharrefreplace").decode("ascii")
            else:
                pth = line[:-1]
            if not pth or (uncomment and pth.startswith("#")):
                continue
            aList.append(pth)
        return aList

    def __deserializeList(self,
                          filePath,
                          enforceAscii=True,
                          encodingErrors="ignore",
                          **kwargs):
        aList = []
        _ = kwargs
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding="utf-8-sig",
                                   errors=encodingErrors) as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii,
                                                   **kwargs)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    # For Py2, the commented approach below is problematic for non-ascii data
                    # with gzip.open(filePath, "rb") as ifh:
                    #    aList = self.__processList(ifh, enforceAscii=enforceAscii)
                    with io.open(tPath, encoding="utf-8-sig",
                                 errors="ignore") as ifh:
                        aList = self.__processList(ifh,
                                                   enforceAscii=enforceAscii)
            else:
                with io.open(filePath, encoding="utf-8-sig",
                             errors="ignore") as ifh:
                    aList = self.__processList(ifh,
                                               enforceAscii=enforceAscii,
                                               **kwargs)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(aList))
        return aList

    def __csvReader(self, csvFile, rowFormat, delimiter, uncomment=True):
        oL = []

        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        if rowFormat == "dict":
            if uncomment:
                reader = csv.DictReader(uncommentFilter(csvFile),
                                        delimiter=delimiter)
            else:
                reader = csv.DictReader(csvFile, delimiter=delimiter)
            for rowD in reader:
                oL.append(rowD)
        elif rowFormat == "list":
            if uncomment:
                reader = csv.reader(uncommentFilter(csvFile),
                                    delimiter=delimiter)
            else:
                reader = csv.reader(csvFile, delimiter=delimiter)
            for rowL in reader:
                oL.append(rowL)
        return oL

    def deserializeCsvIter(self,
                           filePath,
                           delimiter=",",
                           rowFormat="dict",
                           encodingErrors="ignore",
                           uncomment=True,
                           **kwargs):
        """Return an iterator to input CSV format file.

        Args:
            filePath (str): input file path
            delimiter (str, optional): CSV delimiter. Defaults to ",".
            rowFormat (str, optional): format for each processed row ("list" or "dict"). Defaults to "dict".
            encodingErrors (str, optional): treatment of encoding errors. Defaults to "ignore".
            uncomment (bool, optional): flag to ignore leading comments. Defaults to True.

        Returns:
            (iterator): iterator for rowwise access to processed CSV data
        """
        encoding = kwargs.get("encoding", "utf-8-sig")
        maxInt = sys.maxsize
        csv.field_size_limit(maxInt)
        try:
            if filePath[-3:] == ".gz":
                with gzip.open(filePath,
                               "rt",
                               encoding=encoding,
                               errors=encodingErrors) as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        yield row
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    startIt = itertools.dropwhile(
                        lambda x: x.startswith("#"),
                        csvFile) if uncomment else csvFile
                    if rowFormat == "dict":
                        reader = csv.DictReader(startIt, delimiter=delimiter)
                    elif rowFormat == "list":
                        reader = csv.reader(startIt, delimiter=delimiter)
                    for row in reader:
                        # if uncomment and row.startswith("#"):
                        #    continue
                        yield row
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))

    def __deserializeCsv(self,
                         filePath,
                         delimiter=",",
                         rowFormat="dict",
                         encodingErrors="ignore",
                         uncomment=True,
                         **kwargs):
        oL = []
        encoding = kwargs.get("encoding", "utf-8-sig")
        try:
            if filePath[-3:] == ".gz":
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
                else:
                    # Py2 handling of non-ascii encodings is problematic
                    # with gzip.open(filePath, "rb") as csvFile:
                    #    oL = self.__csvReader(csvFile, rowFormat, delimiter)
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 newline="",
                                 encoding=encoding,
                                 errors="ignore") as csvFile:
                        oL = self.__csvReader(csvFile,
                                              rowFormat,
                                              delimiter,
                                              uncomment=uncomment)
            else:
                with io.open(filePath,
                             newline="",
                             encoding=encoding,
                             errors="ignore") as csvFile:
                    oL = self.__csvReader(csvFile,
                                          rowFormat,
                                          delimiter,
                                          uncomment=uncomment)

            return oL
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        logger.debug("Reading list length %d", len(oL))
        return oL

    def __serializeCsv(self, filePath, rowDictList, fieldNames=None, **kwargs):
        """ """
        _ = kwargs
        try:
            wD = {}
            ret = False
            fNames = fieldNames if fieldNames else list(rowDictList[0].keys())
            # with io.open(filePath, 'w', newline='') as csvFile:
            with open(filePath, "w") as csvFile:
                writer = csv.DictWriter(csvFile, fieldnames=fNames)
                writer.writeheader()
                for ii, rowDict in enumerate(rowDictList):
                    try:
                        wD = {k: v for k, v in rowDict.items() if k in fNames}
                        writer.writerow(wD)
                    except Exception as e:
                        logger.error(
                            "Skipping bad CSV record %d wD %r rowDict %r with %s",
                            ii + 1, wD, rowDict, str(e))
                        continue

            ret = True
        except Exception as e:
            logger.error("Failing for %s : %r with %s", filePath, wD, str(e))
        return ret

    def __csvEncoder(self,
                     csvData,
                     encoding="utf-8-sig",
                     encodingErrors="ignore"):
        """Handle encoding issues for gzipped data in Py2. (beware of the BOM chars)

        Args:
            csvData (text lines): uncompressed data from gzip open
            encoding (str, optional): character encoding. Defaults to "utf-8-sig".
            encodingErrors (str, optional): error treatment. Defaults to "ignore".
        """
        for line in csvData:
            yield line.decode("utf-8-sig", errors=encodingErrors).encode(
                encoding, errors=encodingErrors)

    def __deserializeXmlPrev(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        _ = kwargs
        tree = None
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz":
                with gzip.open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            else:
                with open(filePath, mode="rb") as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree

    def __testGzip(self, filePath):
        ok = True
        with gzip.open(filePath, "r") as fh:
            try:
                fh.read(1)
            except Exception:
                # A truncated or non-gzip file (including gzip.BadGzipFile on Py3.8+) fails here
                ok = False
        logger.debug("Gzip file check %r", ok)
        return ok

    def __deserializeXml(self, filePath, **kwargs):
        """Read the input XML file path and return an ElementTree data object instance.

        Args:
            filePath (str): input XML file path

        Returns:
            object: instance of an ElementTree tree object
        """
        tree = None
        encoding = kwargs.get("encoding", "utf-8-sig")
        encodingErrors = kwargs.get("encodingErrors", "ignore")
        #
        try:
            logger.debug("Parsing XML path %s", filePath)
            if filePath[-3:] == ".gz" and self.__testGzip(filePath):
                if sys.version_info[0] > 2:
                    with gzip.open(filePath,
                                   "rt",
                                   encoding=encoding,
                                   errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
                else:
                    tPath = self.__fileU.uncompress(filePath, outputDir=None)
                    with io.open(tPath,
                                 encoding=encoding,
                                 errors=encodingErrors) as ifh:
                        tV = time.time()
                        tree = ET.parse(ifh)
            else:
                with io.open(filePath,
                             encoding=encoding,
                             errors=encodingErrors) as ifh:
                    tV = time.time()
                    tree = ET.parse(ifh)
            logger.debug("Parsed %s in %.2f seconds", filePath,
                         time.time() - tV)
        except Exception as e:
            logger.error("Unable to deserialize %r %s", filePath, str(e))
        #
        return tree
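
The XML path returns an xml.etree.ElementTree.ElementTree instance (or None on a parse failure), so standard ElementTree navigation applies. A hedged sketch, reusing the ioU instance from the earlier sketch (the path and tag name are hypothetical):

# Hypothetical usage sketch for deserialize(fmt="xml").
tree = ioU.deserialize("./test-data/example.xml", fmt="xml")
if tree is not None:
    root = tree.getroot()
    entries = root.findall(".//entry")  # ".//entry" is an illustrative XPath
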
    def __reload(self,
                 dirPath,
                 useCache=False,
                 imgtDumpUrl=None,
                 testList=None,
                 maxCount=None):
        imgtD = {}
        startTime = time.time()

        fU = FileUtil()
        fU.mkdir(dirPath)
        #
        imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json")
        #
        logger.info("useCache %r imgtFeaturePath %r", useCache, imgtDataPath)
        if useCache and self.__mU.exists(imgtDataPath):
            imgtD = self.__mU.doImport(imgtDataPath, fmt="json")
            self.__version = imgtD["version"]
        else:
            imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz"
            imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE"
            imgtDumpFileName = fU.getFileName(imgtDumpUrl)
            imgtDumpPath = os.path.join(dirPath, imgtDumpFileName)
            imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt")
            _, fn = os.path.split(imgtDumpUrl)
            imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4])
            #
            logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath)
            ok1 = fU.get(imgtDumpUrl, imgtDumpPath)
            ok2 = fU.get(imgtReadmeUrl, imgtReleasePath)
            fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath)
            logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1
                        and ok2,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
            # ---
            readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list")
            self.__version = readmeLines[0].strip() if readmeLines else None
            logger.info("IMGT version %r", self.__version)
            # ---
            chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath,
                                                        maxCount=maxCount,
                                                        testList=testList)
            # ---
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            if testList:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD,
                    "raw": rawD
                }
            else:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD
                }
            ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3)
            logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)",
                        ok, time.strftime("%Y %m %d %H:%M:%S",
                                          time.localtime()),
                        time.time() - startTime)
        return imgtD