Example #1
 def __rebuildCache(self, urlTargetPfam, urlTargetPfamFB, dirPath,
                    useCache):
     pfamD = {}
     fmt = "json"
     ext = fmt if fmt == "json" else "pic"
     pfamDataPath = os.path.join(dirPath, "pfam-data.%s" % ext)
     #
     logger.debug("Using cache data path %s", dirPath)
     self.__mU.mkdir(dirPath)
     #
     if useCache and self.__mU.exists(pfamDataPath):
         pfamD = self.__mU.doImport(pfamDataPath, fmt=fmt)
         logger.debug("Pfam data length %d", len(pfamD))
     elif not useCache:
         # ------
         fU = FileUtil()
         logger.info("Fetch data from source %s in %s", urlTargetPfam,
                     dirPath)
         fp = os.path.join(dirPath, fU.getFileName(urlTargetPfam))
         ok = fU.get(urlTargetPfam, fp)
         if not ok:
             fp = os.path.join(dirPath, fU.getFileName(urlTargetPfamFB))
             ok = fU.get(urlTargetPfamFB, fp)
             logger.info("Fetch data fallback fetch status is %r", ok)
         pfamD = self.__getPfamIndex(fp)
         ok = self.__mU.doExport(pfamDataPath, pfamD, fmt=fmt)
         logger.info("Caching %d in %s status %r", len(pfamD), pfamDataPath,
                     ok)
         # ------
     #
     return pfamD
Example #2
 def __reloadEntryIds(self,
                      urlTarget,
                      urlFallbackTarget,
                      dirPath,
                      useCache=True):
     idD = {}
     tdL = []
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     fp = os.path.join(dirPath, fn)
     self.__mU.mkdir(dirPath)
     #
     if useCache and self.__mU.exists(fp):
         tdL = self.__mU.doImport(fp, fmt="json")
         logger.debug("Reading cached IDs list (%d)", len(tdL))
     else:
         logger.info("Fetch ID list from %s", urlTarget)
         ok = fU.get(urlTarget, fp)
         if not ok:
             ok = fU.get(urlFallbackTarget, fp)
         #
         if ok:
             tdL = self.__mU.doImport(fp, fmt="json")
     #
     for td in tdL:
         for k, v in td.items():
             try:
                 idD[k] = datetime.datetime.fromisoformat(v)
             except Exception as e:
                 logger.error("Date processing failing for %r %r with %s",
                              k, v, str(e))
     #
     sTupL = sorted(idD.items(), key=lambda item: item[1])
     return {k: v for k, v in sTupL}
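
Examples #1 and #2 share a recurring idiom: serve data from the local cache when permitted, otherwise fetch from the primary URL and fall back to a mirror copy. Below is a minimal distilled sketch of that idiom, assuming the rcsb.utils.io FileUtil and MarshalUtil classes behave as used above (the helper name is hypothetical):

import os

from rcsb.utils.io.FileUtil import FileUtil
from rcsb.utils.io.MarshalUtil import MarshalUtil


def loadJsonWithFallback(primaryUrl, fallbackUrl, dirPath, useCache=True):
    # Serve the cached copy when present; otherwise fetch, with a fallback mirror.
    fU = FileUtil()
    mU = MarshalUtil(workPath=dirPath)
    cachePath = os.path.join(dirPath, fU.getFileName(primaryUrl))
    mU.mkdir(dirPath)
    if useCache and mU.exists(cachePath):
        return mU.doImport(cachePath, fmt="json")
    ok = fU.get(primaryUrl, cachePath)
    if not ok:
        # Primary fetch failed -- try the fallback copy.
        ok = fU.get(fallbackUrl, cachePath)
    return mU.doImport(cachePath, fmt="json") if ok else {}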
Example #3
 def __reloadGlycans(self, baseUrl, fallbackUrl, dirPath, useCache=True):
     gD = {}
     logger.debug("Using dirPath %r", dirPath)
     self.__mU.mkdir(dirPath)
     #
     myDataPath = os.path.join(dirPath, "glygen-glycan-list.json")
     if useCache and self.__mU.exists(myDataPath):
         gD = self.__mU.doImport(myDataPath, fmt="json")
         logger.debug("GlyGen glycan data length %d", len(gD))
     elif not useCache:
         logger.debug(
             "Fetch GlyGen glycan data from primary data source %s",
             baseUrl)
         endPoint = os.path.join(baseUrl, "glycan_masterlist.csv")
         #
         logger.info("Fetch GlyGen glycan data from primary data source %s",
                     endPoint)
         rawPath = os.path.join(dirPath, "glycan_masterlist.csv")
         fU = FileUtil()
         ok = fU.get(endPoint, rawPath)
         logger.debug("Fetch GlyGen glycan data status %r", ok)
         if not ok:
             endPoint = os.path.join(fallbackUrl, "glycan_masterlist.csv")
             ok = fU.get(endPoint, rawPath)
             logger.info("Fetch fallback GlyGen glycan data status %r", ok)
         #
         if ok:
             gD = self.__parseGlycanList(rawPath)
             ok = self.__mU.doExport(myDataPath, gD, fmt="json")
             logger.info("Exported GlyGen glycan list (%d) (%r) %s",
                         len(gD), ok, myDataPath)
         #
     return gD
Example #4
 def __reload(self, dirPath, baseVersion, useCache, **kwargs):
     startTime = time.time()
     mU = MarshalUtil(workPath=dirPath)
     chemblDbUrl = kwargs.get(
         "ChEMBLDbUrl",
         "ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/")
     ok = False
     fU = FileUtil()
     fU.mkdir(dirPath)
     #
     # ChEMBL current version <baseVersion>,...
     # template:  chembl_<baseVersion>.fa.gz
     #
     targetFileName = "chembl_" + str(baseVersion) + ".fa.gz"
     mappingFileName = "chembl_uniprot_mapping.txt"
     #
     chemblTargetPath = os.path.join(dirPath, targetFileName)
     chemblMappingPath = os.path.join(dirPath, mappingFileName)
     mappingFilePath = os.path.join(dirPath, "chembl_uniprot_mapping.json")
     #
     mapD = {}
     if useCache and fU.exists(mappingFilePath):
         logger.info("useCache %r using %r and %r and %r", useCache,
                     chemblTargetPath, chemblMappingPath, mappingFilePath)
         mapD = mU.doImport(mappingFilePath, fmt="json")
     else:
         # Get the ChEMBL UniProt mapping file
         url = os.path.join(chemblDbUrl, mappingFileName)
         ok = fU.get(url, chemblMappingPath)
         logger.info("Fetched %r url %s path %s", ok, url,
                     chemblMappingPath)
         logger.info("Reading ChEMBL mapping file path %s", mappingFilePath)
         rowL = mU.doImport(chemblMappingPath, fmt="tdd", rowFormat="list")
         for row in rowL:
             mapD[row[0]] = (row[1], row[2], row[3])
         ok = mU.doExport(mappingFilePath, mapD, fmt="json")
         logger.info("Processed mapping path %s (%d) %r", mappingFilePath,
                     len(mapD), ok)
         #
         # Get the target FASTA files --
         for vers in range(baseVersion, baseVersion + 10):
             logger.info("Now fetching version %r", vers)
             self.__version = vers
             targetFileName = "chembl_" + str(vers) + ".fa.gz"
             chemblTargetPath = os.path.join(dirPath,
                                             "chembl_targets_raw.fa.gz")
             url = os.path.join(chemblDbUrl, targetFileName)
             ok = fU.get(url, chemblTargetPath)
             logger.info("Fetched %r url %s path %s", ok, url,
                         chemblTargetPath)
             if ok:
                 break
     #
     logger.info("Completed reload at %s (%.4f seconds)",
                 time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                 time.time() - startTime)
     #
     return mapD
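
The version-probing loop in Example #4 generalizes to a small helper: try successive ChEMBL release numbers until a fetch succeeds (FileUtil.get() returns False on failure, as used above). A minimal sketch with a hypothetical helper name:

import os

from rcsb.utils.io.FileUtil import FileUtil


def probeLatestChemblVersion(chemblDbUrl, baseVersion, dirPath, span=10):
    # Probe successive release numbers until a target FASTA archive is fetched.
    fU = FileUtil()
    for vers in range(baseVersion, baseVersion + span):
        url = os.path.join(chemblDbUrl, "chembl_%d.fa.gz" % vers)
        if fU.get(url, os.path.join(dirPath, "chembl_targets_raw.fa.gz")):
            return vers
    return None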
Example #5
 def __reloadGlycoproteins(self,
                           baseUrl,
                           fallbackUrl,
                           dirPath,
                           useCache=True):
     gD = {}
     logger.debug("Using dirPath %r", dirPath)
     self.__mU.mkdir(dirPath)
     #
     myDataPath = os.path.join(dirPath, "glygen-glycoprotein-list.json")
     if useCache and self.__mU.exists(myDataPath):
         gD = self.__mU.doImport(myDataPath, fmt="json")
         logger.debug("GlyGen glycoprotein data length %d", len(gD))
     else:
         for fn in [
                 "sarscov1_protein_masterlist.csv",
                 "sarscov2_protein_masterlist.csv",
                 "hcv1b_protein_masterlist.csv",
                 "hcv1a_protein_masterlist.csv",
                 "human_protein_masterlist.csv",
                 "mouse_protein_masterlist.csv",
                 "rat_protein_masterlist.csv",
         ]:
             logger.debug(
                 "Fetch GlyGen glycoprotein data from primary data source %s",
                 baseUrl)
             endPoint = os.path.join(baseUrl, fn)
             #
             logger.debug(
                 "Fetch GlyGen glycoprotein data from primary data source %s",
                 endPoint)
             rawPath = os.path.join(dirPath, fn)
             fU = FileUtil()
             ok = fU.get(endPoint, rawPath)
             logger.debug("Fetch GlyGen glycoprotein data status %r", ok)
             if not ok:
                 endPoint = os.path.join(fallbackUrl, fn)
                 ok = fU.get(endPoint, rawPath)
                 logger.info("Fetch fallback GlyGen data status %r", ok)
             #
             if ok:
                 tD = self.__parseGlycoproteinList(rawPath)
                 gD.update(tD)
         #
         ok = self.__mU.doExport(myDataPath, gD, fmt="json")
         logger.info("Exported GlyGen glycoprotein list (%d) (%r) %s",
                     len(gD), ok, myDataPath)
     #
     return gD
Example #6
    def restoreDependencies(self,
                            url,
                            dirPath,
                            bundleLabel="A",
                            userName=None,
                            pw=None):
        """Restore bundled dependencies from remote storage and unbundle these in the
           current local cache directory.

        Args:
            url (str): remote URL
            dirPath (str): remote directory path on the remote resource
            bundleLabel (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            pw (str, optional): optional access information. Defaults to None.
        """
        try:
            ok = False
            fileU = FileUtil()
            fn = self.__makeBundleFileName(self.__dependFileName,
                                           bundleLabel=bundleLabel)
            if not url:
                remotePath = os.path.join(dirPath, fn)
                ok = fileU.get(remotePath, self.__dependTarFilePath)

            elif url and url.startswith("http://"):
                remotePath = url + os.path.join("/", dirPath, fn)
                ok = fileU.get(remotePath, self.__dependTarFilePath)

            elif url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                ok = sftpU.connect(url[7:], userName, pw=pw, port=22)
                if ok:
                    remotePath = os.path.join(dirPath, fn)
                    ok = sftpU.get(remotePath, self.__dependTarFilePath)
            else:
                logger.error("Unsupported protocol %r", url)
            if ok:
                ok = fileU.unbundleTarfile(self.__dependTarFilePath,
                                           dirPath=self.__cachePath)
            return ok
        except Exception as e:
            logger.exception("For %r %r Failing with %s", url, dirPath, str(e))
            ok = False
        return ok
Example #7
    def fetchBundle(self, localRestoreDirPath, url, remoteDirPath, remoteStashPrefix="A", userName=None, password=None):
        """Restore bundled dependencies from remote storage and unbundle these in the
           current local cache directory.

        Args:
            localRestoreDirPath (str): local restore path
            url (str): remote URL
            remoteDirPath (str): remote directory path on the remote resource
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')
            userName (str, optional): optional access information. Defaults to None.
            password (str, optional): optional access information. Defaults to None.
        """
        try:
            ok = False
            fileU = FileUtil()
            fn = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            if not url:
                remotePath = os.path.join(remoteDirPath, fn)
                if fileU.exists(remotePath):
                    ok = fileU.get(remotePath, self.__localStashTarFilePath)
                else:
                    ok = False
                    logger.warning("Missing bundle file %r", remotePath)

            elif url and (url.startswith("http://") or url.startswith("https://")):
                remotePath = url + os.path.join("/", remoteDirPath, fn)
                ok = fileU.get(remotePath, self.__localStashTarFilePath)

            elif url and url.startswith("sftp://"):
                sftpU = SftpUtil()
                ok = sftpU.connect(url[7:], userName, pw=password, port=22)
                if ok:
                    remotePath = os.path.join(remoteDirPath, fn)
                    ok = sftpU.get(remotePath, self.__localStashTarFilePath)
            else:
                logger.error("Unsupported protocol %r", url)
            if ok:
                ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
            return ok
        except Exception as e:
            logger.exception("For %r %r Failing with %s", url, remoteDirPath, str(e))
            ok = False
        return ok
Example #8
 def __fetchFromSource(self, urlTarget):
     """Fetch the classification names and domain assignments from the ECOD repo."""
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     fp = os.path.join(self.__dirPath, fn)
     if not fU.exists(fp):
         fU.get(urlTarget, fp)
     #
     with open(fp, "r", encoding="utf-8") as ifh:
         line = ifh.readline()
         line = ifh.readline()
         line = ifh.readline()
         ff = line[:-1].split()
         self.__version = ff[-1]
     #
     nmL = self.__mU.doImport(fp, fmt="list", uncomment=True)
     fU.remove(fp)
     #
     return nmL
Example #9
 def __fetchFromBackup(self, urlBackupPath, cathDirPath):
     fn = self.__getCathDomainFileName()
     cathDomainPath = os.path.join(cathDirPath, fn)
     self.__mU.mkdir(cathDirPath)
     #
     backupUrl = urlBackupPath + "/" + fn
     logger.info("Using backup URL %r", backupUrl)
     fU = FileUtil()
     ok = fU.get(backupUrl, cathDomainPath)
     return ok
Example #10
 def __pharosFixture(self):
     try:
         ok = False
         fU = FileUtil()
         srcPath = os.path.join(self.__dataPath, "Pharos")
         dstPath = os.path.join(self.__cachePath, "Pharos-targets")
         for fn in ["drug_activity", "cmpd_activity", "protein"]:
             inpPath = os.path.join(srcPath, fn + ".tdd.gz")
             outPath = os.path.join(dstPath, fn + ".tdd.gz")
             fU.get(inpPath, outPath)
             fU.uncompress(outPath, outputDir=dstPath)
             fU.remove(outPath)
         fU.put(os.path.join(srcPath, "pharos-readme.txt"),
                os.path.join(dstPath, "pharos-readme.txt"))
         ok = True
     except Exception as e:
         logger.exception("Failing with %s", str(e))
         ok = False
     return ok
Example #11
 def __reload(self, urlTarget, urlFallbackTarget, dirPath, useCache=True):
     invD = {}
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     fp = os.path.join(dirPath, fn)
     self.__mU.mkdir(dirPath)
     #
     if useCache and self.__mU.exists(fp):
         invD = self.__mU.doImport(fp, fmt="json")
         logger.debug("Reading cached inventory (%d)", len(invD))
     else:
         logger.info("Fetch inventory from %s", urlTarget)
         ok = fU.get(urlTarget, fp)
         if not ok:
             ok = fU.get(urlFallbackTarget, fp)
         #
         if ok:
             invD = self.__mU.doImport(fp, fmt="json")
     #
     return invD
Example #12
 def __fetchFromBackup(self, urlBackupPath, scopDirPath):
     pyVersion = sys.version_info[0]
     fn = "scop_domains-py%s.pic" % str(pyVersion)
     scopDomainPath = os.path.join(scopDirPath, fn)
     self.__mU.mkdir(scopDirPath)
     #
     backupUrl = urlBackupPath + "/" + fn
     logger.info("Using backup URL %r", backupUrl)
     fU = FileUtil()
     ok = fU.get(backupUrl, scopDomainPath)
     return ok
Example #13
 def __fetchFromBackup(self, fmt="json"):
     urlTarget = "https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets/master/fall_back/SCOP2"
     #
     fn = self.__getAssignmentFileName(fmt=fmt)
     assignmentPath = os.path.join(self.__dirPath, fn)
     urlPath = os.path.join(urlTarget, fn)
     self.__mU.mkdir(self.__dirPath)
     #
     logger.info("Using backup URL %r", urlPath)
     fU = FileUtil()
     ok = fU.get(urlPath, assignmentPath)
     return ok
Example #14
    def __rebuildCache(self, urlTargetIsoLtwa, dirPath, useCache):
        """Rebuild the cache of ISO abbreviation term data

        Args:
            urlTargetIsoLtwa (str): URL for ISO4 LTWA title word abbreviations
            dirPath (str):  cache path
            useCache (bool):  flag to use cached files

        Returns:
            tuple: (dict) title word abbreviations
                   (dict) language conflict dictionary
                   (list) multi-word abbreviation targets

        Notes:
            ISO source file (tab delimited UTF-16LE) is maintained at the ISSN site -
            https://www.issn.org/wp-content/uploads/2013/09/LTWA_20160915.txt
        """
        aD = {}
        mU = MarshalUtil(workPath=dirPath)
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        isoLtwaNamePath = os.path.join(dirPath, "iso-ltwa.%s" % ext)
        logger.debug("Using cache data path %s", dirPath)
        mU.mkdir(dirPath)
        if not useCache:
            for fp in [isoLtwaNamePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and mU.exists(isoLtwaNamePath):
            aD = mU.doImport(isoLtwaNamePath, fmt=fmt)
            logger.debug("Abbreviation name length %d", len(aD["abbrev"]))
        elif not useCache:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetIsoLtwa,
                        dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetIsoLtwa))
            ok = fU.get(urlTargetIsoLtwa, fp)
            aD = self.__getLtwaTerms(dirPath, fp)
            ok = mU.doExport(isoLtwaNamePath, aD, fmt=fmt)
            logger.debug("abbrevD keys %r", list(aD.keys()))
            logger.debug("Caching %d ISO LTWA in %s status %r",
                         len(aD["abbrev"]), isoLtwaNamePath, ok)
        #
        abbrevD = aD["abbrev"] if "abbrev" in aD else {}
        conflictD = aD["conflicts"] if "conflicts" in aD else {}
        multiWordTermL = aD[
            "multi_word_abbrev"] if "multi_word_abbrev" in aD else []
        #
        return abbrevD, conflictD, multiWordTermL
Example #15
    def __rebuildCache(self, urlTargetInterPro, urlTargetInterProFB, urlTargetInterProParent, urlTargetInterProParentFB, dirPath, useCache):
        fmt = "json"
        ext = fmt if fmt == "json" else "pic"
        interProDataPath = os.path.join(dirPath, "interPro-data.%s" % ext)
        #
        logger.debug("Using cache data path %s", dirPath)
        self.__mU.mkdir(dirPath)
        #
        if useCache and self.__mU.exists(interProDataPath):
            rD = self.__mU.doImport(interProDataPath, fmt=fmt)
            interProD = rD["index"]
            interProParentD = rD["parents"]
            logger.debug("InterPro index length %d parent length %d", len(interProD), len(interProParentD))
        else:
            # ------
            fU = FileUtil()
            logger.info("Fetch data from source %s in %s", urlTargetInterPro, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterPro))
            ok = fU.get(urlTargetInterPro, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProFB))
                ok = fU.get(urlTargetInterProFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProD = self.__getInterProIndex(fp)

            logger.info("Caching %d in %s status %r", len(interProD), interProDataPath, ok)
            # ------
            logger.info("Fetch data from source %s in %s", urlTargetInterProParent, dirPath)
            fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParent))
            ok = fU.get(urlTargetInterProParent, fp)
            if not ok:
                fp = os.path.join(dirPath, fU.getFileName(urlTargetInterProParentFB))
                ok = fU.get(urlTargetInterProParentFB, fp)
                logger.info("Fetch data fallback fetch status is %r", ok)
            interProParentD = self.__getInterProParents(fp)
            #
            ok = self.__mU.doExport(interProDataPath, {"index": interProD, "parents": interProParentD}, fmt=fmt)
        #
        return interProD, interProParentD
Example #16
 def __fetchUrl(self, urlTarget, dirPath, useCache=False):
     fU = FileUtil()
     fn = fU.getFileName(urlTarget)
     filePath = os.path.join(dirPath, fn)
     if not (useCache and fU.exists(filePath)):
         startTime = time.time()
         ok2 = fU.get(urlTarget, filePath)
         endTime = time.time()
         if ok2:
             logger.info("Fetched %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
         else:
             logger.error("Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)", urlTarget, filePath, ok2, endTime - startTime)
     #
     return filePath
Example #17
 def __reload(self, dirPath, **kwargs):
     oD = None
     version = None
     startTime = time.time()
     useCache = kwargs.get("useCache", True)
     #
     # CARDDumpUrl = kwargs.get("CARDDumpUrl", "https://card.mcmaster.ca/latest/data/broadstreet-v3.1.0.tar.bz2")
     cardDumpUrl = kwargs.get("CARDDumpUrl",
                              "https://card.mcmaster.ca/latest/data")
     ok = False
     fU = FileUtil()
     cardDumpFileName = "card-data.tar.bz2"
     cardDumpPath = os.path.join(dirPath, cardDumpFileName)
     cardDumpDirPath = os.path.join(dirPath, "dump")
     #
     fU.mkdir(dirPath)
     cardDataPath = os.path.join(dirPath, "card-select-data.json")
     #
     logger.info("useCache %r CARDDumpPath %r", useCache, cardDumpPath)
     if useCache and self.__mU.exists(cardDataPath):
         qD = self.__mU.doImport(cardDataPath, fmt="json")
         version = qD["version"]
         oD = qD["data"]
     else:
         logger.info("Fetching url %s path %s", cardDumpUrl, cardDumpPath)
         ok = fU.get(cardDumpUrl, cardDumpPath)
         fU.mkdir(cardDumpDirPath)
         fU.uncompress(cardDumpPath, outputDir=cardDumpDirPath)
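         # uncompress() yields "card-data.tar"; [:-4] below drops the ".bz2" suffix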
         fU.unbundleTarfile(os.path.join(cardDumpDirPath,
                                         cardDumpFileName[:-4]),
                            dirPath=cardDumpDirPath)
         logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok,
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     time.time() - startTime)
         oD, version = self.__parseCardData(
             os.path.join(cardDumpDirPath, "card.json"))
         tS = datetime.datetime.now().isoformat()
         qD = {"version": version, "created": tS, "data": oD}
         oD = qD["data"]
         ok = self.__mU.doExport(cardDataPath, qD, fmt="json", indent=3)
         logger.info("Export CARD data (%d) status %r", len(oD), ok)
     # ---
     return oD, version
Example #18
    def __processAppendedSections(self,
                                  appendConfigOption,
                                  cachePath,
                                  useCache=True):
        """Fetch and append configuration assets assigned to input configuration option.

        Args:
            appendConfigOption (str): reserved configuration option to hold a list of configuration asset locators
            cachePath (str): path to store cached copies of configuration assets
            useCache (bool, optional): use existing cached configuration assets. Defaults to True.

        Returns:
            bool: True for success or False otherwise
        """
        appendLocL = []
        try:
            ret = True
            appendLocL = self.getList(appendConfigOption,
                                      sectionName=self.__defaultSectionName)
            logger.debug("appendLocL is %r", appendLocL)
            if appendLocL:
                cP = os.path.join(cachePath, "config")
                fU = FileUtil(workPath=cP)
                logger.debug("Fetching append sections from %r", appendLocL)
                for appendLoc in appendLocL:
                    fn = fU.getFileName(appendLoc)
                    fp = os.path.join(cP, fn)
                    okF = True
                    if not (useCache and fU.exists(fp)):
                        # get a fresh copy from source
                        okF = fU.get(appendLoc, fp)
                        logger.debug("Fetched %r to %r", appendLoc, fp)
                    ok = self.appendConfig(fp)
                    ret = ret and ok and okF
        except Exception as e:
            logger.exception("Failing for option %r cachePath %r with %s",
                             appendConfigOption, cachePath, str(e))
            ret = False
        #
        if not ret:
            logger.error("Fetching appended sections failing %r", appendLocL)

        return ret
Example #19
    def __reload(self, urlTarget, dirPath, useCache=True):
        """Reload local cache of mapping resources to support validation report reader and translator.

        Args:
            urlTarget (str): URL for the schema mapping file
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (dict): schema mapping dictionary
        """
        mapD = {}
        #
        mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        mappingFilePath = os.path.join(dirPath, fn)
        mU.mkdir(dirPath)
        #
        # if not useCache:
        #     for fp in [mappingFilePath]:
        #         try:
        #             os.remove(fp)
        #         except Exception:
        #             pass
        # #
        logger.debug("Loading validation mapping data in %s (useCache %r)", fn,
                     useCache)
        if useCache and fU.exists(mappingFilePath):
            mapD = mU.doImport(mappingFilePath, fmt="json")
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        mappingFilePath)
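            # Fetch to a temporary name, then atomically rename into place below.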
            tS = uuid.uuid4().hex
            tP = os.path.join(dirPath, "._" + tS)
            ok = fU.get(urlTarget, tP)
            if ok:
                mapD = mU.doImport(tP, fmt="json")
                os.replace(tP, mappingFilePath)
        return mapD
Example #20
    def __reload(self, urlTarget, dirPath, useCache=True):
        """ Reload input GO OBO ontology file and return a nx graph object.
'
        Returns:
            dictionary[goId] = {'name_list': ... , 'id_list': ... 'depth_list': ... }
        """
        goGraph = None
        #
        # mU = MarshalUtil()
        fU = FileUtil()
        fn = fU.getFileName(urlTarget)
        oboFilePath = os.path.join(dirPath, fn)
        fU.mkdir(dirPath)
        #
        if not useCache:
            for fp in [oboFilePath]:
                try:
                    os.remove(fp)
                except Exception:
                    pass
        #
        if useCache and fU.exists(oboFilePath):
            goGraph = obonet.read_obo(oboFilePath)
        else:
            logger.info("Fetching url %s to resource file %s", urlTarget,
                        oboFilePath)
            ok = fU.get(urlTarget, oboFilePath)
            if ok:
                goGraph = obonet.read_obo(oboFilePath)
        if goGraph:
            logger.info("Reading %d nodes and %d edges", len(goGraph),
                        goGraph.number_of_edges())
        else:
            logger.info("Go graph construction failing")
        #
        return goGraph
Example #21
    def __rebuildCache(self,
                       targetUrl,
                       mapNameL,
                       outDirPath,
                       rawDirPath,
                       fmt="pickle",
                       useCache=True):
        """Fetch the UniProt selected id mapping resource file and extract
        UniProt Acc to  'mapIndex' mapping. Serialize the mapping as required.

        Args:
            targetUrl (str): source URL of the remote index file
            mapNameL (list): list of key mapping names to extract from the index
            outDirPath (str): directory path for raw and processed mapping files
            fmt (str, optional): output format (pickle|json) . Defaults to "pickle".
            useCache (bool, optional): use cached files. Defaults to True.

        Returns:
            dict: od[uniprotId] = mapped value

                idmapping_selected.tab

                1. UniProtKB-AC
                2. UniProtKB-ID
                3. GeneID (EntrezGene)
                4. RefSeq
                5. GI
                6. PDB
                7. GO
                8. UniRef100
                9. UniRef90
                10. UniRef50
                11. UniParc
                12. PIR
                13. NCBI-taxon
                14. MIM
                15. UniGene
                16. PubMed
                17. EMBL
                18. EMBL-CDS
                19. Ensembl
                20. Ensembl_TRS
                21. Ensembl_PRO
                22. Additional PubMed

        """
        startTime = time.time()
        nL = mapNameL
        oD = {}
        try:
            fileU = FileUtil()
            fExt = "pic" if fmt == "pickle" else "json"
            fExt = "tdd" if fmt == "tdd" else fExt
            fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
            mapFileName = fN + "-map." + fExt
            idMapPath = os.path.join(outDirPath, mapFileName)
            mU = MarshalUtil()
            if useCache and mU.exists(idMapPath):
                logger.info("Reading cached serialized file %r", idMapPath)
                if fmt in ["pickle", "json"]:
                    tD = mU.doImport(idMapPath, fmt=fmt)
                    nL = list(set(tD["idNameList"]))
                    oD = tD["uniprotMapD"]
                    logger.info("keys %r", list(oD.keys())[:10])
                    logger.info("nL %r", nL)
                    ok = True
                elif fmt == "tdd":
                    ioU = IoUtil()
                    it = ioU.deserializeCsvIter(idMapPath,
                                                delimiter="\t",
                                                rowFormat="list",
                                                encodingErrors="ignore")
                    tL = next(it, [])
                    nL = tL[1:]
                    if len(nL) == 1:
                        for row in it:
                            oD[row[0]] = row[1]
                    else:
                        for row in it:
                            oD[row[0]] = row[1:]
                    ok = True
            else:
                idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
                if not fileU.exists(idPath):
                    logger.info(
                        "Fetching selected UniProt idmapping data from %r in %r",
                        targetUrl, outDirPath)
                    ok = fileU.get(targetUrl, idPath)
                    if not ok:
                        logger.error("Failed to downlowd %r", targetUrl)
                        return oD
                else:
                    logger.info("Using cached mapping file %r", idPath)
                # ---
                ioU = IoUtil()
                if fmt in ["pickle", "json"]:
                    if len(mapNameL) == 1:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            oD[row[0]] = str(
                                row[self.__mapRecordD[mapNameL[0]] - 1])
                    else:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            for mapName in mapNameL:
                                oD.setdefault(row[0], []).append(
                                    str(row[self.__mapRecordD[mapName] - 1]))
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    ok = mU.doExport(idMapPath, {
                        "idNameList": mapNameL,
                        "uniprotMapD": oD
                    },
                                     fmt=fmt)
                elif fmt == "tdd":
                    #
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    fU = FileUtil()
                    fU.mkdirForFile(idMapPath)
                    colNameL = []
                    colNameL.append("UniProtId")
                    colNameL.extend(mapNameL)
                    with open(idMapPath, "w", encoding="utf-8") as ofh:
                        ofh.write("%s\n" % "\t".join(colNameL))
                        if len(mapNameL) == 1:
                            idx = self.__mapRecordD[mapNameL[0]] - 1
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write("%s\t%s\n" % (row[0], row[idx]))
                        else:
                            idxL = [0]
                            idxL.extend([
                                self.__mapRecordD[mapName] - 1
                                for mapName in mapNameL
                            ])
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write(
                                    "%s\n" %
                                    "\t".join([str(row[idx]) for idx in idxL]))
                            #
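                    # Re-read the tdd file just written by taking the cache branch above.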
                    nL, oD = self.__rebuildCache(targetUrl,
                                                 mapNameL,
                                                 outDirPath,
                                                 rawDirPath,
                                                 fmt=fmt,
                                                 useCache=True)
                    ok = True if nL and oD else False
            logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return nL, oD
Example #22
    def __reload(self,
                 dirPath,
                 useCache=False,
                 imgtDumpUrl=None,
                 testList=None,
                 maxCount=None):
        imgtD = {}
        startTime = time.time()

        fU = FileUtil()
        fU.mkdir(dirPath)
        #
        imgtDataPath = os.path.join(self.__dirPath, "imgt-data.json")
        #
        logger.info("useCache %r imgtFeaturePath %r", useCache, imgtDataPath)
        if useCache and self.__mU.exists(imgtDataPath):
            imgtD = self.__mU.doImport(imgtDataPath, fmt="json")
            self.__version = imgtD["version"]
        else:
            imgtDumpUrl = imgtDumpUrl if imgtDumpUrl else "http://www.imgt.org/download/3Dstructure-DB/IMGT3DFlatFiles.tgz"
            imgtReadmeUrl = "http://www.imgt.org/download/3Dstructure-DB/RELEASE"
            imgtDumpFileName = fU.getFileName(imgtDumpUrl)
            imgtDumpPath = os.path.join(dirPath, imgtDumpFileName)
            imgtReleasePath = os.path.join(dirPath, "IMGT-release.txt")
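            # fn[:-4] strips the ".tgz" suffix; the tarfile unbundles into this directory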
            _, fn = os.path.split(imgtDumpUrl)
            imgtFlatFilePath = os.path.join(self.__dirPath, fn[:-4])
            #
            logger.info("Fetching url %s path %s", imgtDumpUrl, imgtDumpPath)
            ok1 = fU.get(imgtDumpUrl, imgtDumpPath)
            ok2 = fU.get(imgtReadmeUrl, imgtReleasePath)
            fU.unbundleTarfile(imgtDumpPath, dirPath=dirPath)
            logger.info("Completed fetch (%r) at %s (%.4f seconds)", ok1
                        and ok2,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
            # ---
            readmeLines = self.__mU.doImport(imgtReleasePath, fmt="list")
            self.__version = readmeLines[0].strip() if readmeLines else None
            logger.info("IMGT version %r", self.__version)
            # ---
            chainD, rawD = self.__imgtFlatFileProcessor(imgtFlatFilePath,
                                                        maxCount=maxCount,
                                                        testList=testList)
            # ---
            tS = datetime.datetime.now().isoformat()
            # vS = datetime.datetime.now().strftime("%Y-%m-%d")
            if testList:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD,
                    "raw": rawD
                }
            else:
                imgtD = {
                    "version": self.__version,
                    "date": tS,
                    "chains": chainD
                }
            ok = self.__mU.doExport(imgtDataPath, imgtD, fmt="json", indent=3)
            logger.info("Completed flatfile prep (%r) at %s (%.4f seconds)",
                        ok, time.strftime("%Y %m %d %H:%M:%S",
                                          time.localtime()),
                        time.time() - startTime)
        return imgtD
Example #23
    def exportFasta(self, withGaps=False):
        """
        Example:
            The IMGT/GENE-DB FASTA header contains 15 fields separated by '|':

            1. IMGT/LIGM-DB accession number(s)
            2. IMGT gene and allele name
            3. species (may be followed by an "_" and the name of the strain, breed or isolate, if defined)
            4. IMGT gene and allele functionality
            5. exon(s), region name(s), or extracted label(s)
            6. start and end positions in the IMGT/LIGM-DB accession number(s)
            7. number of nucleotides in the IMGT/LIGM-DB accession number(s)
            8. codon start, or 'NR' (not relevant) for non coding labels
            9. +n: number of nucleotides (nt) added in 5' compared to the corresponding label extracted from IMGT/LIGM-DB
            10. +n or -n: number of nucleotides (nt) added or removed in 3' compared to the corresponding label extracted from IMGT/LIGM-DB
            11. +n, -n, and/or nS: number of added, deleted, and/or substituted nucleotides to correct sequencing errors, or 'not corrected' if non corrected sequencing errors
            12. number of amino acids (AA): this field indicates that the sequence is in amino acids
            13. number of characters in the sequence: nt (or AA)+IMGT gaps=total
            14. partial (if it is)
            15. reverse complementary (if it is)

        """
        # --
        fU = FileUtil()
        fU.mkdir(self.__dirPath)
        if withGaps:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithGaps-F+ORF+inframeP"
        else:
            imgtTargetUrl = "http://www.imgt.org/download/GENE-DB/IMGTGENEDB-ReferenceSequences.fasta-AA-WithoutGaps-F+ORF+inframeP"
        imgtTargetFileName = fU.getFileName(imgtTargetUrl)
        rawFastaPath = os.path.join(self.__dirPath, imgtTargetFileName)
        # --
        logger.debug("Fetching url %s path %s", imgtTargetUrl, rawFastaPath)
        ok = fU.get(imgtTargetUrl, rawFastaPath)
        logger.info("Fetch status (%r) url %s path %s", ok, imgtTargetUrl,
                    rawFastaPath)
        # --
        fastaPath = os.path.join(self.__dirPath, "imgt-reference.fa")
        taxonPath = os.path.join(self.__dirPath, "imgt-reference-taxon.tdd")
        tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=True)
        ok = tP.testCache()
        if not ok:
            tP = TaxonomyProvider(cachePath=self.__cachePath, useCache=False)

        rawQD = self.__mU.doImport(rawFastaPath,
                                   fmt="fasta",
                                   commentStyle="default")
        oD = {}
        taxonL = []
        for queryId, sD in rawQD.items():
            qL = queryId.split("|")
            tL = qL[2].split("_")
            taxName = tL[0]
            taxVar = tL[1].replace(" ", "_") if len(tL) > 1 else None
            taxId = tP.getTaxId(taxName)
            if taxId:
                tD = {
                    "seqId": qL[0],
                    "imgtGene": qL[1],
                    "functionality": qL[3],
                    "labels": qL[4],
                    "taxId": taxId
                }
                if taxVar:
                    tD["taxVar"] = taxVar
                sD.update(tD)
            else:
                logger.info("Unknown taxonomy %r (taxName=%r)", queryId,
                            taxName)
            sD["sequence"].replace(".", "-")
            seqId = ""
            cL = []
            for k, v in sD.items():
                if k in ["sequence"]:
                    continue
                cL.append(str(v))
                cL.append(str(k))
            seqId = "|".join(cL)
            oD[seqId] = sD
            taxonL.append("%s\t%s" % (seqId, taxId))
        #
        ok1 = self.__mU.doExport(taxonPath, taxonL, fmt="list")
        ok2 = self.__mU.doExport(fastaPath, oD, fmt="fasta", makeComment=True)
        return ok1 and ok2
Example #24
class DataTypeApiProvider(SingletonClass):
    """ Data type application and instance information provider.
    """
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Data type application and instance information provider.

        Args:
            cfgOb (object):  ConfigInfo() object instance
            cachePath (str): path to hold the cache directory
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__useCache = useCache
        self.__cachePath = cachePath
        # self.__contentInfoConfigName = "content_info_helper_configuration"
        self.__fileU = FileUtil()
        self.__contentDefHelper = self.__cfgOb.getHelper(
            "CONTENT_DEF_HELPER_MODULE",
            sectionName=self.__configName,
            cfgOb=self.__cfgOb)
        self.__dirPath = os.path.join(
            cachePath,
            self.__cfgOb.get("DATA_TYPE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__kwargs = kwargs
        #
        logger.debug("Leaving constructor")

    def getDataTypeInstanceApi(self, databaseName, **kwargs):
        """Return instance of DataTypeInstanceInfo().

        Args:
            databaseName (str): database name

        Returns:
            (object): Instance of DataTypeInstanceInfo()
        """
        _ = kwargs
        dataTypeInstanceLocatorPath = self.__cfgOb.getPath(
            "INSTANCE_DATA_TYPE_INFO_LOCATOR_PATH",
            sectionName=self.__configName)
        dataTypeInstanceFile = self.__contentDefHelper.getDataTypeInstanceFile(
            databaseName) if self.__contentDefHelper else None
        if dataTypeInstanceLocatorPath and dataTypeInstanceFile:
            loc = os.path.join(dataTypeInstanceLocatorPath,
                               dataTypeInstanceFile)
            filePath = self.__reload(loc,
                                     self.__dirPath,
                                     useCache=self.__useCache)
            dtApi = DataTypeInstanceInfo(filePath)
        else:
            # DataTypeInstanceInfo() provides an internal by-pass mode where no coverage data is available.
            dtApi = DataTypeInstanceInfo(None)
            logger.debug("No data coverage available for database %s",
                         databaseName)
        return dtApi

    def getDataTypeApplicationApi(self, appName, **kwargs):
        """Return instance of DataTypeApplicationInfo.

        Args:
            appName (str): application name (e.g., SQL, ANY)

        Returns:
            (object): Instance of DataTypeApplicationInfo()
        """
        _ = kwargs
        dataTypeApplicationLocator = self.__cfgOb.getPath(
            "APP_DATA_TYPE_INFO_LOCATOR", sectionName=self.__configName)
        filePath = self.__reload(dataTypeApplicationLocator,
                                 self.__dirPath,
                                 useCache=self.__useCache)
        dtApi = DataTypeApplicationInfo(
            filePath, dataTyping=appName,
            workPath=self.__dirPath) if filePath else None
        return dtApi

    def __reload(self, urlTarget, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(urlTarget)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", urlTarget)
            ok = self.__fileU.get(urlTarget, os.path.join(dirPath, fn))

        return filePath if ok else None
Example #25
class DictionaryApiProvider(SingletonClass):
    """ Resource provider for dictionary APIs.
    """
    def __init__(self, dirPath, useCache=True):
        """Resource provider for dictionary APIs.

        Args:
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        """
        self.__apiMap = {}
        self.__dirPath = dirPath
        self.__useCache = useCache
        #
        self.__fileU = FileUtil(workPath=self.__dirPath)
        logger.debug("Leaving constructor")

    def __reload(self, dictLocators, dirPath, useCache=True):
        """Reload local cache of dictionary resources and return a dictionary API instance.

        Args:
            dictLocators (list, str): list of locators for dictionary resource files
            dirPath (str): path to the directory containing cache files
            useCache (bool, optional): flag to use cached files. Defaults to True.

        Returns:
            (bool): True for success or False otherwise
        """
        #
        # verify the existence of the cache directory ...
        self.__fileU.mkdir(dirPath)
        if not useCache:
            for dictLocator in dictLocators:
                try:
                    fn = self.__fileU.getFileName(dictLocator)
                    os.remove(os.path.join(dirPath, fn))
                except Exception:
                    pass
        #
        ret = True
        for dictLocator in dictLocators:
            cacheFilePath = os.path.join(dirPath,
                                         self.__fileU.getFileName(dictLocator))
            if useCache and self.__fileU.exists(cacheFilePath):
                # nothing to do
                continue
            logger.debug("Fetching url %s caching in %s", dictLocator,
                         cacheFilePath)
            ok = self.__fileU.get(dictLocator, cacheFilePath)
            ret = ret and ok
        return ret

    def getApi(self, dictLocators, **kwargs):
        """Return a dictionary API object of the input dictioaries.

        Arguments:
            dictLocators {list str} -- list of dictionary locator paths

        Returns:
            [object] -- returns DictionaryApi() object for input dictionaries
        """
        dictFileNames = [
            self.__fileU.getFileName(dictLocator)
            for dictLocator in dictLocators
        ]
        dictTup = tuple(dictFileNames)
        dApi = self.__apiMap[
            dictTup] if dictTup in self.__apiMap else self.__getApi(
                dictLocators, **kwargs)
        self.__apiMap[dictTup] = dApi
        return dApi

    def __getApi(self, dictLocators, **kwargs):
        """ Return an instance of a dictionary API instance for the input dictionary locator list.
        """
        consolidate = kwargs.get("consolidate", True)
        replaceDefinition = kwargs.get("replaceDefinitions", True)
        verbose = kwargs.get("verbose", True)
        #
        ok = self.__reload(dictLocators,
                           self.__dirPath,
                           useCache=self.__useCache)
        #
        dApi = None
        if ok:
            mU = MarshalUtil()
            containerList = []
            for dictLocator in dictLocators:
                cacheFilePath = os.path.join(
                    self.__dirPath, self.__fileU.getFileName(dictLocator))
                containerList.extend(
                    mU.doImport(cacheFilePath, fmt="mmcif-dict"))
            #
            dApi = DictionaryApi(containerList=containerList,
                                 consolidate=consolidate,
                                 replaceDefinition=replaceDefinition,
                                 verbose=verbose)
        return dApi
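
A hedged usage sketch for DictionaryApiProvider (the dictionary URL is illustrative; DictionaryApi here is the mmcif.api class used by this module, and getDictionaryVersion() is assumed available on it):

import os

dP = DictionaryApiProvider(dirPath=os.path.join(".", "CACHE", "dicts"), useCache=True)
dApi = dP.getApi(["https://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v50.dic"])
if dApi:
    print(dApi.getDictionaryVersion())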
Example #26
    def fetchPartitionedBundle(self, localRestoreDirPath, gitRepositoryPath, gitRawHost="raw.githubusercontent.com", gitBranch="master", remoteStashPrefix="A"):
        """Fetch bundle from a remote stash public git repository via http.

        Args:
            localRestoreDirPath (str): local restore path
            gitRepositoryPath (str): git repository path (e.g., rcsb/py-rcsb_exdb_assets_stash)
            gitRawHost (str, optional): git raw content host name. Defaults to raw.githubusercontent.com.
            gitBranch (str, optional): git branch name. Defaults to master.
            remoteStashPrefix (str, optional): optional label prepended to the stashed dependency bundle artifact (default='A')

        Returns:
          bool:  True for success or False otherwise

            https://raw.githubusercontent.com/rcsb/py-rcsb_exdb_assets_stash/master/stash/<file_or_dir>
        """
        try:
            ok = False
            bundleFileName = None
            fileU = FileUtil()
            bundleFileName = self.__makeBundleFileName(self.__baseBundleFileName, remoteStashPrefix=remoteStashPrefix)
            urlBase = "https://" + gitRawHost
            rp = gitRepositoryPath[:-4] if gitRepositoryPath.endswith(".git") else gitRepositoryPath
            repoDirPath = os.path.join(urlBase, rp, gitBranch, "stash")

            # First fetch the manifest file
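            # bundleFileName[:-7] strips the ".tar.gz" suffix to name the remote part directory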
            remoteDirPath = os.path.join(repoDirPath, bundleFileName[:-7])
            remotePath = os.path.join(remoteDirPath, "MANIFEST")
            logger.debug("Manifest remote %r", remotePath)
            #
            localDirPath = os.path.join(self.__localBundlePath, bundleFileName[:-7])
            manifestPath = os.path.join(localDirPath, "MANIFEST")

            ok = fileU.get(remotePath, manifestPath)
            if not ok:
                logger.error("No manifest file at %r", remotePath)
                return ok
            # ---
            partFileName = "part_1"
            remotePartPath = os.path.join(repoDirPath, bundleFileName[:-7], partFileName)
            logger.debug("remotePartPath %r", remotePartPath)
            # ---
            partList = []
            with open(manifestPath, "r") as mfh:
                line = mfh.readline()
                tf, myHash = line[:-1].split("\t")
                logger.debug("Fetched manifest for %s hash %r", tf, myHash)
                for line in mfh:
                    partList.append(line[:-1])
            #
            logger.debug("Parts (%d) %r", len(partList), partList)
            for part in partList:
                localPath = os.path.join(localDirPath, part)
                remotePath = os.path.join(repoDirPath, bundleFileName[:-7], part)
                logger.debug("%r %r", remotePath, localPath)
                fileU.get(remotePath, localPath)
            #
            sj = SplitJoin()
            ok = sj.join(self.__localStashTarFilePath, localDirPath)
            if ok:
                ok = fileU.unbundleTarfile(self.__localStashTarFilePath, dirPath=localRestoreDirPath)
            return ok
        except Exception as e:
            logger.exception("Failing for %r with %s", bundleFileName, str(e))
            ok = False
        return ok
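
A short sketch of the part-join step used above, with SplitJoin as used by this module (file paths hypothetical):

from rcsb.utils.io.SplitJoin import SplitJoin

sj = SplitJoin()
# Reassemble part_1..part_n from the local part directory into the bundle tarfile.
ok = sj.join("stash-bundle.tar.gz", "local-bundle-parts")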
Example #27
class FileUtilTests(unittest.TestCase):
    def setUp(self):
        self.__verbose = True
        self.__pathPdbxDictionaryFile = os.path.join(TOPDIR, "rcsb",
                                                     "mock-data",
                                                     "dictionaries",
                                                     "mmcif_pdbx_v5_next.dic")

        self.__pathTaxonomyFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                               "NCBI", "names.dmp.gz")
        self.__zipFileUrl = "https://inventory.data.gov/dataset/794cd3d7-4d28-4408-8f7d-84b820dbf7f2/resource/6b78ec0c-4980-4ad8-9cbd-2d6eb9eda8e7/download/myfoodapediadata.zip"
        self.__xzFile = os.path.join(TOPDIR, "rcsb", "mock-data",
                                     "MOCK_MODBASE_MODELS",
                                     "NP_001030614.1_1.pdb.xz")
        #
        self.__ftpFileUrl = "ftp://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        self.__httpsFileUrl = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
        #
        self.__workPath = os.path.join(HERE, "test-output")
        self.__inpDirPath = os.path.join(HERE, "test-data")
        self.__fileU = FileUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        logger.debug("Starting %s at %s", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()))

    def tearDown(self):
        endTime = time.time()
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(),
                     time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                     endTime - self.__startTime)

    def testTarBundling(self):
        """Test case for tarfile bundling and unbundling"""
        try:
            tP = os.path.join(self.__workPath, "t0.tar.gz")
            dirPath = os.path.join(self.__inpDirPath, "topdir")

            ok = self.__fileU.bundleTarfile(tP, [dirPath],
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)

            numBytes = self.__fileU.size(tP)
            self.assertGreaterEqual(numBytes, 250)
            #
            md5 = self.__fileU.hash(tP, hashType="md5")
            self.assertTrue(md5 is not None)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)
            #
            tP = os.path.join(self.__workPath, "t1.tar.gz")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w:gz",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

            tP = os.path.join(self.__workPath, "t2.tar")
            dirPathList = [
                os.path.join(self.__inpDirPath, "topdir", "subdirA"),
                os.path.join(self.__inpDirPath, "topdir", "subdirB")
            ]

            ok = self.__fileU.bundleTarfile(tP,
                                            dirPathList,
                                            mode="w",
                                            recursive=True)
            self.assertTrue(ok)
            #
            ok = self.__fileU.unbundleTarfile(tP, dirPath=self.__workPath)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testGetFile(self):
        """Test case for a local files and directories"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            dPath = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(";lakdjf")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testMoveAndCopyFile(self):
        """Test case for copying ("put") and moving ("replace") local files"""
        try:
            remoteLocator = self.__pathPdbxDictionaryFile
            fn = self.__fileU.getFileName(remoteLocator)
            # _, fn = os.path.split(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            # Test copy file
            dPath2 = os.path.join(self.__workPath, "tdir")
            ok = self.__fileU.mkdir(dPath2)
            self.assertTrue(ok)
            lPath2 = os.path.join(dPath2, fn)
            ok = self.__fileU.put(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Remove copied file (to test moving file next)
            ok = self.__fileU.remove(lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertFalse(ok)
            # Test move file
            ok = self.__fileU.replace(lPath, lPath2)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertFalse(ok)
            ok = self.__fileU.exists(lPath2)
            self.assertTrue(ok)
            # Now clean up files and dirs
            ok = self.__fileU.remove(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.remove(dPath2)
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testZipUrl(self):
        """Test case for downloading remote zip file and extracting contents."""
        try:
            remoteLocator = self.__zipFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath,
                                 self.__fileU.getFileName(self.__zipFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith("Food_Display_Table.xlsx")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testFtpUrl(self):
        """Test case for downloading remote file ftp protocol and extracting contents."""
        try:
            remoteLocator = self.__ftpFileUrl
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            dirPath = os.path.join(self.__workPath, "chem_comp_models")
            lPath = os.path.join(dirPath,
                                 self.__fileU.getFileName(self.__ftpFileUrl))
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=dirPath)
            ok = fp.endswith("chem_comp_model.cif")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testRemote(self):
        """Test case remote status"""
        try:
            remoteLocator = self.__httpsFileUrl
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            ok = self.__fileU.exists(remoteLocator)
            self.assertTrue(ok)
            size = self.__fileU.size(remoteLocator)
            self.assertGreaterEqual(size, 1000)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    @unittest.skip("DrugBank example -- skipping")
    def testGetDrugBankUrl(self):
        """Test case for downloading drugbank master xml file"""
        try:
            remoteLocator = "https://www.drugbank.ca/releases/latest/downloads/all-full-database"
            un = "username"
            pw = "password"
            # fn = self.__fileU.getFileName(remoteLocator)
            ok = self.__fileU.isLocal(remoteLocator)
            self.assertFalse(ok)
            #
            lPath = os.path.join(self.__workPath, "db-download.zip")
            ok = self.__fileU.get(remoteLocator,
                                  lPath,
                                  username=un,
                                  password=pw)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            self.__fileU.uncompress(lPath, outputDir=self.__workPath)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()

    def testXzFile(self):
        """Test case for extracting contents from xz file"""
        try:
            remoteLocator = self.__xzFile
            fn = self.__fileU.getFileName(remoteLocator)
            lPath = os.path.join(self.__workPath, fn)
            ok = self.__fileU.get(remoteLocator, lPath)
            self.assertTrue(ok)
            ok = self.__fileU.exists(lPath)
            self.assertTrue(ok)
            ok = self.__fileU.isLocal(lPath)
            self.assertTrue(ok)
            tPath = self.__fileU.getFilePath(lPath)
            self.assertEqual(lPath, tPath)
            fp = self.__fileU.uncompress(lPath, outputDir=self.__workPath)
            ok = fp.endswith(".pdb")
            self.assertTrue(ok)

        except Exception as e:
            logger.exception("Failing with %s", str(e))
            self.fail()
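
# --- Editorial sketch (hedged): a standalone round trip composed only of the
# FileUtil calls exercised in the tests above. The import path follows the
# rcsb.utils.io package convention and is an assumption; the URL and work path
# are illustrative. Relies on this module's os import.
def demoFileUtilRoundTrip(workPath="/tmp/fileutil-demo"):
    from rcsb.utils.io.FileUtil import FileUtil  # assumed import path

    fU = FileUtil()
    fU.mkdir(workPath)
    url = "https://ftp.wwpdb.org/pub/pdb/data/component-models/complete/chem_comp_model.cif.gz"
    lPath = os.path.join(workPath, fU.getFileName(url))
    if fU.get(url, lPath):
        # Expand the downloaded archive alongside it and report basic facts
        cifPath = fU.uncompress(lPath, outputDir=workPath)
        logger.info("%s %d %s", cifPath, fU.size(lPath), fU.hash(lPath, hashType="md5"))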
Example #28
0
class CODModelSearch(object):
    def __init__(self, cachePath, **kwargs):
        self.__cachePath = cachePath
        #
        self.__useCache = kwargs.get("useCache", True)
        self.__ccUrlTarget = kwargs.get("ccUrlTarget", None)
        self.__birdUrlTarget = kwargs.get("birdUrlTarget", None)
        self.__descriptorUrlTarget = kwargs.get(
            "descriptorUrlTarget",
            "http://www.crystallography.net/cod/smi/allcod.smi")
        self.__prefix = kwargs.get("prefix", None)
        self.__numProc = kwargs.get("numProc", 4)
        self.__chunkSize = kwargs.get("chunkSize", 50)
        self.__ccFileNamePrefix = "cc-%s" % self.__prefix if self.__prefix else "cc-full"
        self.__fU = FileUtil()
        # self.__ccmG = ChemCompModelGen(self.__cachePath, self.__prefix)

    def getResultIndex(self):
        mU = MarshalUtil(workPath=self.__cachePath)
        cD = mU.doImport(self.getResultFilePath(), fmt="json")
        return cD

    def getResultDetails(self, codId):
        mU = MarshalUtil(workPath=self.__cachePath)
        dD = mU.doImport(self.__getCodDetailsFilePath(codId), fmt="json")
        return dD

    def storeResultIndex(self, cD):
        mU = MarshalUtil(workPath=self.__cachePath)
        ok = mU.doExport(self.getResultFilePath(), cD, fmt="json", indent=3)
        return ok

    def getResultDirFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-result-files"
        return os.path.join(self.__cachePath, dN)

    def getRawResultFilePath(self):
        # Use the same fallback directory name as getResultDirFilePath()
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-result-files"
        return os.path.join(self.__cachePath, dN,
                            "cod-raw-result-file-index.json")

    def getResultFilePath(self):
        dN = "cod-%s-result-files" % self.__prefix if self.__prefix else "cod-result-files"
        return os.path.join(self.__cachePath, dN, "cod-result-file-index.json")

    def getDescriptorPath(self):
        fn = self.__fU.getFileName(self.__descriptorUrlTarget)
        dirPath = self.getResultDirFilePath()
        filePath = os.path.join(dirPath, fn)
        return filePath

    def updateDescriptors(self):
        self.__fetchUrl(self.__descriptorUrlTarget,
                        filePath=self.getDescriptorPath(),
                        useCache=False)

    def __fetchUrl(self, urlTarget, filePath, useCache=False, noRetry=False):
        ok = False
        try:
            if not (useCache and self.__fU.exists(filePath)):
                startTime = time.time()
                ok = self.__fU.get(urlTarget, filePath, noRetry=noRetry)
                endTime = time.time()
                if ok:
                    logger.debug(
                        "Fetched %s for resource file %s (status = %r) (%.4f seconds)",
                        urlTarget, filePath, ok, endTime - startTime)
                else:
                    logger.error(
                        "Failing fetch for %s for resource file %s (status = %r) (%.4f seconds)",
                        urlTarget, filePath, ok, endTime - startTime)
            else:
                ok = True
                logger.debug("Using cached data for %s", urlTarget)
            #
        except Exception as e:
            logger.exception("Failing for %r with %s", urlTarget, str(e))
        return ok
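    # Usage note (editorial): with useCache=True and an existing filePath this
    # helper returns True without any network I/O; with useCache=False it always
    # refetches. The noRetry flag is passed through to FileUtil.get(), which by
    # its name is taken to suppress retries on a failed fetch (an assumption).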

    def search(self, molLimit=None):
        try:
            bsw = BatchChemSearch(
                useCache=self.__useCache,
                ccUrlTarget=self.__ccUrlTarget,
                birdUrlTarget=self.__birdUrlTarget,
                ccFileNamePrefix=self.__ccFileNamePrefix,
                cachePath=self.__cachePath,
                numProc=self.__numProc,
                chunkSize=self.__chunkSize,
            )
            smiPath = self.getDescriptorPath()
            smiL = bsw.fetchDescriptorList(smiPath, swap=True)
            logger.info("Query length (%d)", len(smiL))
            #
            smiL = bsw.splitSmiles(smiL)
            retL = bsw.doQuery(smiL[:molLimit],
                               "SMILES",
                               matchOpts="graph-exact")
            logger.info("Result length (%d)", len(retL))
            #
            for ii, ret in enumerate(retL, 1):
                logger.debug("%5d %8s %4s (%.3f) %s: %s", ii, ret.queryId,
                             ret.ccId, ret.fpScore, ret.queryType, ret.query)
            #
            fp = self.getRawResultFilePath()
            ok = bsw.storeMatchList(fp, retL)
            return len(retL) if ok else 0
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        return 0

    def __getSearchResults(self):
        """Read search results and convert to a chemical component dictionary."""
        fp = self.getRawResultFilePath()
        mU = MarshalUtil(workPath=self.__cachePath)
        rawL = mU.doImport(fp, fmt="json")
        rD = {}
        for cD in rawL:
            rD.setdefault(cD["ccId"], []).append(cD)
        return rD

    def __getCodEntryUrl(self, codId):
        # Template Examples:
        # https://molecules.crystallography.net/cod/sdf/1/00/00/1000098.sdf
        # https://molecules.crystallography.net/cod/sdf/6/00/05/6000557.sdf
        #
        baseUrl = "https://molecules.crystallography.net/cod/sdf"
        url = os.path.join(baseUrl, codId[0:1], codId[1:3], codId[3:5],
                           codId + ".sdf")
        return url

    def __getCodDetailsUrl(self, codId):
        baseUrl = "http://www.crystallography.net/cod/optimade/structures"
        url = os.path.join(baseUrl, codId)
        return url

    def __getCodDetailsFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3],
                          codId[3:5], codId + ".json")
        return fp

    def __getCodEntryFilePath(self, codId):
        dirPath = self.getResultDirFilePath()
        fp = os.path.join(dirPath, "cod-data", codId[0:1], codId[1:3],
                          codId[3:5], codId + ".sdf")
        return fp
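    # Worked example (editorial): the local layout mirrors the COD URL sharding
    # above, e.g. codId "1000098" -> <resultDir>/cod-data/1/00/00/1000098.sdf for
    # the entry and .../1000098.json for the details record.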

    def fetchMatchedData(self, useCache=True):
        """Fetch COD matched entries and metadata and update the raw search index with essential COD data attrbutes.

        Args:
            useCache (bool, optional): use any cached COD data. Defaults to True.

        Returns:
            int: search result count

        """
        eCount = 0
        eSkip = 0
        rcD = {}
        cD = self.__getSearchResults()
        #
        for ccId, qDL in cD.items():
            # cifPath = self.__ccmG.getChemCompPath(ccId)
            # if not cifPath:
            #    logger.info("No CIF for %s skipping", ccId)
            #    continue
            parentId = ccId.split("|")[0]
            rqDL = []
            for qD in qDL:
                codId = qD["queryId"]
                codEntryFilePath = self.__getCodEntryFilePath(codId)
                codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId),
                                      self.__getCodEntryFilePath(codId),
                                      useCache=useCache,
                                      noRetry=True)
                ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId),
                                      self.__getCodDetailsFilePath(codId),
                                      useCache=useCache,
                                      noRetry=True)
                tD = self.getResultDetails(codId)
                dD = tD["data"][
                    "attributes"] if "data" in tD and "attributes" in tD[
                        "data"] else {}
                mD = tD["meta"][
                    "implementation"] if "meta" in tD and "implementation" in tD[
                        "meta"] else {}
                if ok1 and ok2:
                    logger.info("Fetched COD entry and details for %s (%r)",
                                codId, ok1 and ok2)
                    eCount += 1
                    qD["codEntryFilePath"] = codEntryFilePath
                    qD["codDetailsFilePath"] = codDetailsFilePath
                    # qD["cifPath"] = cifPath
                    qD["parentId"] = parentId
                    qD["chemicalName"] = dD[
                        "_cod_commonname"] if "_cod_commonname" in dD else None
                    qD["chemicalName"] = dD[
                        "_cod_chemname"] if "_cod_chemname" in dD else qD[
                            "chemicalName"]
                    qD["rValue"] = dD[
                        "_cod_Robs"] if "_cod_Robs" in dD else None
                    qD["diffrnTemp"] = dD[
                        "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                    qD["radiationSource"] = dD[
                        "_cod_radType"] if "_cod_radType" in dD else None
                    qD["publicationDOI"] = dD[
                        "_cod_doi"] if "_cod_doi" in dD else None
                    qD["version"] = mD["version"] if "version" in mD else None
                    qD["hasDisorder"] = "N"
                    rqDL.append(qD)
                else:
                    logger.info("Skipping entry missing data for %r at %r",
                                codId, self.__getCodEntryUrl(codId))
                    eSkip += 1
            if rqDL:
                rcD[ccId] = rqDL
        #
        ok = self.storeResultIndex(rcD)
        logger.info(
            "Final match result (w/sdf and metadata) (%d/%d) cod hits (%d) skipped (%d)",
            len(rcD), len(cD), eCount, eSkip)
        return eCount if ok else 0

    def fetchMatchedDataMp(self, numProc=6, chunkSize=5, useCache=True):
        rcD = {}
        cD = self.__getSearchResults()
        idList = list(cD.keys())
        # ---
        mpu = MultiProcUtil(verbose=True)
        mpu.setWorkingDir(self.__cachePath)
        mpu.setOptions(optionsD={
            "resultPath": self.__cachePath,
            "cD": cD,
            "useCache": useCache
        })
        mpu.set(workerObj=self, workerMethod="fetchDataWorker")

        ok, failList, resultList, _ = mpu.runMulti(dataList=idList,
                                                   numProc=numProc,
                                                   numResults=1,
                                                   chunkSize=chunkSize)
        logger.info("Run ended with status %r success count %d failures %r",
                    ok, len(resultList[0]), len(failList))
        for rTup in resultList[0]:
            rcD[rTup[0]] = rTup[1]
        # ---
        ok = self.storeResultIndex(rcD)
        logger.info("Final match result (w/sdf and metadata) (%d/%d)",
                    len(rcD), len(cD))
        return ok

    def fetchDataWorker(self, dataList, procName, optionsD, workingDir):
        """Worker method to fetch COD data for matched entries

        Args:
            dataList (list): list of chemical component identifiers to process
            procName (str): process name
            optionsD (dict): dictionary of options
            workingDir (str): path to working directory (not used)

        Returns:
            (successList, resultList, []): success and result lists of component identifiers with COD matches
        """
        resultPath = optionsD["resultPath"]
        cD = optionsD["cD"]
        useCache = optionsD["useCache"]
        _ = workingDir
        resultList = []
        successList = []
        startTime = time.time()
        logger.info("starting %s at %s", procName,
                    time.strftime("%Y %m %d %H:%M:%S", time.localtime()))
        #
        eCount = 0
        eSkip = 0
        try:
            stopPath = os.path.join(resultPath, "STOP")
            logger.info("%s starting search data length %d", procName,
                        len(dataList))
            if self.__checkStop(stopPath):
                logger.info("%s stopping", procName)
                return successList, resultList, []
            #
            for ccId in dataList:
                qDL = cD.get(ccId, [])
                #
                parentId = ccId.split("|")[0]
                rqDL = []
                for qD in qDL:
                    codId = qD["queryId"]
                    codEntryFilePath = self.__getCodEntryFilePath(codId)
                    codDetailsFilePath = self.__getCodDetailsFilePath(codId)
                    ok1 = self.__fetchUrl(self.__getCodEntryUrl(codId),
                                          self.__getCodEntryFilePath(codId),
                                          useCache=useCache,
                                          noRetry=True)
                    ok2 = self.__fetchUrl(self.__getCodDetailsUrl(codId),
                                          self.__getCodDetailsFilePath(codId),
                                          useCache=useCache,
                                          noRetry=True)
                    tD = self.getResultDetails(codId)
                    dD = tD["data"][
                        "attributes"] if "data" in tD and "attributes" in tD[
                            "data"] else {}
                    mD = tD["meta"][
                        "implementation"] if "meta" in tD and "implementation" in tD[
                            "meta"] else {}
                    if ok1 and ok2:
                        logger.info(
                            "Fetched COD entry and details for %s (%r)", codId,
                            ok1 and ok2)
                        eCount += 1
                        qD["codEntryFilePath"] = codEntryFilePath
                        qD["codDetailsFilePath"] = codDetailsFilePath
                        # qD["cifPath"] = cifPath
                        qD["parentId"] = parentId
                        qD["chemicalName"] = dD[
                            "_cod_commonname"] if "_cod_commonname" in dD else None
                        qD["chemicalName"] = dD[
                            "_cod_chemname"] if "_cod_chemname" in dD else qD[
                                "chemicalName"]
                        qD["rValue"] = dD[
                            "_cod_Robs"] if "_cod_Robs" in dD else None
                        qD["diffrnTemp"] = dD[
                            "_cod_diffrtemp"] if "_cod_diffrtemp" in dD else None
                        qD["radiationSource"] = dD[
                            "_cod_radType"] if "_cod_radType" in dD else None
                        qD["publicationDOI"] = dD[
                            "_cod_doi"] if "_cod_doi" in dD else None
                        qD["version"] = mD[
                            "version"] if "version" in mD else None
                        qD["hasDisorder"] = "N"
                        rqDL.append(qD)
                    else:
                        logger.info("Skipping entry missing data for %r at %r",
                                    codId, self.__getCodEntryUrl(codId))
                        eSkip += 1
                if rqDL:
                    resultList.append((ccId, rqDL))
                    successList.append(ccId)
        except Exception as e:
            logger.exception("Failing with %s", str(e))

        endTime = time.time()
        logger.info(
            "%s (entries %d skipped %d) (ccId result length %d) completed at %s (%.2f seconds)",
            procName,
            eCount,
            eSkip,
            len(successList),
            time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
            endTime - startTime,
        )
        return successList, resultList, []

    def __checkStop(self, path):
        try:
            if os.access(path, os.F_OK):
                return True
        except Exception:
            pass
        return False
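
# --- Editorial sketch (hedged): one plausible end-to-end driver for the class
# above, composed only of its public methods; the cache path and prefix are
# illustrative, and the BatchChemSearch dependencies must be available.
def demoCodSearch(cachePath="/tmp/cod-cache"):
    cms = CODModelSearch(cachePath, prefix="demo", numProc=2)
    cms.updateDescriptors()              # refresh the COD SMILES descriptor file
    numHits = cms.search(molLimit=100)   # capped batch chemical search
    if numHits:
        eCount = cms.fetchMatchedData(useCache=True)  # SDF + metadata for matches
        logger.info("%d entries fetched, %d components indexed", eCount, len(cms.getResultIndex()))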
Example #29
0
class SchemaProvider(SingletonClass):
    """ A collection of schema build and caching methods.

        Static cache workflow:

            <authoritative source>  <--  <cache dir>  <--  client API

        Compute workflow:

        <dependent resource files, config file, dictionaries> -> [schema builder] --> <schema def> --> <JSON schema>

    """

    def __init__(self, cfgOb, cachePath, useCache=True, rebuildFlag=False, **kwargs):
        """A collection of schema build and caching methods.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to directory containing schema
            useCache (bool, optional): use cached schema. Defaults to True.
            rebuildFlag (bool, optional): rebuild the schema on the fly and cache it. Defaults to False.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = os.path.abspath(cachePath)
        self.__rebuildFlag = rebuildFlag
        self.__useCache = rebuildFlag if rebuildFlag else useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")

        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__schemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__jsonSchemaCachePath = os.path.join(self.__cachePath, self.__cfgOb.get("JSON_SCHEMA_DEFINITION_CACHE_DIR", sectionName=self.__configName))
        self.__fileU.mkdir(self.__schemaCachePath)
        self.__fileU.mkdir(self.__jsonSchemaCachePath)
        self.__kwargs = kwargs

    def getSchemaOptions(self, schemaLevel, extraOpts=None):
        opts = extraOpts + "|" if extraOpts else ""
        if schemaLevel == "full":
            return opts + "mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb"
        elif schemaLevel in ["min", "minimum"]:
            return opts + "mandatoryKeys|enums|rcsb"
        else:
            return opts
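    # Worked example (editorial): getSchemaOptions("min") returns
    # "mandatoryKeys|enums|rcsb", and getSchemaOptions("full", extraOpts="addParentRefs")
    # returns "addParentRefs|mandatoryKeys|mandatoryAttributes|bounds|enums|rcsb"
    # ("addParentRefs" is an illustrative option name).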

    def getSchemaInfo(self, databaseName, dataTyping="ANY"):
        """Convenience method to return essential schema details for the input repository content type.

        Args:
            databaseName (str): schema name  (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

        Returns:
            tuple: SchemaDefAccess(object), target database name, target collection name list, primary index attribute list
        """
        sd = None
        dbName = None
        collectionNameList = []
        docIndexD = {}
        try:
            mU = MarshalUtil(workPath=self.__workPath)
            schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
            if self.__rebuildFlag:
                filePath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                self.makeSchemaDef(databaseName, dataTyping=dataTyping, saveSchema=True)
            else:
                filePath = self.__reload(schemaLocator, self.__schemaCachePath, useCache=self.__useCache)

            if not filePath:
                logger.error("Unable to recover schema %s (%s)", databaseName, dataTyping)
            logger.debug("ContentType %r dataTyping %r schemaLocator %r", databaseName, dataTyping, schemaLocator)
            schemaDef = mU.doImport(filePath, fmt="json")
            if schemaDef:
                logger.debug("Using cached schema definition for %s application %s", databaseName, dataTyping)
                sd = SchemaDefAccess(schemaDef)
                if sd:
                    dbName = sd.getDatabaseName()
                    collectionInfoList = sd.getCollectionInfo()
                    logger.debug("Schema %s database name %s collections %r", databaseName, dbName, collectionInfoList)
                    for cd in collectionInfoList:
                        collectionName = cd["NAME"]
                        collectionNameList.append(collectionName)
                        docIndexD[collectionName] = sd.getDocumentIndices(collectionName)

        except Exception as e:
            logger.exception("Retreiving schema %s for %s failing with %s", databaseName, dataTyping, str(e))

        return sd, dbName, collectionNameList, docIndexD

    def schemaDefCompare(self, databaseName, dataTyping="ANY"):
        """Compare computed schema defintion with current source/cached version.

        Args:
            databaseName (str): schema definition name for comparison
            dataTyping (str, optional): data type conventions for the schema comparison. Defaults to "ANY".

        Returns:
            (str): file path for schema difference or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaPath = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
        fn = self.__fileU.getFileName(schemaPath)
        sD = self.makeSchemaDef(databaseName, dataTyping=dataTyping)
        v2 = sD["DATABASE_VERSION"]
        # ----
        # tPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting schema def to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # sD = mU.doImport(tPath, fmt="json")
        # ----
        cPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaPath))
        sDCache = mU.doImport(cPath, fmt="json")
        v1 = sDCache["DATABASE_VERSION"]
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        #
        # jD = diff(sDCache, sD, syntax="explicit", marshal=True)
        diffPath = None
        if numDiff:
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            # logger.info("diff for %s %s = \n%s", databaseName, dataTyping, pprint.pformat(difD, indent=3, width=100))
            mU.doExport(diffPath, difD, fmt="json", indent=3)
        #
        return diffPath

    def jsonSchemaCompare(self, databaseName, collectionName, encodingType, level, extraOpts=None):
        """Compare computed JSON schema defintion with current source/cached version.

        Args:
            databaseName (str): schema name
            collectionName (str): collection name
            encodingType (str): schema data type conventions (JSON|BSON)
            level (str): metadata level (min|full)
            extraOpts (str): extra schema construction options

        Returns:
            (str): path to the difference file or None
        """
        mU = MarshalUtil(workPath=self.__workPath)
        schemaDiffPath = os.path.join(self.__cachePath, "schema_diff")
        mU.mkdir(schemaDiffPath)
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType, level)
        fn = self.__fileU.getFileName(schemaLocator)
        schemaPath = os.path.join(self.__jsonSchemaCachePath, fn)
        #
        sD = self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=False, extraOpts=extraOpts)
        v2 = self.__getSchemaVersion(sD)
        # ----
        # tPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaPath) + "-test")
        # logger.info("Exporting json schema to %s", tPath)
        # mU.doExport(tPath, sD, fmt="json", indent=3)
        # ----
        #
        sDCache = mU.doImport(schemaPath, fmt="json")
        v1 = self.__getSchemaVersion(sDCache)
        if not v1:
            logger.error("no version for %s - %s %s", schemaLocator, databaseName, collectionName)
        #
        numDiff, difD = self.schemaCompare(sDCache, sD)
        # jD = diff(sDCache, sD, marshal=True, syntax="explicit")
        diffPath = None
        if numDiff:
            logger.debug("diff for %s %s %s %s = \n%s", databaseName, collectionName, encodingType, level, pprint.pformat(difD, indent=3, width=100))
            bn, _ = os.path.splitext(fn)
            diffPath = os.path.join(schemaDiffPath, bn + "-" + v1 + "-" + v2 + "-diff.json")
            mU.doExport(diffPath, difD, fmt="json", indent=3)

        return diffPath

    def __getSchemaVersion(self, jsonSchema):
        try:
            comment = jsonSchema["$comment"] if "$comment" in jsonSchema else ""
            ff = comment.split(":")
            version = ff[1].strip()
            return version
        except Exception as e:
            logger.exception("Failing for with %s", str(e))
        return ""

    def __getSchemaDefLocator(self, databaseName, dataTyping="ANY"):
        """Internal method returning schema definition path for the input content type and application.
           Defines schema definition naming convention -

           Args:
            databaseName (str): schema name (e.g. pdbx, bird, chem_comp, ...)
            dataTyping (str, optional): Application name for the target schema (e.g. ANY, SQL, ...)

            Returns:

             str: schema definition file locator

        """
        schemaLocator = None
        try:
            locPath = self.__cfgOb.get("SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
            fn = "schema_def-%s-%s.json" % (databaseName, dataTyping.upper())
            schemaLocator = os.path.join(locPath, fn)
        except Exception as e:
            logger.exception("Retreiving schema definition path %s for %s failing with %s", databaseName, dataTyping, str(e))
        return schemaLocator

    def __getJsonSchemaLocator(self, databaseName, collectionName, encodingType="BSON", level="full"):
        """Internal method returning JSON schema path for the input collection data type convention and level.
           Defines the JSON/BSON schema naming convention -

           Args:
            databaseName (str): database name in the document store
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

            Returns:

            str: schema file locator

        """
        schemaLocator = None
        try:
            sdType = None
            sLevel = None
            schemaLocator = None
            if encodingType.upper() in ["JSON", "BSON"]:
                sdType = encodingType.lower()
            if level.lower() in ["min", "minimun"]:
                sLevel = "min"
            elif level.lower() in ["full"]:
                sLevel = level.lower()
            #
            if sdType and sLevel:
                locPath = self.__cfgOb.get("JSON_SCHEMA_DEFINITION_LOCATOR_PATH", sectionName=self.__configName)
                fn = "%s-%s-db-%s-col-%s.json" % (sdType, sLevel, databaseName, collectionName)
                schemaLocator = os.path.join(locPath, fn)
            else:
                logger.error("Unsupported schema options:  %s level %r type %r", collectionName, level, encodingType)
                schemaLocator = None
        except Exception as e:
            logger.debug("Retreiving JSON schema definition for %s type %s failing with %s", collectionName, encodingType, str(e))
        #
        return schemaLocator
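    # Worked example (editorial): ("pdbx_core", "pdbx_core_entry", "JSON", "full")
    # resolves to <locPath>/json-full-db-pdbx_core-col-pdbx_core_entry.json
    # (database and collection names are illustrative).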

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Target cache filePath %s", filePath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.info("Fetch data from source %s to %s", locator, filePath)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def getJsonSchema(self, databaseName, collectionName, encodingType="BSON", level="full", extraOpts=None):
        """Return JSON schema (w/ BSON types) object for the input collection and level.and

        Args:
            databaseName (str): database name
            collectionName (str): collection name in document store
            encodingType (str, optional): data type convention (BSON|JSON)
            level (str, optional): Completeness of the schema (e.g. min or full)

        Returns:
            dict: Schema object

        """
        sObj = None
        schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
        #
        if self.__rebuildFlag:
            filePath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
            self.makeSchema(databaseName, collectionName, encodingType=encodingType, level=level, saveSchema=True, extraOpts=extraOpts)
        else:
            filePath = self.__reload(schemaLocator, self.__jsonSchemaCachePath, useCache=self.__useCache)
        mU = MarshalUtil(workPath=self.__workPath)
        if filePath and mU.exists(filePath):
            sObj = mU.doImport(filePath, fmt="json")
        else:
            logger.debug("Failed to read schema for %s %r", collectionName, level)
        return sObj

    def makeSchema(self, databaseName, collectionName, encodingType="BSON", level="full", saveSchema=False, extraOpts=None):
        cD = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            #
            stU = encodingType.upper()
            cD = smb.build(collectionName, dataTyping=stU, encodingType=stU, enforceOpts=self.getSchemaOptions(level, extraOpts=extraOpts))
            if cD and saveSchema:
                schemaLocator = self.__getJsonSchemaLocator(databaseName, collectionName, encodingType=encodingType, level=level)
                localPath = os.path.join(self.__jsonSchemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, cD, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s collection %s failing with %s", databaseName, collectionName, str(e))
        return cD

    def makeSchemaDef(self, databaseName, dataTyping="ANY", saveSchema=False):
        schemaDef = None
        try:
            smb = SchemaDefBuild(databaseName, self.__cfgOb, cachePath=self.__cachePath)
            schemaDef = smb.build(dataTyping=dataTyping, encodingType="rcsb")
            if schemaDef and saveSchema:
                schemaLocator = self.__getSchemaDefLocator(databaseName, dataTyping=dataTyping)
                localPath = os.path.join(self.__schemaCachePath, self.__fileU.getFileName(schemaLocator))
                mU = MarshalUtil(workPath=self.__workPath)
                mU.doExport(localPath, schemaDef, fmt="json", indent=3, enforceAscii=False)
        except Exception as e:
            logger.exception("Building schema %s failing with %s", databaseName, str(e))
        return schemaDef

    def schemaCompare(self, orgD, newD):
        """ Compute the difference of nested dictionaries.

        """
        fOrgD = self.__flatten(orgD)
        fNewD = self.__flatten(newD)
        if len(fOrgD) != len(fNewD):
            logger.debug("Schema lengths differ: org %d new %d", len(fOrgD), len(fNewD))
        #
        addedD = {k: fNewD[k] for k in set(fNewD) - set(fOrgD)}
        removedD = {k: fOrgD[k] for k in set(fOrgD) - set(fNewD)}
        changedOrgD = {k: fOrgD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        changedNewD = {k: fNewD[k] for k in set(fOrgD) & set(fNewD) if fOrgD[k] != fNewD[k]}
        chD = {}
        for ky in changedOrgD:
            kyS = ".".join(ky)
            vOrg = changedOrgD[ky]
            vNew = changedNewD[ky]
            if isinstance(vOrg, (list, tuple)) and isinstance(vNew, (list, tuple)):
                # logger.info(" >> %r vOrg %r vNew %r", ky, vOrg, vNew)
                dV = list(set(vNew) - set(vOrg))
                if dV:
                    chD[kyS] = {"diff": dV}
            else:
                chD[kyS] = {"from": vOrg, "to": vNew}
        #
        nT = len(addedD) + len(removedD) + len(chD)
        diffD = {"added": [".".join(kk) for kk in addedD.keys()], "removed": [".".join(kk) for kk in removedD.keys()], "changed": chD}
        return nT, diffD
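    # Worked example (editorial): schemaCompare({"a": 1}, {"a": 2, "b": 3})
    # returns (2, {"added": ["b"], "removed": [], "changed": {"a": {"from": 1, "to": 2}}}).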

    def __flatten(self, inpDict, prefix=None):
        prefix = prefix[:] if prefix else []
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flatten(value, prefix + [key])
                outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
            elif isinstance(value, (list, tuple)) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flatten(sublist, prefix + [key] + [str(index)])
                        outDict.update({tuple(key2): val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[tuple(prefix + [key] + [str(index)])] = value
            else:
                outDict[tuple(prefix + [key])] = value
        return outDict
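    # Worked example (editorial): __flatten({"a": {"b": 1}, "c": [5, 6]}) yields
    # {("a", "b"): 1, ("c", "1"): [5, 6], ("c", "2"): [5, 6]} -- note that, as
    # written, each indexed list key maps to the whole list value.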

    def __flattenOrg(self, inpDict, separator=".", prefix=""):
        outDict = {}
        for key, value in inpDict.items():
            if isinstance(value, dict) and value:
                deeper = self.__flattenOrg(value, separator, prefix + key + separator)
                outDict.update({key2: val2 for key2, val2 in deeper.items()})
            elif isinstance(value, list) and value:
                for index, sublist in enumerate(value, start=1):
                    if isinstance(sublist, dict) and sublist:
                        deeper = self.__flattenOrg(sublist, separator, prefix + key + separator + str(index) + separator)
                        outDict.update({key2: val2 for key2, val2 in deeper.items()})
                    else:
                        outDict[prefix + key + separator + str(index)] = value
            else:
                outDict[prefix + key] = value
        return outDict

    def __dictGen(self, indict, pre=None):
        pre = pre[:] if pre else []
        if isinstance(indict, dict):
            for key, value in indict.items():
                if isinstance(value, dict):
                    for dD in self.__dictGen(value, pre + [key]):
                        yield dD
                elif isinstance(value, list) or isinstance(value, tuple):
                    for v in value:
                        for dD in self.__dictGen(v, pre + [key]):
                            yield dD
                else:
                    yield pre + [key, value]
        else:
            yield indict
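
# --- Editorial sketch (hedged): minimal use of SchemaProvider, assuming a
# ConfigUtil configuration object (rcsb.utils.config, an assumed import path)
# that defines the cache-directory options read in __init__; the config path
# and database name are illustrative.
def demoSchemaProvider(configPath="/tmp/exdb-config.yml", cachePath="/tmp/schema-cache"):
    from rcsb.utils.config.ConfigUtil import ConfigUtil  # assumed import path

    cfgOb = ConfigUtil(configPath=configPath)
    sP = SchemaProvider(cfgOb, cachePath, useCache=True)
    sd, dbName, collectionNameList, docIndexD = sP.getSchemaInfo("pdbx_core")
    logger.info("database %r collections %r", dbName, collectionNameList)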
Example #30
0
class ProvenanceProvider(SingletonClass):
    """Utilities to access and update provenance details."""
    def __init__(self, cfgOb, cachePath, useCache=True, **kwargs):
        """Utilities to access and update provenance details.

        Args:
            cfgOb (object): ConfigInfo() instance
            cachePath (str): path to the directory containing cached provenance data
            useCache (bool, optional): use cached provenance data. Defaults to True.
        """

        self.__cfgOb = cfgOb
        self.__configName = self.__cfgOb.getDefaultSectionName()
        self.__cachePath = cachePath
        self.__useCache = useCache
        #
        self.__workPath = os.path.join(self.__cachePath, "work")
        self.__provenanceCachePath = os.path.join(
            self.__cachePath,
            self.__cfgOb.get("PROVENANCE_INFO_CACHE_DIR",
                             sectionName=self.__configName))
        self.__provenanceLocator = self.__cfgOb.getPath(
            "PROVENANCE_INFO_LOCATOR", sectionName=self.__configName)
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__fileU.mkdir(self.__provenanceCachePath)
        self.__kwargs = kwargs
        #

    def __reload(self, locator, dirPath, useCache=True):
        #
        fn = self.__fileU.getFileName(locator)
        filePath = os.path.join(dirPath, fn)
        logger.debug("Using cache path %s", dirPath)
        self.__fileU.mkdir(dirPath)
        if not useCache:
            try:
                os.remove(filePath)
            except Exception:
                pass
        #
        if useCache and self.__fileU.exists(filePath):
            ok = True
        else:
            logger.debug("Fetch data from source %s", locator)
            ok = self.__fileU.get(locator, filePath)

        return filePath if ok else None

    def fetch(self):
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            return mU.doImport(provenanceFileCachePath, fmt="json")
        except Exception as e:
            logger.exception("Failed retreiving provenance with %s", str(e))
        return {}

    def update(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            tD = mU.doImport(provenanceFileCachePath, fmt="json")
            tD.update(provD)
            ok = mU.doExport(provenanceFileCachePath, tD, fmt="json")
        except Exception as e:
            logger.exception("Failed updating provenance with %s", str(e))
        return ok

    def store(self, provD):
        ok = False
        try:
            provenanceFileCachePath = self.__reload(self.__provenanceLocator,
                                                    self.__provenanceCachePath,
                                                    useCache=self.__useCache)
            mU = MarshalUtil(workPath=self.__workPath)
            ok = mU.doExport(provenanceFileCachePath, provD, fmt="json")
        except Exception as e:
            logger.exception("Failed storing provenance with %s", str(e))
        return ok
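
# --- Editorial sketch (hedged): a fetch/update round trip with
# ProvenanceProvider, under the same ConfigUtil assumption as the schema
# example above; the provenance key shown is illustrative.
def demoProvenance(cfgOb, cachePath="/tmp/prov-cache"):
    pP = ProvenanceProvider(cfgOb, cachePath, useCache=True)
    provD = pP.fetch()                                        # cached provenance dictionary
    provD["demo_provenance_item"] = {"source": "editorial example"}  # illustrative key
    ok = pP.update(provD)                                     # merge and re-export
    logger.info("update status %r item count %d", ok, len(pP.fetch()))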