Example no. 1
0
 def __init__(self, **kwargs):
     """Initialize working paths and helper utilities.

     Keyword Args:
         workPath (str, optional): base working directory path. Defaults to ".".
         workDirSuffix (str, optional): suffix for temporary work directories.
             Defaults to "marshall_".
         workDirPrefix (str, optional): prefix for temporary work directories.
             Defaults to "_tempdir".
     """
     self.__workPath = kwargs.get("workPath", ".")
     self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
     # Bug fix: previously this read kwargs key "workDirSuffix", so the
     # "workDirPrefix" keyword was silently ignored and a caller-supplied
     # suffix overwrote the prefix as well.
     # NOTE(review): the defaults look swapped ("marshall_" reads like a
     # prefix, "_tempdir" like a suffix) — confirm against usage, but they
     # are preserved here for backward compatibility.
     self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir")
     #
     self.__fileU = FileUtil(workPath=self.__workPath)
     self.__ioU = IoUtil()
    def __rebuildCache(self,
                       targetUrl,
                       mapNameL,
                       outDirPath,
                       rawDirPath,
                       fmt="pickle",
                       useCache=True):
        """Fetch the UniProt selected id mapping resource file and extract
        UniProt Acc to 'mapIndex' mapping. Serialize the mapping as required.

        Args:
            targetUrl (str): source URL of the remote index file
            mapNameL (list): list of key mapping names to extract from the index
            outDirPath (str): directory path for processed mapping files
            rawDirPath (str): directory path for the raw downloaded index file
            fmt (str, optional): output format (pickle|json|tdd). Defaults to "pickle".
            useCache (bool, optional): use cached files. Defaults to True.

        Returns:
            (list, dict): list of mapping names, and od[uniprotId] = mapped value

                idmapping_selected.tab

                1. UniProtKB-AC
                2. UniProtKB-ID
                3. GeneID (EntrezGene)
                4. RefSeq
                5. GI
                6. PDB
                7. GO
                8. UniRef100
                9. UniRef90
                10. UniRef50
                11. UniParc
                12. PIR
                13. NCBI-taxon
                14. MIM
                15. UniGene
                16. PubMed
                17. EMBL
                18. EMBL-CDS
                19. Ensembl
                20. Ensembl_TRS
                21. Ensembl_PRO
                22. Additional PubMed

        """
        startTime = time.time()
        nL = mapNameL
        oD = {}
        # Bug fix: 'ok' was previously unbound when the cached file existed
        # but 'fmt' matched no branch, raising UnboundLocalError at the
        # completion log (masked by the broad except below).
        ok = False
        try:
            fileU = FileUtil()
            # Serialized cache file name/extension is derived from the format.
            fExt = "pic" if fmt == "pickle" else "json"
            fExt = "tdd" if fmt == "tdd" else fExt
            fN, _ = os.path.splitext(fileU.getFileName(targetUrl))
            mapFileName = fN + "-map." + fExt
            idMapPath = os.path.join(outDirPath, mapFileName)
            mU = MarshalUtil()
            if useCache and mU.exists(idMapPath):
                logger.info("Reading cached serialized file %r", idMapPath)
                if fmt in ["pickle", "json"]:
                    tD = mU.doImport(idMapPath, fmt=fmt)
                    nL = list(set(tD["idNameList"]))
                    oD = tD["uniprotMapD"]
                    logger.info("keys %r", list(oD.keys())[:10])
                    logger.info("nL %r", nL)
                    ok = True
                elif fmt == "tdd":
                    ioU = IoUtil()
                    it = ioU.deserializeCsvIter(idMapPath,
                                                delimiter="\t",
                                                rowFormat="list",
                                                encodingErrors="ignore")
                    # First row is the header ("UniProtId" + mapping names).
                    tL = next(it, [])
                    nL = tL[1:]
                    if len(nL) == 1:
                        # Single mapping: store a scalar value per accession.
                        for row in it:
                            oD[row[0]] = row[1]
                    else:
                        # Multiple mappings: store the list of mapped values.
                        for row in it:
                            oD[row[0]] = row[1:]
                    ok = True
            else:
                idPath = os.path.join(rawDirPath, fileU.getFileName(targetUrl))
                if not fileU.exists(idPath):
                    logger.info(
                        "Fetching selected UniProt idmapping data from %r in %r",
                        targetUrl, outDirPath)
                    ok = fileU.get(targetUrl, idPath)
                    if not ok:
                        # Bug fix: corrected log message typo ("downlowd").
                        logger.error("Failed to download %r", targetUrl)
                        return oD
                else:
                    logger.info("Using cached mapping file %r", idPath)
                # ---
                ioU = IoUtil()
                if fmt in ["pickle", "json"]:
                    if len(mapNameL) == 1:
                        # __mapRecordD holds 1-based column numbers; convert
                        # to 0-based row indices here.
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            oD[row[0]] = str(
                                row[self.__mapRecordD[mapNameL[0]] - 1])
                    else:
                        for row in ioU.deserializeCsvIter(
                                idPath,
                                delimiter="\t",
                                rowFormat="list",
                                encodingErrors="ignore"):
                            for mapName in mapNameL:
                                oD.setdefault(row[0], []).append(
                                    str(row[self.__mapRecordD[mapName] - 1]))
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    ok = mU.doExport(idMapPath, {
                        "idNameList": mapNameL,
                        "uniprotMapD": oD
                    },
                                     fmt=fmt)
                elif fmt == "tdd":
                    #
                    logger.info("Writing serialized mapping file %r",
                                idMapPath)
                    fU = FileUtil()
                    fU.mkdirForFile(idMapPath)
                    colNameL = []
                    colNameL.append("UniProtId")
                    colNameL.extend(mapNameL)
                    with open(idMapPath, "w", encoding="utf-8") as ofh:
                        ofh.write("%s\n" % "\t".join(colNameL))
                        if len(mapNameL) == 1:
                            idx = self.__mapRecordD[mapNameL[0]] - 1
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write("%s\t%s\n" % (row[0], row[idx]))
                        else:
                            # Column 0 (accession) plus the selected mapping columns.
                            idxL = [0]
                            idxL.extend([
                                self.__mapRecordD[mapName] - 1
                                for mapName in mapNameL
                            ])
                            for row in ioU.deserializeCsvIter(
                                    idPath,
                                    delimiter="\t",
                                    rowFormat="list",
                                    encodingErrors="ignore"):
                                ofh.write(
                                    "%s\n" %
                                    "\t".join([str(row[idx]) for idx in idxL]))
                            #
                    # Re-read the tdd file just written via the cache branch
                    # so the in-memory result matches the cached-read path.
                    nL, oD = self.__rebuildCache(targetUrl,
                                                 mapNameL,
                                                 outDirPath,
                                                 rawDirPath,
                                                 fmt=fmt,
                                                 useCache=True)
                    ok = True if nL and oD else False
            logger.info("Completed reload (%r) at %s (%.4f seconds)", ok,
                        time.strftime("%Y %m %d %H:%M:%S", time.localtime()),
                        time.time() - startTime)
        except Exception as e:
            logger.exception("Failing with %s", str(e))
        #
        return nL, oD
Example no. 3
0
 def setUp(self):
     """Configure fixture input paths and test-output locations."""
     self.__verbose = True
     # Hoist the shared path roots so each fixture path is a single join.
     mockDataPath = os.path.join(TOPDIR, "rcsb", "mock-data")
     dictPath = os.path.join(mockDataPath, "dictionaries")
     sandboxPath = os.path.join(mockDataPath, "MOCK_EXCHANGE_SANDBOX")
     #
     self.__pathPdbxDictionaryFile = os.path.join(dictPath, "mmcif_pdbx_v5_next.dic")
     self.__pathJsonTestFile = os.path.join(dictPath, "vrpt_dictmap.json")
     self.__pathIndexFile = os.path.join(sandboxPath, "update-lists", "all-pdb-list")
     self.__pathCifFile = os.path.join(mockDataPath, "MOCK_BIRD_CC_REPO", "0", "PRDCC_000010.cif")
     #
     self.__workPath = os.path.join(HERE, "test-output")
     outPath = self.__workPath
     self.__pathSaveDictionaryFile = os.path.join(outPath, "mmcif_pdbx_v5_next.dic")
     self.__pathSaveJsonTestFile = os.path.join(outPath, "json-content.json")
     self.__pathSaveIndexFile = os.path.join(outPath, "all-pdb-list")
     self.__pathSaveCifFile = os.path.join(outPath, "cif-content.cif")
     self.__pathSavePickleFile = os.path.join(outPath, "json-content.pic")
     self.__pathSaveTextFile = os.path.join(outPath, "json-content.txt")
     #
     #
     self.__pathInsilicoFile = os.path.join(sandboxPath, "status", "theoretical_model.tsv")
     self.__pathSaveInsilicoFile = os.path.join(outPath, "saved-theoretical_model.tsv")
     #
     # self.__pathVariantFastaFile = os.path.join(self.__mockTopPath, 'UniProt', 'uniprot_sprot_varsplic.fasta.gz')
     self.__pathFastaFile = os.path.join(sandboxPath, "sequence", "pdb_seq_prerelease.fasta")
     self.__pathSaveFastaFile = os.path.join(outPath, "test-pre-release.fasta")
     #
     self.__pathTaxonomyFile = os.path.join(mockDataPath, "NCBI", "names.dmp.gz")
     self.__pathSaveTaxonomyFilePic = os.path.join(outPath, "taxonomy_names.pic")
     self.__pathSaveTaxonomyFileCsv = os.path.join(outPath, "taxonomy_names.csv")
     #
     self.__pathSiftsFile = os.path.join(mockDataPath, "sifts-summary", "pdb_chain_go.csv.gz")
     #
     self.__ioU = IoUtil()
     self.__startTime = time.time()
     logger.debug("Running tests on version %s", __version__)
     logger.debug("Starting %s at %s", self.id(),
                  time.strftime("%Y %m %d %H:%M:%S", time.localtime()))