Esempio n. 1
0
class MarshalUtil(object):
    """Wrapper for serialization and deserialization methods.

    Locators that FileUtil reports as local are read or written in place by
    IoUtil.  Other locators are staged through a temporary directory created
    under the working path and transferred with FileUtil.get()/FileUtil.put().
    """

    def __init__(self, **kwargs):
        """Configure the working path and the naming of temporary directories.

        Args:
            workPath (str, optional): directory under which temporary directories are created. Defaults to ".".
            workDirSuffix (str, optional): suffix for temporary directory names. Defaults to "marshall_".
            workDirPrefix (str, optional): prefix for temporary directory names. Defaults to "_tempdir".
        """
        self.__workPath = kwargs.get("workPath", ".")
        self.__workDirSuffix = kwargs.get("workDirSuffix", "marshall_")
        # Read from its own key so that prefix and suffix can be set independently.
        self.__workDirPrefix = kwargs.get("workDirPrefix", "_tempdir")
        #
        self.__fileU = FileUtil(workPath=self.__workPath)
        self.__ioU = IoUtil()

    def doExport(self,
                 locator,
                 obj,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Serialize the input object at locator path in specified format.  The
        input object is optionally preprocessed by the helper method.

        Args:
            locator (str): target path or URI
            obj (object): data to be serialized
            fmt (str, optional): format for serialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): pre-processor method applied to input data object. Defaults to None.
            numParts (int, optional): serialize the data in parts. Defaults to None. (json and pickle formats)
            **kwargs: passed through to the helper, the serializer and the file transfer method

        Returns:
            bool: True for success or False otherwise (any exception is logged and reported as False)
        """
        # Bound before the try block so the final return is always defined.
        ret = False
        try:
            localFlag = self.__fileU.isLocal(locator)
            myObj = marshalHelper(obj, **kwargs) if marshalHelper else obj
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serializeInParts(localFilePath,
                                                  myObj,
                                                  numParts,
                                                  fmt=fmt,
                                                  **kwargs)
            elif localFlag:
                localFilePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.serialize(localFilePath,
                                           myObj,
                                           fmt=fmt,
                                           workPath=self.__workPath,
                                           **kwargs)
            else:
                with tempfile.TemporaryDirectory(
                        suffix=self.__workDirSuffix,
                        prefix=self.__workDirPrefix,
                        dir=self.__workPath) as tmpDirName:
                    # write a local copy then copy to destination -
                    #
                    # tmpDirName is already located under the working path, so
                    # joining the working path again would double a relative path.
                    localFilePath = os.path.join(
                        tmpDirName, self.__fileU.getFileName(locator))
                    ok1 = self.__ioU.serialize(localFilePath,
                                               myObj,
                                               fmt=fmt,
                                               workPath=self.__workPath,
                                               **kwargs)
                    ok2 = True
                    if ok1:
                        # Transfer only when the local copy was written successfully.
                        ok2 = self.__fileU.put(localFilePath, locator,
                                               **kwargs)
                ret = ok1 and ok2
        except Exception as e:
            logger.exception("Exporting locator %r failing with %s", locator,
                             str(e))

        return ret

    def doImport(self,
                 locator,
                 fmt="list",
                 marshalHelper=None,
                 numParts=None,
                 **kwargs):
        """Deserialize data at the target locator in specified format. The deserialized
        data is optionally post-processed by the input helper method.

        Args:
            locator (str): path or URI to input data
            fmt (str, optional): format for deserialization (mmcif, tdd, csv, list). Defaults to "list".
            marshalHelper (method, optional): post-processor method applied to deserialized data object. Defaults to None.
            numParts (int, optional): deserialize the data in parts. Defaults to None. (json and pickle formats)
            tarMember (str, optional): name of a member of tar file bundle. Defaults to None. (tar file format)

        Returns:
            Any: format specific return type, or None if an exception is raised
        """
        ret = None
        try:
            tarMember = kwargs.get("tarMember", None)
            # A tar member is always fetched into a temporary directory, even
            # when the bundle itself is a local file.
            localFlag = self.__fileU.isLocal(locator) and not tarMember
            #
            if localFlag and numParts and fmt in ["json", "pickle"]:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserializeInParts(filePath,
                                                    numParts,
                                                    fmt=fmt,
                                                    **kwargs)
            elif localFlag:
                filePath = self.__fileU.getFilePath(locator)
                ret = self.__ioU.deserialize(filePath,
                                             fmt=fmt,
                                             workPath=self.__workPath,
                                             **kwargs)
            else:
                #
                if fmt == "mmcif":
                    # The locator is handed to the reader directly (no staged copy).
                    ret = self.__ioU.deserialize(locator,
                                                 fmt=fmt,
                                                 workPath=self.__workPath,
                                                 **kwargs)
                else:
                    with tempfile.TemporaryDirectory(
                            suffix=self.__workDirSuffix,
                            prefix=self.__workDirPrefix,
                            dir=self.__workPath) as tmpDirName:
                        #
                        # Fetch first then read a local copy -
                        #
                        # tmpDirName is already located under the working path,
                        # so the working path is not joined a second time.
                        if tarMember:
                            localFilePath = os.path.join(tmpDirName, tarMember)
                        else:
                            localFilePath = os.path.join(
                                tmpDirName, self.__fileU.getFileName(locator))

                        # ---  Local copy approach ---
                        self.__fileU.get(locator, localFilePath, **kwargs)
                        ret = self.__ioU.deserialize(localFilePath,
                                                     fmt=fmt,
                                                     workPath=self.__workPath,
                                                     **kwargs)

            if marshalHelper:
                ret = marshalHelper(ret, **kwargs)
        except Exception as e:
            logger.exception("Importing locator %r failing with %s", locator,
                             str(e))
            ret = None
        return ret

    def exists(self, filePath, mode=os.R_OK):
        """Return the result of FileUtil.exists() for the input path and access mode."""
        return self.__fileU.exists(filePath, mode=mode)

    def mkdir(self, dirPath, mode=0o755):
        """Return the result of FileUtil.mkdir() for the input directory path and mode."""
        return self.__fileU.mkdir(dirPath, mode=mode)

    def remove(self, pth):
        """Return the result of FileUtil.remove() for the input path."""
        return self.__fileU.remove(pth)
Esempio n. 2
0
class IoUtilTests(unittest.TestCase):
    def setUp(self):
        """Define the input and output file paths, create the IoUtil instance and start the timer.

        Input files are located in the mock data tree under TOPDIR and output
        files are written beneath the "test-output" directory under HERE.
        """
        self.__verbose = True
        #
        # Shared path prefixes
        mockTopPath = os.path.join(TOPDIR, "rcsb", "mock-data")
        dictPath = os.path.join(mockTopPath, "dictionaries")
        sandboxPath = os.path.join(mockTopPath, "MOCK_EXCHANGE_SANDBOX")
        self.__workPath = os.path.join(HERE, "test-output")
        outPath = self.__workPath
        #
        # Input resources
        self.__pathPdbxDictionaryFile = os.path.join(dictPath, "mmcif_pdbx_v5_next.dic")
        self.__pathJsonTestFile = os.path.join(dictPath, "vrpt_dictmap.json")
        self.__pathIndexFile = os.path.join(sandboxPath, "update-lists", "all-pdb-list")
        self.__pathCifFile = os.path.join(mockTopPath, "MOCK_BIRD_CC_REPO", "0", "PRDCC_000010.cif")
        self.__pathInsilicoFile = os.path.join(sandboxPath, "status", "theoretical_model.tsv")
        self.__pathFastaFile = os.path.join(sandboxPath, "sequence", "pdb_seq_prerelease.fasta")
        self.__pathTaxonomyFile = os.path.join(mockTopPath, "NCBI", "names.dmp.gz")
        self.__pathSiftsFile = os.path.join(mockTopPath, "sifts-summary", "pdb_chain_go.csv.gz")
        #
        # Output targets
        self.__pathSaveDictionaryFile = os.path.join(outPath, "mmcif_pdbx_v5_next.dic")
        self.__pathSaveJsonTestFile = os.path.join(outPath, "json-content.json")
        self.__pathSaveIndexFile = os.path.join(outPath, "all-pdb-list")
        self.__pathSaveCifFile = os.path.join(outPath, "cif-content.cif")
        self.__pathSavePickleFile = os.path.join(outPath, "json-content.pic")
        self.__pathSaveTextFile = os.path.join(outPath, "json-content.txt")
        self.__pathSaveInsilicoFile = os.path.join(outPath, "saved-theoretical_model.tsv")
        self.__pathSaveFastaFile = os.path.join(outPath, "test-pre-release.fasta")
        self.__pathSaveTaxonomyFilePic = os.path.join(outPath, "taxonomy_names.pic")
        self.__pathSaveTaxonomyFileCsv = os.path.join(outPath, "taxonomy_names.csv")
        #
        self.__ioU = IoUtil()
        self.__startTime = time.time()
        logger.debug("Running tests on version %s", __version__)
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Starting %s at %s", self.id(), timeStamp)

    def tearDown(self):
        """Log the completion time and the elapsed time since setUp() for the current test."""
        elapsedSeconds = time.time() - self.__startTime
        timeStamp = time.strftime("%Y %m %d %H:%M:%S", time.localtime())
        logger.debug("Completed %s at %s (%.4f seconds)", self.id(), timeStamp, elapsedSeconds)

    @unittest.skipIf(sys.version_info[0] < 3, "not compatible with Python 2")
    def testReadCsvIter(self):
        """Test iterating over the rows of a large CSV file with leading comments.

        Rows with fewer than six fields are logged as errors, and the total
        row count is checked against a minimum.
        """
        try:
            rowIter = self.__ioU.deserializeCsvIter(
                self.__pathSiftsFile,
                delimiter=",",
                rowFormat="list",
                encodingErrors="ignore",
            )
            # numRows keeps the running 1-based row index, i.e. the row count after the loop.
            numRows = 0
            for numRows, rowL in enumerate(rowIter, start=1):
                if len(rowL) < 6:
                    logger.error("Failing with row %r", rowL)
            self.assertGreater(numRows, 25000000)

            logger.info("Row count is %d", numRows)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteInParts(self):
        """Test round-tripping a list and a dictionary through multi-part JSON files.

        The dictionary is read back twice: once with the explicit number of
        parts and once with the number of parts left as None.
        """
        try:
            self.maxDiff = None
            partCount = 4
            #
            # --- list payload: the same element list repeated ---
            elementL = [100, 200, 300, 400, 500]
            listData = [elementL] * 12483
            listPath = os.path.join(self.__workPath, "list-data.json")
            wroteOk = self.__ioU.serializeInParts(listPath, listData, partCount, fmt="json", indent=3)
            self.assertTrue(wroteOk)
            listRead = self.__ioU.deserializeInParts(listPath, partCount, fmt="json")
            logger.info("Reading %d parts with total length %d", partCount, len(listRead))
            self.assertEqual(listData, listRead)
            #
            # --- dictionary payload: string keys mapped to the same ordered dictionary ---
            valueD = OrderedDict([("a", 100), ("b", 100), ("c", 100)])
            dictData = OrderedDict((str(idx), valueD) for idx in range(20341))
            partCount = 4
            dictPath = os.path.join(self.__workPath, "dict-data.json")
            wroteOk = self.__ioU.serializeInParts(dictPath, dictData, partCount, fmt="json", indent=3)
            self.assertTrue(wroteOk)
            dictRead = self.__ioU.deserializeInParts(dictPath, partCount, fmt="json")
            logger.info("Reading %d parts with total length %d", partCount, len(dictRead))
            self.assertDictEqual(dictData, dictRead)
            #
            # --- same files read without specifying the number of parts ---
            dictRead = self.__ioU.deserializeInParts(dictPath, None, fmt="json")
            logger.info("Reading %d globbed parts with total length %d", partCount, len(dictRead))
            self.assertDictEqual(dictData, dictRead)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadDictionaryFile(self):
        """Test reading a PDBx/mmCIF dictionary text file; at least one container is expected."""
        try:
            containerL = self.__ioU.deserialize(self.__pathPdbxDictionaryFile, fmt="mmcif-dict")
            numContainers = len(containerL)
            logger.debug("Dictionary container list %d", numContainers)
            self.assertGreaterEqual(numContainers, 1)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadCifFile(self):
        """Test reading a PDBx/mmCIF text file; at least one container is expected."""
        try:
            containerL = self.__ioU.deserialize(self.__pathCifFile, fmt="mmcif")
            numContainers = len(containerL)
            logger.debug("Container list %d", numContainers)
            self.assertGreaterEqual(numContainers, 1)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadListFile(self):
        """Test reading a list text file; at least 1000 entries are expected."""
        try:
            entryL = self.__ioU.deserialize(self.__pathIndexFile, fmt="list")
            numEntries = len(entryL)
            logger.debug("List length %d", numEntries)
            self.assertGreaterEqual(numEntries, 1000)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadJsonFile(self):
        """Test reading a JSON file; a non-empty object is expected."""
        try:
            jsonObj = self.__ioU.deserialize(self.__pathJsonTestFile, fmt="json")
            objLength = len(jsonObj)
            logger.debug("Object length %d", objLength)
            self.assertGreaterEqual(objLength, 1)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteDictionaryFiles(self):
        """Test reading a PDBx/mmCIF dictionary text file and writing it to the output path."""
        try:
            containerL = self.__ioU.deserialize(self.__pathPdbxDictionaryFile, fmt="mmcif-dict")
            numContainers = len(containerL)
            logger.debug("Dictionary container list %d", numContainers)
            self.assertGreaterEqual(numContainers, 1)
            #
            wroteOk = self.__ioU.serialize(self.__pathSaveDictionaryFile, containerL, fmt="mmcif-dict")
            self.assertTrue(wroteOk)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteCifFile(self):
        """Test reading a PDBx/mmCIF text file and writing it to the output path."""
        try:
            containerL = self.__ioU.deserialize(self.__pathCifFile, fmt="mmcif")
            numContainers = len(containerL)
            logger.debug("Container list %d", numContainers)
            self.assertGreaterEqual(numContainers, 1)
            #
            wroteOk = self.__ioU.serialize(self.__pathSaveCifFile, containerL, fmt="mmcif")
            self.assertTrue(wroteOk)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteJsonFile(self):
        """Test reading a JSON file and writing the object to the output path as JSON."""
        try:
            jsonObj = self.__ioU.deserialize(self.__pathJsonTestFile, fmt="json")
            objLength = len(jsonObj)
            logger.debug("Object length %d", objLength)
            self.assertGreaterEqual(objLength, 1)
            #
            wroteOk = self.__ioU.serialize(self.__pathSaveJsonTestFile, jsonObj, fmt="json")
            self.assertTrue(wroteOk)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteListFile(self):
        """Test reading a list text file, writing it back, and splitting every entry into fields."""
        try:
            entryL = self.__ioU.deserialize(self.__pathIndexFile, fmt="list")
            logger.debug("List element %r length %d", entryL[0], len(entryL))
            self.assertGreaterEqual(len(entryL), 1000)
            wroteOk = self.__ioU.serialize(self.__pathSaveIndexFile, entryL, fmt="list")
            self.assertTrue(wroteOk)
            # The field total is not checked; the split only exercises each entry.
            _ = sum(len(entry.split()) for entry in entryL)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWritePickleFile(self):
        """Test a pickle round trip of an object read from JSON, followed by a text dump."""
        try:
            jsonObj = self.__ioU.deserialize(self.__pathJsonTestFile, fmt="json")
            objLength = len(jsonObj)
            logger.debug("Object length %d", objLength)
            self.assertGreaterEqual(objLength, 1)
            #
            # Pickle round trip - the recovered object must equal the source object
            wroteOk = self.__ioU.serialize(self.__pathSavePickleFile, jsonObj, fmt="pickle")
            self.assertTrue(wroteOk)
            pickledObj = self.__ioU.deserialize(self.__pathSavePickleFile, fmt="pickle")
            self.assertDictEqual(jsonObj, pickledObj)
            #
            # Text dump of the same object
            wroteOk = self.__ioU.serialize(self.__pathSaveTextFile, jsonObj, fmt="text-dump")
            self.assertTrue(wroteOk)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteListWithEncodingFile(self):
        """Test reading and writing a list text file with non-ascii encoding."""
        try:
            rowL = self.__ioU.deserialize(self.__pathInsilicoFile, fmt="list")
            numRows = len(rowL)
            logger.debug("Insilico List length %d", numRows)
            self.assertGreaterEqual(numRows, 1450)
            #
            wroteOk = self.__ioU.serialize(self.__pathSaveInsilicoFile, rowL, fmt="list")
            self.assertTrue(wroteOk)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteFastaFile(self):
        """Test reading a FASTA sequence file (prerelease comment style) and writing it back."""
        try:
            seqD = self.__ioU.deserialize(
                self.__pathFastaFile,
                fmt="fasta",
                commentStyle="prerelease",
            )
            logger.debug("Sequence length %d", len(seqD.keys()))
            self.assertGreaterEqual(len(seqD), 500)
            #
            wroteOk = self.__ioU.serialize(self.__pathSaveFastaFile, seqD, fmt="fasta")
            self.assertTrue(wroteOk)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()

    def testReadWriteTaxonomyFile(self):
        """Test reading the taxonomy names file and writing pickle and CSV derivatives.

        Rows with at least seven fields are collected as CSV records.  Names
        are also grouped by taxonomy identifier: the scientific name under
        "sn" and common names, synonyms and genbank common names under "cn".
        """
        try:
            taxRowL = self.__ioU.deserialize(self.__pathTaxonomyFile, fmt="tdd", rowFormat="list")
            logger.info("Taxonomy length %d", len(taxRowL))
            self.assertGreaterEqual(len(taxRowL), 500)
            #
            otherNameTypes = ("common name", "synonym", "genbank common name")
            taxD = {}
            csvRowL = []
            for rowL in taxRowL:
                if len(rowL) >= 7:
                    taxId = int(rowL[0])
                    name, nameType = rowL[2], rowL[6]
                    csvRowL.append({"t": taxId, "name": name, "type": nameType})
                    #
                    if nameType == "scientific name":
                        taxD.setdefault(taxId, {})["sn"] = name
                    elif nameType in otherNameTypes:
                        taxD.setdefault(taxId, {}).setdefault("cn", []).append(name)
            #
            wroteOk = self.__ioU.serialize(self.__pathSaveTaxonomyFilePic, taxD, fmt="pickle")
            self.assertTrue(wroteOk)
            wroteOk = self.__ioU.serialize(self.__pathSaveTaxonomyFileCsv, csvRowL, fmt="csv")
            self.assertTrue(wroteOk)
            #
            # Read the CSV file back as dictionaries and check the row count
            taxRowL = self.__ioU.deserialize(self.__pathSaveTaxonomyFileCsv, fmt="csv", rowFormat="dict")
            self.assertTrue(len(taxRowL) > 2880000)
        except Exception as exc:
            logger.exception("Failing with %s", str(exc))
            self.fail()