Esempio n. 1
0
    def test_read_cols(self):
        """Check that read_parquet_columns returns exactly the requested columns."""
        with TemporaryDirectory() as workdir:
            target = os.path.join(workdir, "myfile.parquet")

            # Reference frame that every column subset is compared against.
            expected = pd.DataFrame({"a": [1, 2], "b": ["hello", "world"]})

            # Persist the reference frame as parquet through atomic_write;
            # with as_file=False pandas receives whatever the context yields
            # and writes to it directly.
            with atomic_write(target, as_file=False) as tmp_target:
                expected.to_parquet(tmp_target, engine="pyarrow")

            # Read each column subset back, in the order they are checked below.
            selections = [["a"], ["b"], ["a", "b"]]
            frames = [read_parquet_columns(target, cols) for cols in selections]

            # Every result must be a DataFrame ...
            for frame in frames:
                self.assertIsInstance(frame, pd.DataFrame)

            # ... whose content equals the matching slice of the reference frame.
            for cols, frame in zip(selections, frames):
                self.assertTrue(expected[cols].equals(frame))
Esempio n. 2
0
    def test_convert_xlsx_to_parquet(self):
        """Check that an xlsx file is converted into an equivalent parquet file."""
        # Everything lives in a throwaway directory; it is removed together
        # with its contents once the context manager exits.
        with TemporaryDirectory() as workdir:
            # Build the source frame first, then the two paths involved.
            original = pd.DataFrame({"a": [1, 2], "b": ["hello", "world"]})
            source_path = os.path.join(workdir, "myfile.xlsx")
            expected_path = os.path.join(workdir, "myfile.parquet")

            # Store the frame as an xlsx workbook via atomic_write.
            with atomic_write(source_path, as_file=False) as tmp_target:
                original.to_excel(tmp_target)

            # Run the conversion under test.
            returned_path = convert_excel_to_parquet(source_path)

            # The returned path must be the expected parquet path ...
            self.assertEqual(expected_path, returned_path)
            # ... and a file must actually exist there.
            self.assertTrue(os.path.exists(expected_path))

            # The parquet content must match what was written to the workbook.
            converted = pd.read_parquet(expected_path, engine="pyarrow")
            self.assertTrue(original.equals(converted))
Esempio n. 3
0
    def test_as_file_false(self):
        """Check that atomic_write yields a string when as_file is False."""
        with TemporaryDirectory() as workdir:
            target = os.path.join(workdir, "asdf.txt")

            # With as_file=False the context manager is expected to hand back
            # a string (a temporary file path) instead of an open file object.
            writer = atomic_write(target, as_file=False)
            with writer as yielded:
                self.assertIsInstance(yielded, str)
Esempio n. 4
0
    def test_file_exists(self):
        """Ensure atomic_write raises FileExistsError when the file already exists."""
        with TemporaryDirectory() as tmp:
            # define path to file
            fp = os.path.join(tmp, "asdf.txt")

            # write atomically to file
            with atomic_write(fp, "w") as f:
                f.write("asdf")

            # ensure file exists (unittest assertion: not stripped under -O)
            self.assertTrue(os.path.exists(fp))

            # A second atomic write to the same path must raise. assertRaises
            # fails the test when no exception is raised at all, which a plain
            # try/except would silently let through.
            with self.assertRaises(FileExistsError):
                with atomic_write(fp, "w") as f:
                    f.write("asdf")
Esempio n. 5
0
    def test_atomic_failure(self):
        """Ensure neither the temp file nor the target exists after a failed write."""
        with TemporaryDirectory() as tmp:
            fp = os.path.join(tmp, "asdf.txt")

            # raise fake error while writing file atomically
            with self.assertRaises(FakeFileFailure):
                with atomic_write(fp, "w") as f:
                    # remember the temp file so its removal can be checked later
                    tmpfile = f.name
                    # unittest assertions are used instead of bare `assert`
                    # so the checks still run under `python -O`
                    self.assertTrue(os.path.exists(tmpfile))
                    raise FakeFileFailure()

            # ensure both the temp and destination files do not exist
            self.assertFalse(os.path.exists(tmpfile))
            self.assertFalse(os.path.exists(fp))
Esempio n. 6
0
    def test_atomic_write(self):
        """Ensure the target file exists with the written content after success."""
        with TemporaryDirectory() as tmp:
            fp = os.path.join(tmp, "asdf.txt")

            # perform an atomic write
            with atomic_write(fp, "w") as f:
                # the destination must not appear before the write completes;
                # unittest assertions are used instead of bare `assert` so the
                # checks still run under `python -O`
                self.assertFalse(os.path.exists(fp))
                tmpfile = f.name
                f.write("asdf")

            # ensure tmp file has been deleted
            self.assertFalse(os.path.exists(tmpfile))
            # ensure file to write to exists
            self.assertTrue(os.path.exists(fp))

            # ensure content of destination file is what we expect
            with open(fp) as written:
                self.assertEqual(written.read(), "asdf")