def test_read_cols(self):
    """Check that read_parquet_columns returns exactly the requested columns."""
    with TemporaryDirectory() as workdir:
        target = os.path.join(workdir, "myfile.parquet")

        # build a small frame and persist it to parquet via an atomic write
        source = pd.DataFrame({"a": [1, 2], "b": ["hello", "world"]})
        with atomic_write(target, as_file=False) as tmp_path:
            source.to_parquet(tmp_path, engine="pyarrow")

        # one read per column selection under test; each call gets its own
        # list, just like separate list literals would
        selections = (["a"], ["b"], ["a", "b"])
        frames = [read_parquet_columns(target, list(cols)) for cols in selections]

        # every result has to be a DataFrame
        for frame in frames:
            self.assertIsInstance(frame, pd.DataFrame)

        # and has to equal the matching slice of the source frame
        for cols, frame in zip(selections, frames):
            self.assertTrue(source[cols].equals(frame))
def test_convert_xlsx_to_parquet(self):
    """Verify that converting an xlsx file yields an equivalent parquet file."""
    # all files live in a throwaway directory that is removed, together with
    # its content, when the context manager exits
    with TemporaryDirectory() as workdir:
        xlsx_path = os.path.join(workdir, "myfile.xlsx")
        expected_parquet_path = os.path.join(workdir, "myfile.parquet")

        # write a small frame to the xlsx file through an atomic write
        source = pd.DataFrame({"a": [1, 2], "b": ["hello", "world"]})
        with atomic_write(xlsx_path, as_file=False) as tmp_path:
            source.to_excel(tmp_path)

        # run the conversion under test
        returned_path = convert_excel_to_parquet(xlsx_path)

        # the returned path must be the sibling .parquet file, and it must exist
        self.assertEqual(expected_parquet_path, returned_path)
        self.assertTrue(os.path.exists(expected_parquet_path))

        # the parquet content must match what was written to the xlsx file
        roundtrip = pd.read_parquet(expected_parquet_path, engine="pyarrow")
        self.assertTrue(source.equals(roundtrip))
def test_as_file_false(self):
    """Check that atomic_write yields a str when called with as_file=False."""
    with TemporaryDirectory() as workdir:
        destination = os.path.join(workdir, "asdf.txt")

        # with as_file=False the context manager should hand back a path
        # string rather than an open file object
        with atomic_write(destination, as_file=False) as handle:
            self.assertIsInstance(handle, str)
def test_file_exists(self):
    """Ensure atomic_write raises FileExistsError when the target already exists."""
    with TemporaryDirectory() as tmp:
        fp = os.path.join(tmp, "asdf.txt")

        # first atomic write creates the destination file
        with atomic_write(fp, "w") as f:
            f.write("asdf")
        self.assertTrue(os.path.exists(fp))

        # assertRaises (rather than try/except) so the test fails when the
        # second write does NOT raise, instead of passing vacuously
        with self.assertRaises(FileExistsError):
            with atomic_write(fp, "w") as f:
                f.write("asdf")
def test_atomic_failure(self):
    """Ensure neither the temp file nor the destination exists after a failed write."""
    with TemporaryDirectory() as tmp:
        fp = os.path.join(tmp, "asdf.txt")

        # simulate a failure in the middle of an atomic write
        with self.assertRaises(FakeFileFailure):
            with atomic_write(fp, "w") as f:
                tmpfile = f.name
                # TestCase assertions are used instead of bare `assert`,
                # which is stripped when Python runs with -O
                self.assertTrue(os.path.exists(tmpfile))
                raise FakeFileFailure()

        # both the temp file and the destination must be gone
        self.assertFalse(os.path.exists(tmpfile))
        self.assertFalse(os.path.exists(fp))
def test_atomic_write(self):
    """Ensure the destination exists with the expected content after a successful write."""
    with TemporaryDirectory() as tmp:
        fp = os.path.join(tmp, "asdf.txt")

        with atomic_write(fp, "w") as f:
            # the destination must not appear until the write completes;
            # TestCase assertions are used instead of bare `assert`, which
            # is stripped when Python runs with -O
            self.assertFalse(os.path.exists(fp))
            tmpfile = f.name
            f.write("asdf")

        # the temp file is gone and the destination now exists
        self.assertFalse(os.path.exists(tmpfile))
        self.assertTrue(os.path.exists(fp))

        # the destination holds exactly what was written
        with open(fp) as written:
            self.assertEqual(written.read(), "asdf")