def test_table_group_tail(self):
    """Check ``table_group_tail`` against a pandas-based reference.

    For every CSV file under ``SRC / "test" / "data"``, the latest slice is
    produced twice: once with ``table_group_tail`` and once by grouping the
    table on ``"key"`` and aggregating with ``agg_last_not_null``. The two
    outputs must contain the same lines, ignoring order.
    """
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        for table_path in (SRC / "test" / "data").glob("*.csv"):
            table = read_table(table_path, schema=SCHEMA)
            test_output_path = workdir / f"latest_{table_path.name}"
            pandas_output_path = workdir / f"latest_pandas_{table_path.name}"

            # Create the latest slice of the given table
            table_group_tail(table_path, test_output_path)

            # Create a latest slice using pandas grouping
            table = table.groupby("key").aggregate(agg_last_not_null).reset_index()
            export_csv(table, path=pandas_output_path, schema=SCHEMA)

            # Converting to a CSV in memory sometimes produces out-of-order values
            test_result_lines = sorted(read_lines(test_output_path))
            pandas_result_lines = sorted(read_lines(pandas_output_path))

            # zip() stops at the shorter input, so a truncated or empty output
            # would otherwise pass unnoticed; compare the line counts first.
            self.assertEqual(
                len(test_result_lines),
                len(pandas_result_lines),
                f"Line count mismatch for {table_path.name}",
            )
            for line1, line2 in zip(test_result_lines, pandas_result_lines):
                self.assertEqual(line1, line2)
def subset_latest(csv_file: Path) -> Path:
    """Write the group-tail slice of ``csv_file`` and return its location.

    The result is written to a file with the same name as ``csv_file``
    inside ``latest_path``.
    NOTE(review): ``latest_path`` is not defined in this block; presumably it
    comes from an enclosing or module scope -- confirm.

    Args:
        csv_file: Path of the CSV table to process.

    Returns:
        Path of the file that ``table_group_tail`` was asked to write.
    """
    destination = latest_path / csv_file.name
    table_group_tail(csv_file, destination)
    return destination