def test_table_group_tail(self):
        """Check `table_group_tail` against a pandas group-by on every sample CSV.

        For each `*.csv` file under `SRC / "test" / "data"`, the output written
        by `table_group_tail` is compared with a reference built by grouping
        the table on its "key" column and aggregating with `agg_last_not_null`
        (presumably the last non-null value per column; confirm against its
        definition). Both outputs are compared as sorted lists of lines, so row
        order is ignored.
        """
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            for table_path in (SRC / "test" / "data").glob("*.csv"):
                table = read_table(table_path, schema=SCHEMA)
                test_output_path = workdir / f"latest_{table_path.name}"
                pandas_output_path = workdir / f"latest_pandas_{table_path.name}"

                # Create the latest slice of the given table
                table_group_tail(table_path, test_output_path)

                # Create a latest slice using pandas grouping
                expected = table.groupby("key").aggregate(agg_last_not_null).reset_index()
                export_csv(expected, path=pandas_output_path, schema=SCHEMA)

                # Converting to a CSV in memory sometimes produces out-of-order values
                test_result_lines = sorted(read_lines(test_output_path))
                pandas_result_lines = sorted(read_lines(pandas_output_path))

                # zip() stops at the shorter input, so a missing or extra row
                # would go unnoticed unless the line counts are compared first.
                self.assertEqual(
                    len(test_result_lines),
                    len(pandas_result_lines),
                    f"Line count mismatch for {table_path.name}",
                )

                for line1, line2 in zip(test_result_lines, pandas_result_lines):
                    self.assertEqual(line1, line2)
Exemple #2
0
 def subset_latest(csv_file: Path) -> Path:
     """Run `table_group_tail` on `csv_file` and return the resulting path.

     The result is written under `latest_path` (taken from the enclosing
     scope) using the same file name as the input.
     """
     destination = latest_path / csv_file.name
     table_group_tail(csv_file, destination)
     return destination