def test_convert_csv_to_json_records(self):
    for json_convert_method in (
        _convert_csv_to_json_records_fast,
        _convert_csv_to_json_records_slow,
    ):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            for csv_file in pbar([*(SRC / "test" / "data").glob("*.csv")], leave=False):
                json_output = workdir / csv_file.name.replace("csv", "json")
                json_convert_method(SCHEMA, csv_file, json_output)

                with json_output.open("r") as fd:
                    json_obj = json.load(fd)
                    json_df = DataFrame(data=json_obj["data"], columns=json_obj["columns"])

                csv_test_file = workdir / json_output.name.replace("json", "csv")
                export_csv(json_df, csv_test_file, schema=SCHEMA)

                for line1, line2 in zip(read_lines(csv_file), read_lines(csv_test_file)):
                    self.assertEqual(line1, line2)

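# A minimal sketch of what the converters above are expected to produce, inferred from
# the test: a JSON object with "columns" (header names) and "data" (row values). The
# function name and the use of the csv module here are illustrative assumptions, not
# the repository's actual implementation of either converter.
def _convert_csv_to_json_records_sketch(schema: dict, csv_file: Path, json_output: Path) -> None:
    import csv
    import json

    with open(csv_file, "r") as fd:
        reader = csv.reader(fd)
        columns = next(reader)  # the first row holds the column names
        data = [row for row in reader]  # the remaining rows hold the values

    with open(json_output, "w") as fd:
        # Schema-driven type casting is omitted in this sketch
        json.dump({"columns": columns, "data": data}, fd)
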
def test_table_breakout(self):
    test_csv = """col1,col2
    foo,1
    foo,2
    bar,3
    bar,4
    baz,5
    baz,6
    """
    expected_foo = """col1,col2
    foo,1
    foo,2
    """
    expected_bar = """col1,col2
    bar,3
    bar,4
    """
    expected_baz = """col1,col2
    baz,5
    baz,6
    """
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        input_file = workdir / "in.csv"
        with open(input_file, "w") as fd:
            for line in test_csv.split("\n"):
                if not line.isspace():
                    fd.write(f"{line.strip()}\n")

        output_folder = workdir / "outputs"
        output_folder.mkdir(exist_ok=True, parents=True)
        table_breakout(input_file, output_folder, "col1")

        expected = expected_foo
        csv_output = output_folder / "foo" / "in.csv"
        for line1, line2 in zip(expected.split("\n"), read_lines(csv_output)):
            self.assertEqual(line1.strip(), line2.strip())

        expected = expected_bar
        csv_output = output_folder / "bar" / "in.csv"
        for line1, line2 in zip(expected.split("\n"), read_lines(csv_output)):
            self.assertEqual(line1.strip(), line2.strip())

        expected = expected_baz
        csv_output = output_folder / "baz" / "in.csv"
        for line1, line2 in zip(expected.split("\n"), read_lines(csv_output)):
            self.assertEqual(line1.strip(), line2.strip())

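# A minimal sketch of the behavior test_table_breakout exercises, inferred from its
# assertions: split a CSV into one folder per distinct value of `breakout_column`,
# writing a same-named file that repeats the header. This is an illustrative
# assumption, not the repository's actual table_breakout implementation.
def table_breakout_sketch(input_file: Path, output_folder: Path, breakout_column: str) -> None:
    import csv

    with open(input_file, "r") as fd:
        reader = csv.reader(fd)
        header = next(reader)
        col_idx = header.index(breakout_column)

        writers, handles = {}, {}
        for row in reader:
            value = row[col_idx]
            if value not in writers:
                # First time we see this value: create its folder and write the header
                subfolder = output_folder / value
                subfolder.mkdir(exist_ok=True, parents=True)
                handles[value] = open(subfolder / input_file.name, "w", newline="")
                writers[value] = csv.writer(handles[value])
                writers[value].writerow(header)
            writers[value].writerow(row)

        for handle in handles.values():
            handle.close()
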
def _compare_tables_equal(self, table1: Path, table2: Path) -> None:
    cols1 = get_table_columns(table1)
    cols2 = get_table_columns(table2)
    self.assertEqual(set(cols1), set(cols2))

    # Converting to a CSV in memory sometimes produces out-of-order values
    records1 = list(read_lines(table1))
    records2 = list(read_lines(table2))
    self.assertEqual(len(records1), len(records2))

    reader1 = csv.reader(records1)
    reader2 = csv.reader(records2)
    for record1, record2 in zip(reader1, reader2):
        # Compare records as dicts so differing column order does not fail the test
        record1 = {col: val for col, val in zip(cols1, record1)}
        record2 = {col: val for col, val in zip(cols2, record2)}
        self.assertEqual(record1, record2)

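# get_table_columns is used throughout these tests; a plausible minimal definition,
# assumed here purely for illustration, reads the header row of the CSV file.
def get_table_columns_sketch(table_path: Path) -> list:
    import csv

    with open(table_path, "r") as fd:
        return next(csv.reader(fd))
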
def test_table_rename(self):
    test_csv = """col1,col2,col3
    a,1,foo
    b,2,bar
    c,3,foo
    d,4,bar
    """
    expected = """cola,colb
    a,1
    b,2
    c,3
    d,4
    """
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        input_file = workdir / "in.csv"
        with open(input_file, "w") as fd:
            for line in test_csv.split("\n"):
                if not line.isspace():
                    fd.write(f"{line.strip()}\n")

        output_file = workdir / "out.csv"
        table_rename(input_file, output_file, {"col1": "cola", "col2": "colb", "col3": None})

        for line1, line2 in zip(expected.split("\n"), read_lines(output_file)):
            self.assertEqual(line1.strip(), line2.strip())

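# A minimal sketch of the column adapter semantics implied by the test above: keys are
# existing column names, values are the new names, and a value of None drops the
# column. Illustrative assumption only, not the repository's actual table_rename.
def table_rename_sketch(input_file: Path, output_file: Path, column_adapter: Dict[str, str]) -> None:
    import csv

    with open(input_file, "r") as fd_in, open(output_file, "w", newline="") as fd_out:
        reader = csv.reader(fd_in)
        writer = csv.writer(fd_out)
        header = next(reader)

        # Keep only columns not mapped to None, renaming the ones in the adapter
        keep = [idx for idx, col in enumerate(header) if column_adapter.get(col, col) is not None]
        writer.writerow([column_adapter.get(header[idx], header[idx]) for idx in keep])
        for row in reader:
            writer.writerow([row[idx] for idx in keep])
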
def _subset_grouped_key(
    main_table_path: Path, output_folder: Path, desc: str = None
) -> Iterable[Path]:
    """ Outputs subsets of the table, one per key, each containing only that key's records """

    # Read the header of the main file to get the columns
    with open(main_table_path, "r") as fd:
        header = next(fd)

    # Do a first sweep to get the number of keys so we can accurately report progress
    key_set = set()
    for line in read_lines(main_table_path, skip_empty=True):
        key, data = line.split(",", 1)
        key_set.add(key)

    # We make use of the main table being sorted by <key, date> and do a linear sweep of the file
    # assuming that once the key changes we won't see it again in future lines
    key_folder: Path = None
    current_key: str = None
    file_handle: TextIO = None
    progress_bar = pbar(total=len(key_set), desc=desc)
    for idx, line in enumerate(read_lines(main_table_path, skip_empty=True)):
        key, data = line.split(",", 1)

        # Skip the header line
        if idx == 0:
            continue

        # When the key changes, close the previous file handle and open a new one
        if current_key != key:
            if file_handle:
                file_handle.close()
            if key_folder:
                yield key_folder / "main.csv"
            current_key = key
            key_folder = output_folder / key
            key_folder.mkdir(exist_ok=True)
            file_handle = (key_folder / "main.csv").open("w")
            file_handle.write(f"{header}")
            progress_bar.update(1)

        file_handle.write(f"{key},{data}")

    # Close the last file handle, yield the last subset and we are done
    file_handle.close()
    yield key_folder / "main.csv"
    progress_bar.close()

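# Usage sketch for the generator above (the call site and folder layout are inferred
# from this file, not a verbatim call from the repository):
#
#   for subset_path in _subset_grouped_key(main_table_path, output_folder, desc="Subsets"):
#       print(f"Wrote per-key table: {subset_path}")
#
# Each yielded path is <output_folder>/<key>/main.csv, containing the header plus only
# the rows whose first column equals <key>. Because it is a generator, the files are
# only written as the caller consumes the yielded paths.
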
def _test_lexicographical_order(self, file_path: Path):
    last_line = ""
    for idx, line in enumerate(read_lines(file_path)):
        if idx > 0:
            key1 = line.split(",", 1)[0].replace("_", "-")
            key2 = last_line.split(",", 1)[0].replace("_", "-")
            msg = f"Keys in {file_path.name} must follow lexicographical order: expected {key2} < {key1}"
            self.assertGreater(key1, key2, msg)
        last_line = line

def test_table_sort(self):
    test_csv = """col1,col2,col3
    a,1,foo
    d,4,bar
    c,3,foo
    b,2,bar
    """
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        input_file = workdir / "in.csv"
        with open(input_file, "w") as fd:
            for line in test_csv.split("\n"):
                if not line.isspace():
                    fd.write(f"{line.strip()}\n")

        # Sort using the default (first) column
        output_file_1 = workdir / "out.csv"
        table_sort(input_file, output_file_1)

        output_file_2 = workdir / "pandas.csv"
        read_table(input_file).sort_values(["col1"]).to_csv(output_file_2, index=False)

        for line1, line2 in zip(read_lines(output_file_1), read_lines(output_file_2)):
            self.assertEqual(line1.strip(), line2.strip())

        # Sort by each column in order
        for sort_column in ("col1", "col2", "col3"):
            output_file_1 = workdir / "out.csv"
            table_sort(input_file, output_file_1, [sort_column])

            output_file_2 = workdir / "pandas.csv"
            read_table(input_file).sort_values([sort_column]).to_csv(output_file_2, index=False)

            for line1, line2 in zip(read_lines(output_file_1), read_lines(output_file_2)):
                self.assertEqual(line1.strip(), line2.strip())

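# A minimal sketch of the sorting behavior verified above: keep the header in place
# and sort the remaining rows by the requested columns, defaulting to the first
# column. The signature mirrors the test's usage of table_sort, but the body is an
# illustrative assumption (it loads all rows into memory, which the real
# implementation may avoid).
def table_sort_sketch(input_file: Path, output_file: Path, sort_columns: List[str] = None) -> None:
    import csv

    with open(input_file, "r") as fd:
        reader = csv.reader(fd)
        header = next(reader)
        rows = list(reader)

    # Default to sorting by the first column, matching the test's expectation
    indices = [header.index(col) for col in (sort_columns or [header[0]])]
    rows.sort(key=lambda row: tuple(row[idx] for idx in indices))

    with open(output_file, "w", newline="") as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        writer.writerows(rows)
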
def _test_make_main_table_helper(self, main_table_path: Path, column_adapter: Dict[str, str]):
    main_table = read_table(main_table_path, schema=SCHEMA)

    # Verify that all columns from all tables exist
    for pipeline in get_pipelines():
        for column_name in pipeline.schema.keys():
            column_name = column_adapter.get(column_name)
            if column_name is not None:
                self.assertTrue(
                    column_name in main_table.columns,
                    f"Column {column_name} missing from main table",
                )

    # Main table should follow a lexical sort (outside of header)
    main_table_records = []
    for line in read_lines(main_table_path):
        main_table_records.append(line)
    main_table_records = main_table_records[1:]
    self.assertListEqual(main_table_records, list(sorted(main_table_records)))

    # Make sure that all columns present in the index table are in the main table
    main_table_columns = set(get_table_columns(main_table_path))
    index_table_columns = set(get_table_columns(SRC / "test" / "data" / "index.csv"))
    for column in index_table_columns:
        column = column_adapter.get(column, column)
        self.assertTrue(column in main_table_columns, f"{column} not in main")

    # Make the main table easier to deal with since we optimize for memory usage
    location_key = "location_key" if "location_key" in main_table.columns else "key"
    main_table.set_index(location_key, inplace=True)
    main_table["date"] = main_table["date"].astype(str)

    # Define sets of columns to check
    column_prefixes = ("new", "total", "cumulative")
    column_filter = lambda col: col.split("_")[0] in column_prefixes and "age" not in col
    columns = list(filter(column_filter, main_table.columns))
    self.assertGreaterEqual(len({col.split("_")[0] for col in columns}), 2)
    main_table = main_table[["date"] + columns]

    # Spot check: Country of Andorra
    self._spot_check_subset(main_table, "AD", "2020-09-01", "2020-12-31")

    # Spot check: State of New South Wales
    self._spot_check_subset(main_table, "AU_NSW", "2020-09-01", "2020-12-31")

    # Spot check: Alachua County
    self._spot_check_subset(main_table, "US_FL_12001", "2020-09-01", "2020-12-31")

def test_cross_product(self):
    csv1 = """col1,col2
    a,1
    b,2
    c,3
    d,4
    """
    csv2 = """col3,col4
    1,a
    2,b
    3,c
    4,d
    """
    expected = """col1,col2,col3,col4
    a,1,1,a
    a,1,2,b
    a,1,3,c
    a,1,4,d
    b,2,1,a
    b,2,2,b
    b,2,3,c
    b,2,4,d
    c,3,1,a
    c,3,2,b
    c,3,3,c
    c,3,4,d
    d,4,1,a
    d,4,2,b
    d,4,3,c
    d,4,4,d
    """
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        with open(workdir / "1.csv", "w") as fd:
            for line in csv1.split("\n"):
                if not line.isspace():
                    fd.write(f"{line.strip()}\n")

        with open(workdir / "2.csv", "w") as fd:
            for line in csv2.split("\n"):
                if not line.isspace():
                    fd.write(f"{line.strip()}\n")

        output_file = workdir / "out.csv"
        table_cross_product(workdir / "1.csv", workdir / "2.csv", output_file)

        for line1, line2 in zip(expected.split("\n"), read_lines(output_file)):
            self.assertEqual(line1.strip(), line2.strip())

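# A minimal sketch of the Cartesian-product behavior checked above: every data row of
# the left table is paired with every data row of the right table, and the headers
# are concatenated. Illustrative assumption, not the repository's actual
# table_cross_product implementation.
def table_cross_product_sketch(left_file: Path, right_file: Path, output_file: Path) -> None:
    import csv
    from itertools import product

    with open(left_file, "r") as fd:
        left_rows = list(csv.reader(fd))
    with open(right_file, "r") as fd:
        right_rows = list(csv.reader(fd))

    with open(output_file, "w", newline="") as fd:
        writer = csv.writer(fd)
        writer.writerow(left_rows[0] + right_rows[0])  # combined header
        for row1, row2 in product(left_rows[1:], right_rows[1:]):
            writer.writerow(row1 + row2)
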
def test_table_group_tail(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        for table_path in (SRC / "test" / "data").glob("*.csv"):
            table = read_table(table_path, schema=SCHEMA)
            test_output_path = workdir / f"latest_{table_path.name}"
            pandas_output_path = workdir / f"latest_pandas_{table_path.name}"

            # Create the latest slice of the given table
            table_group_tail(table_path, test_output_path)

            # Create a latest slice using pandas grouping
            table = table.groupby("key").aggregate(agg_last_not_null).reset_index()
            export_csv(table, path=pandas_output_path, schema=SCHEMA)

            # Converting to a CSV in memory sometimes produces out-of-order values
            test_result_lines = sorted(read_lines(test_output_path))
            pandas_result_lines = sorted(read_lines(pandas_output_path))

            for line1, line2 in zip(test_result_lines, pandas_result_lines):
                self.assertEqual(line1, line2)

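# agg_last_not_null is the pandas aggregation used above; a plausible definition,
# assumed here for illustration, keeps the last non-null value of each series so the
# "tail" of every group reflects the most recently reported value per column.
def agg_last_not_null_sketch(series: "pandas.Series"):
    non_null = series.dropna()
    return non_null.iloc[-1] if len(non_null) > 0 else None
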
def test_make_main_table(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Copy all test tables into the temporary directory
        copy_tables(SRC / "test" / "data", workdir)

        # Create the main table
        main_table_path = workdir / "main.csv"
        make_main_table(workdir, main_table_path)
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all tables exist
        for pipeline in get_pipelines():
            if pipeline.table in EXCLUDE_FROM_MAIN_TABLE:
                continue
            for column_name in pipeline.schema.keys():
                self.assertTrue(
                    column_name in main_table.columns,
                    f"Column {column_name} missing from main table",
                )

        # Main table should follow a lexical sort (outside of header)
        main_table_records = []
        for line in read_lines(main_table_path):
            main_table_records.append(line)
        main_table_records = main_table_records[1:]
        self.assertListEqual(main_table_records, list(sorted(main_table_records)))

        # Make the main table easier to deal with since we optimize for memory usage
        main_table.set_index("key", inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        epi_basic = ["new_confirmed", "total_confirmed", "new_deceased", "total_deceased"]

        # Spot check: Country of Andorra
        self._spot_check_subset(main_table, "AD", epi_basic, "2020-03-02", "2020-09-01")

        # Spot check: State of New South Wales
        self._spot_check_subset(main_table, "AU_NSW", epi_basic, "2020-01-25", "2020-09-01")

        # Spot check: Alachua County
        self._spot_check_subset(main_table, "US_FL_12001", epi_basic, "2020-03-10", "2020-09-01")