Example 1
    def test_convert_csv_to_json_records(self):
        for json_convert_method in (
                _convert_csv_to_json_records_fast,
                _convert_csv_to_json_records_slow,
        ):
            with TemporaryDirectory() as workdir:
                workdir = Path(workdir)

                for csv_file in pbar([*(SRC / "test" / "data").glob("*.csv")],
                                     leave=False):
                    json_output = workdir / csv_file.name.replace(
                        "csv", "json")
                    json_convert_method(SCHEMA, csv_file, json_output)

                    with json_output.open("r") as fd:
                        json_obj = json.load(fd)
                        json_df = DataFrame(data=json_obj["data"],
                                            columns=json_obj["columns"])

                    csv_test_file = workdir / json_output.name.replace(
                        "json", "csv")
                    export_csv(json_df, csv_test_file, schema=SCHEMA)

                    for line1, line2 in zip(read_lines(csv_file),
                                            read_lines(csv_test_file)):
                        self.assertEqual(line1, line2)
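
The converter implementations themselves are not part of this excerpt. Below is a minimal sketch of a records-style CSV-to-JSON conversion, assuming only the {"columns": [...], "data": [...]} layout that the test reads back; the function name is hypothetical and the project's _convert_csv_to_json_records_fast/_slow may differ.

# Minimal sketch only, assuming the {"columns": [...], "data": [...]} layout read by the test.
import csv
import json
from pathlib import Path

def convert_csv_to_json_records_sketch(csv_path: Path, json_path: Path) -> None:
    with open(csv_path, newline="") as fd:
        reader = csv.reader(fd)
        columns = next(reader)          # first row holds the column names
        data = [row for row in reader]  # remaining rows become the list of records
    with open(json_path, "w") as fd:
        json.dump({"columns": columns, "data": data}, fd)
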
    def test_table_breakout(self):
        test_csv = """col1,col2
        foo,1
        foo,2
        bar,3
        bar,4
        baz,5
        baz,6
        """

        expected_foo = """col1,col2
        foo,1
        foo,2
        """

        expected_bar = """col1,col2
        bar,3
        bar,4
        """

        expected_baz = """col1,col2
        baz,5
        baz,6
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            input_file = workdir / "in.csv"
            with open(input_file, "w") as fd:
                for line in test_csv.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")

            output_folder = workdir / "outputs"
            output_folder.mkdir(exist_ok=True, parents=True)
            table_breakout(input_file, output_folder, "col1")

            expected = expected_foo
            csv_output = output_folder / "foo" / "in.csv"
            for line1, line2 in zip(expected.split("\n"),
                                    read_lines(csv_output)):
                self.assertEqual(line1.strip(), line2.strip())

            expected = expected_bar
            csv_output = output_folder / "bar" / "in.csv"
            for line1, line2 in zip(expected.split("\n"),
                                    read_lines(csv_output)):
                self.assertEqual(line1.strip(), line2.strip())

            expected = expected_baz
            csv_output = output_folder / "baz" / "in.csv"
            for line1, line2 in zip(expected.split("\n"),
                                    read_lines(csv_output)):
                self.assertEqual(line1.strip(), line2.strip())
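
table_breakout itself is not shown in this excerpt. The sketch below only illustrates the behavior the test exercises: split a CSV into one file per distinct value of the breakout column, repeating the header in each output and naming each file after the input. The function name is hypothetical and the real implementation may work differently.

# Illustrative sketch only, not the project's table_breakout implementation.
import csv
from pathlib import Path

def table_breakout_sketch(input_file: Path, output_folder: Path, breakout_column: str) -> None:
    with open(input_file, newline="") as fd:
        reader = csv.reader(fd)
        header = next(reader)
        col_idx = header.index(breakout_column)
        writers = {}
        handles = []
        try:
            for row in reader:
                value = row[col_idx]
                if value not in writers:
                    # One output folder per distinct value, file named after the input
                    out_dir = output_folder / value
                    out_dir.mkdir(parents=True, exist_ok=True)
                    handle = open(out_dir / input_file.name, "w", newline="")
                    handles.append(handle)
                    writer = csv.writer(handle)
                    writer.writerow(header)  # repeat the header in every output
                    writers[value] = writer
                writers[value].writerow(row)
        finally:
            for handle in handles:
                handle.close()
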
    def _compare_tables_equal(self, table1: Path, table2: Path) -> None:
        cols1 = get_table_columns(table1)
        cols2 = get_table_columns(table2)
        self.assertEqual(set(cols1), set(cols2))

        # Converting to a CSV in memory sometimes produces out-of-order values
        records1 = list(read_lines(table1))
        records2 = list(read_lines(table2))
        self.assertEqual(len(records1), len(records2))

        reader1 = csv.reader(records1)
        reader2 = csv.reader(records2)
        for record1, record2 in zip(reader1, reader2):
            record1 = {col: val for col, val in zip(cols1, record1)}
            record2 = {col: val for col, val in zip(cols2, record2)}
            self.assertEqual(record1, record2)
    def test_table_rename(self):
        test_csv = """col1,col2,col3
        a,1,foo
        b,2,bar
        c,3,foo
        d,4,bar
        """

        expected = """cola,colb
        a,1
        b,2
        c,3
        d,4
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            input_file = workdir / "in.csv"
            with open(input_file, "w") as fd:
                for line in test_csv.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")
            output_file = workdir / "out.csv"
            table_rename(input_file, output_file, {"col1": "cola", "col2": "colb", "col3": None})

            for line1, line2 in zip(expected.split("\n"), read_lines(output_file)):
                self.assertEqual(line1.strip(), line2.strip())
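
table_rename is likewise not shown here. A minimal sketch matching the behavior the test expects follows: columns are renamed according to the mapping and columns mapped to None are dropped; keeping unmapped columns under their original name is an assumption of this sketch, and the function name is hypothetical.

# Illustrative sketch only, not the project's table_rename implementation.
import csv
from pathlib import Path
from typing import Dict, Optional

def table_rename_sketch(input_file: Path, output_file: Path,
                        column_map: Dict[str, Optional[str]]) -> None:
    with open(input_file, newline="") as fd_in, open(output_file, "w", newline="") as fd_out:
        reader = csv.reader(fd_in)
        writer = csv.writer(fd_out)
        header = next(reader)
        # Keep only columns whose mapping is not None, preserving their original order
        keep = [idx for idx, col in enumerate(header) if column_map.get(col, col) is not None]
        writer.writerow([column_map.get(header[idx], header[idx]) for idx in keep])
        for row in reader:
            writer.writerow([row[idx] for idx in keep])
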
Example 5
def _subset_grouped_key(main_table_path: Path,
                        output_folder: Path,
                        desc: str = None) -> Iterable[Path]:
    """ Outputs a subsets of the table with only records with a particular key """

    # Read the header of the main file to get the columns
    with open(main_table_path, "r") as fd:
        header = next(fd)

    # Do a first sweep to get the number of keys so we can accurately report progress
    key_set = set()
    for line in read_lines(main_table_path, skip_empty=True):
        key, data = line.split(",", 1)
        key_set.add(key)

    # We make use of the main table being sorted by <key, date> and do a linear sweep of the file
    # assuming that once the key changes we won't see it again in future lines
    key_folder: Path = None
    current_key: str = None
    file_handle: TextIO = None
    progress_bar = pbar(total=len(key_set), desc=desc)
    for idx, line in enumerate(read_lines(main_table_path, skip_empty=True)):
        key, data = line.split(",", 1)

        # Skip the header line
        if idx == 0:
            continue

        # When the key changes, close the previous file handle and open a new one
        if current_key != key:
            if file_handle:
                file_handle.close()
            if key_folder:
                yield key_folder / "main.csv"
            current_key = key
            key_folder = output_folder / key
            key_folder.mkdir(exist_ok=True)
            file_handle = (key_folder / "main.csv").open("w")
            file_handle.write(f"{header}")
            progress_bar.update(1)

        file_handle.write(f"{key},{data}")

    # Close the last file handle and yield the final per-key subset
    file_handle.close()
    yield key_folder / "main.csv"
    progress_bar.close()
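
A hypothetical usage sketch of the generator above; the paths and the downstream handling are placeholders, not part of the original code. Each per-key main.csv path is yielded only after its file handle has been closed.

# Hypothetical usage of _subset_grouped_key; paths are placeholders.
from pathlib import Path

for key_csv_path in _subset_grouped_key(Path("main.csv"), Path("subsets"), desc="Breaking out keys"):
    print(f"wrote per-key subset: {key_csv_path}")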
Example 6
    def _test_lexicographical_order(self, file_path: Path):
        last_line = ""
        for idx, line in enumerate(read_lines(file_path)):
            if idx > 0:
                key1 = line.split(",", 1)[0].replace("_", "-")
                key2 = last_line.split(",", 1)[0].replace("_", "-")
                msg = f"Keys in {file_path.name} must follow lexicographical order: {key1} ≤ {key2}"
                self.assertGreater(key1, key2, msg)
                last_line = line

    def test_table_sort(self):
        test_csv = """col1,col2,col3
        a,1,foo
        d,4,bar
        c,3,foo
        b,2,bar
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            input_file = workdir / "in.csv"
            with open(input_file, "w") as fd:
                for line in test_csv.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")

            # Sort using the default (first) column
            output_file_1 = workdir / "out.csv"
            table_sort(input_file, output_file_1)

            output_file_2 = workdir / "pandas.csv"
            read_table(input_file).sort_values(["col1"]).to_csv(output_file_2,
                                                                index=False)

            for line1, line2 in zip(read_lines(output_file_1),
                                    read_lines(output_file_2)):
                self.assertEqual(line1.strip(), line2.strip())

            # Sort by each column in order
            for sort_column in ("col1", "col2", "col3"):

                output_file_1 = workdir / "out.csv"
                table_sort(input_file, output_file_1, [sort_column])

                output_file_2 = workdir / "pandas.csv"
                read_table(input_file).sort_values([sort_column]).to_csv(
                    output_file_2, index=False)

                for line1, line2 in zip(read_lines(output_file_1),
                                        read_lines(output_file_2)):
                    self.assertEqual(line1.strip(), line2.strip())
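
table_sort is not included in this excerpt. The sketch below only mirrors the behavior the test checks against pandas: sort the data rows by the requested columns, defaulting to the first column, while keeping the header first. The function name is hypothetical and the real implementation may sort out of memory.

# Illustrative in-memory sketch only, not the project's table_sort implementation.
import csv
from pathlib import Path
from typing import List, Optional

def table_sort_sketch(input_file: Path, output_file: Path,
                      sort_columns: Optional[List[str]] = None) -> None:
    with open(input_file, newline="") as fd:
        reader = csv.reader(fd)
        header = next(reader)
        rows = list(reader)
    # Default to sorting by the first column, matching the test's expectation
    if not sort_columns:
        sort_columns = [header[0]]
    indices = [header.index(col) for col in sort_columns]
    rows.sort(key=lambda row: tuple(row[idx] for idx in indices))
    with open(output_file, "w", newline="") as fd:
        writer = csv.writer(fd)
        writer.writerow(header)
        writer.writerows(rows)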
Example 8
    def _test_make_main_table_helper(self, main_table_path: Path,
                                     column_adapter: Dict[str, str]):
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all tables exist
        for pipeline in get_pipelines():
            for column_name in pipeline.schema.keys():
                column_name = column_adapter.get(column_name)
                if column_name is not None:
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

        # Main table should follow a lexical sort (outside of header)
        main_table_records = []
        for line in read_lines(main_table_path):
            main_table_records.append(line)
        main_table_records = main_table_records[1:]
        self.assertListEqual(main_table_records,
                             list(sorted(main_table_records)))

        # Make sure that all columns present in the index table are in the main table
        main_table_columns = set(get_table_columns(main_table_path))
        index_table_columns = set(
            get_table_columns(SRC / "test" / "data" / "index.csv"))
        for column in index_table_columns:
            column = column_adapter.get(column, column)
            self.assertTrue(column in main_table_columns,
                            f"{column} not in main")

        # Make the main table easier to deal with since we optimize for memory usage
        location_key = "location_key" if "location_key" in main_table.columns else "key"
        main_table.set_index(location_key, inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        column_prefixes = ("new", "total", "cumulative")
        column_filter = lambda col: col.split("_")[0] in column_prefixes and "age" not in col
        columns = list(filter(column_filter, main_table.columns))
        self.assertGreaterEqual(len({col.split("_")[0] for col in columns}), 2)
        main_table = main_table[["date"] + columns]

        # Spot check: Country of Andorra
        self._spot_check_subset(main_table, "AD", "2020-09-01", "2020-12-31")

        # Spot check: State of New South Wales
        self._spot_check_subset(main_table, "AU_NSW", "2020-09-01",
                                "2020-12-31")

        # Spot check: Alachua County
        self._spot_check_subset(main_table, "US_FL_12001", "2020-09-01",
                                "2020-12-31")
Example 9
    def test_cross_product(self):
        csv1 = """col1,col2
        a,1
        b,2
        c,3
        d,4
        """

        csv2 = """col3,col4
        1,a
        2,b
        3,c
        4,d
        """

        expected = """col1,col2,col3,col4
        a,1,1,a
        a,1,2,b
        a,1,3,c
        a,1,4,d
        b,2,1,a
        b,2,2,b
        b,2,3,c
        b,2,4,d
        c,3,1,a
        c,3,2,b
        c,3,3,c
        c,3,4,d
        d,4,1,a
        d,4,2,b
        d,4,3,c
        d,4,4,d
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            with open(workdir / "1.csv", "w") as fd:
                for line in csv1.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")
            with open(workdir / "2.csv", "w") as fd:
                for line in csv2.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")

            output_file = workdir / "out.csv"
            table_cross_product(workdir / "1.csv", workdir / "2.csv",
                                output_file)

            for line1, line2 in zip(expected.split("\n"),
                                    read_lines(output_file)):
                self.assertEqual(line1.strip(), line2.strip())
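
table_cross_product is not shown in this excerpt. A minimal sketch of the output the test expects follows: every data row of the first table is paired with every data row of the second under a combined header. The function name is hypothetical.

# Illustrative sketch only, not the project's table_cross_product implementation.
import csv
from pathlib import Path

def table_cross_product_sketch(left_csv: Path, right_csv: Path, output_file: Path) -> None:
    with open(left_csv, newline="") as fd:
        left_rows = list(csv.reader(fd))
    with open(right_csv, newline="") as fd:
        right_rows = list(csv.reader(fd))
    with open(output_file, "w", newline="") as fd:
        writer = csv.writer(fd)
        writer.writerow(left_rows[0] + right_rows[0])  # combined header
        for left in left_rows[1:]:
            for right in right_rows[1:]:
                writer.writerow(left + right)
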
    def test_table_group_tail(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            for table_path in (SRC / "test" / "data").glob("*.csv"):
                table = read_table(table_path, schema=SCHEMA)
                test_output_path = workdir / f"latest_{table_path.name}"
                pandas_output_path = workdir / f"latest_pandas_{table_path.name}"

                # Create the latest slice of the given table
                table_group_tail(table_path, test_output_path)

                # Create a latest slice using pandas grouping
                table = table.groupby("key").aggregate(agg_last_not_null).reset_index()
                export_csv(table, path=pandas_output_path, schema=SCHEMA)

                # Converting to a CSV in memory sometimes produces out-of-order values
                test_result_lines = sorted(read_lines(test_output_path))
                pandas_result_lines = sorted(read_lines(pandas_output_path))

                for line1, line2 in zip(test_result_lines, pandas_result_lines):
                    self.assertEqual(line1, line2)
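
agg_last_not_null is referenced but not defined in this excerpt. Below is a minimal sketch of a compatible aggregator for the groupby(...).aggregate(...) call above, assuming it simply keeps the last non-null value of each column within a group; the project's own helper may differ in details.

# Minimal sketch of a "last non-null value wins" aggregator; name is hypothetical.
from pandas import Series

def agg_last_not_null_sketch(series: Series):
    non_null = series.dropna()
    # Fall back to None when every value in the group is null
    return non_null.iloc[-1] if len(non_null) else None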
Example 11
    def test_make_main_table(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            # Copy all test tables into the temporary directory
            copy_tables(SRC / "test" / "data", workdir)

            # Create the main table
            main_table_path = workdir / "main.csv"
            make_main_table(workdir, main_table_path)
            main_table = read_table(main_table_path, schema=SCHEMA)

            # Verify that all columns from all tables exist
            for pipeline in get_pipelines():
                if pipeline.table in EXCLUDE_FROM_MAIN_TABLE:
                    continue
                for column_name in pipeline.schema.keys():
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

            # Main table should follow a lexical sort (outside of header)
            main_table_records = []
            for line in read_lines(main_table_path):
                main_table_records.append(line)
            main_table_records = main_table_records[1:]
            self.assertListEqual(main_table_records,
                                 list(sorted(main_table_records)))

            # Make the main table easier to deal with since we optimize for memory usage
            main_table.set_index("key", inplace=True)
            main_table["date"] = main_table["date"].astype(str)

            # Define sets of columns to check
            epi_basic = [
                "new_confirmed", "total_confirmed", "new_deceased",
                "total_deceased"
            ]

            # Spot check: Country of Andorra
            self._spot_check_subset(main_table, "AD", epi_basic, "2020-03-02",
                                    "2020-09-01")

            # Spot check: State of New South Wales
            self._spot_check_subset(main_table, "AU_NSW", epi_basic,
                                    "2020-01-25", "2020-09-01")

            # Spot check: Alachua County
            self._spot_check_subset(main_table, "US_FL_12001", epi_basic,
                                    "2020-03-10", "2020-09-01")