def publish() -> str:
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        tables_folder = workdir / "tables"
        public_folder = workdir / "public"
        tables_folder.mkdir(parents=True, exist_ok=True)
        public_folder.mkdir(parents=True, exist_ok=True)

        # Download all the combined tables into our local storage
        download_folder(GCS_BUCKET_TEST, "tables", tables_folder)

        # Prepare all files for publishing and add them to the public folder
        copy_tables(tables_folder, public_folder)
        print("Output tables copied to public folder")

        # Create the joint main table for all records
        main_table_path = public_folder / "main.csv"
        make_main_table(tables_folder, main_table_path)
        print("Main table created")

        # Create subsets for easy API-like access to slices of data
        list(create_table_subsets(main_table_path, public_folder))
        print("Table subsets created")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", public_folder)

    return "OK"
    def test_make_main_table(self):
        with temporary_directory() as workdir:

            # Copy all test tables into the temporary directory
            copy_tables(SRC / "test" / "data", workdir)

            # Create the main table
            main_table_path = workdir / "main.csv"
            merge_output_tables(workdir, main_table_path)

            self._test_make_main_table_helper(main_table_path, {})
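
copy_tables appears in every example on this page. Assuming it simply mirrors the CSV files from one folder to another, a minimal sketch could be:

# Minimal sketch of a copy_tables-style helper; the real helper may filter or rename
# files, this version only mirrors the top-level CSV tables between folders
import shutil
from pathlib import Path


def copy_tables(tables_folder: Path, public_folder: Path) -> None:
    for table_path in tables_folder.glob("*.csv"):
        shutil.copy(table_path, public_folder / table_path.name)
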
Example #3
    def test_make_main_table(self):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)

            # Copy all test tables into the temporary directory
            copy_tables(SRC / "test" / "data", workdir)

            # Create the main table
            main_table_path = workdir / "main.csv"
            make_main_table(workdir, main_table_path)
            main_table = read_table(main_table_path, schema=SCHEMA)

            # Verify that all columns from all tables exist
            for pipeline in get_pipelines():
                if pipeline.table in EXCLUDE_FROM_MAIN_TABLE:
                    continue
                for column_name in pipeline.schema.keys():
                    self.assertTrue(
                        column_name in main_table.columns,
                        f"Column {column_name} missing from main table",
                    )

            # Main table should follow a lexical sort (outside of header)
            main_table_records = list(read_lines(main_table_path))[1:]
            self.assertListEqual(main_table_records, sorted(main_table_records))

            # Make the main table easier to deal with since we optimize for memory usage
            main_table.set_index("key", inplace=True)
            main_table["date"] = main_table["date"].astype(str)

            # Define sets of columns to check
            epi_basic = [
                "new_confirmed", "total_confirmed", "new_deceased",
                "total_deceased"
            ]

            # Spot check: Country of Andorra
            self._spot_check_subset(main_table, "AD", epi_basic, "2020-03-02",
                                    "2020-09-01")

            # Spot check: State of New South Wales
            self._spot_check_subset(main_table, "AU_NSW", epi_basic,
                                    "2020-01-25", "2020-09-01")

            # Spot check: Alachua County
            self._spot_check_subset(main_table, "US_FL_12001", epi_basic,
                                    "2020-03-10", "2020-09-01")
Example #4
    def test_convert_to_json(self):
        with temporary_directory() as workdir:

            # Copy all test tables into the temporary directory
            copy_tables(SRC / "test" / "data", workdir)

            # Copy test tables again but under a subpath
            subpath = workdir / "latest"
            subpath.mkdir()
            copy_tables(workdir, subpath)

            # Convert all the tables to JSON under a new path
            jsonpath = workdir / "json"
            jsonpath.mkdir()
            convert_tables_to_json(workdir, jsonpath)

            # The JSON files should maintain the same relative path
            for csv_file in workdir.glob("**/*.csv"):
                self.assertTrue((workdir / "json" / f"{csv_file.stem}.json").exists())
                self.assertTrue((workdir / "json" / "latest" / f"{csv_file.stem}.json").exists())
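
convert_tables_to_json is only exercised through its output above. A minimal sketch, assuming each CSV becomes a JSON file with "columns" and "data" fields under the same relative path:

# Minimal sketch; the real converter may stream rows or apply a schema instead of
# loading each table into memory with pandas
import json
from pathlib import Path

import pandas as pd


def convert_tables_to_json(csv_folder: Path, output_folder: Path) -> None:
    for csv_file in csv_folder.glob("**/*.csv"):
        json_path = output_folder / csv_file.relative_to(csv_folder).with_suffix(".json")
        json_path.parent.mkdir(parents=True, exist_ok=True)
        table = pd.read_csv(csv_file)
        with open(json_path, "w") as fh:
            json.dump({"columns": list(table.columns), "data": table.values.tolist()}, fh)
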
def publish_tables() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the combined tables into our local storage
        download_folder(GCS_BUCKET_TEST, "tables", input_folder)

        # TODO: perform some validation on the outputs and report errors
        # See: https://github.com/GoogleCloudPlatform/covid-19-open-data/issues/186

        # Prepare all files for publishing and add them to the public folder
        copy_tables(input_folder, output_folder)
        logger.log_info("Output tables copied to public folder")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", output_folder)

    return Response("OK", status=200)
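
Both publish() above and publish_tables() here rely on download_folder and upload_folder to sync a bucket prefix with a local folder. A minimal sketch, assuming the google-cloud-storage client library and that the bucket constants are plain bucket names; the real helpers may parallelize transfers or handle retries:

# Minimal sketch using google-cloud-storage; assumes blobs live directly under the
# given prefix and map 1:1 onto local files
from pathlib import Path

from google.cloud import storage


def download_folder(bucket_name: str, prefix: str, output_folder: Path) -> None:
    client = storage.Client()
    for blob in client.list_blobs(bucket_name, prefix=f"{prefix}/"):
        if blob.name.endswith("/"):
            continue
        file_path = output_folder / Path(blob.name).relative_to(prefix)
        file_path.parent.mkdir(parents=True, exist_ok=True)
        blob.download_to_filename(str(file_path))


def upload_folder(bucket_name: str, prefix: str, input_folder: Path) -> None:
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    for file_path in input_folder.glob("**/*"):
        if file_path.is_file():
            rel_path = file_path.relative_to(input_folder).as_posix()
            bucket.blob(f"{prefix}/{rel_path}").upload_from_filename(str(file_path))
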