def publish() -> str:
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        tables_folder = workdir / "tables"
        public_folder = workdir / "public"
        tables_folder.mkdir(parents=True, exist_ok=True)
        public_folder.mkdir(parents=True, exist_ok=True)

        # Download all the combined tables into our local storage
        download_folder(GCS_BUCKET_TEST, "tables", tables_folder)

        # Prepare all files for publishing and add them to the public folder
        copy_tables(tables_folder, public_folder)
        print("Output tables copied to public folder")

        # Create the joint main table for all records
        main_table_path = public_folder / "main.csv"
        make_main_table(tables_folder, main_table_path)
        print("Main table created")

        # Create subsets for easy API-like access to slices of data
        list(create_table_subsets(main_table_path, public_folder))
        print("Table subsets created")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", public_folder)

        return "OK"
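
# The publish() flow above relies on a copy_tables helper whose implementation is
# not shown here. The following is a minimal sketch of what it might look like,
# assuming it simply mirrors every top-level CSV table from the input folder into
# the output folder; the function body is an assumption, not the repository's code.
from pathlib import Path
from shutil import copyfile


def copy_tables_sketch(tables_folder: Path, public_folder: Path) -> None:
    """Hypothetical helper: copy every top-level CSV table into the public folder."""
    for table_file in sorted(tables_folder.glob("*.csv")):
        copyfile(table_file, public_folder / table_file.name)
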
def test_make_main_table(self):
    with temporary_directory() as workdir:

        # Copy all test tables into the temporary directory
        copy_tables(SRC / "test" / "data", workdir)

        # Create the main table
        main_table_path = workdir / "main.csv"
        merge_output_tables(workdir, main_table_path)

        self._test_make_main_table_helper(main_table_path, {})
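
# This test uses a temporary_directory() context manager rather than the raw
# tempfile.TemporaryDirectory seen in the other snippets. A plausible sketch,
# assuming it is a thin wrapper that yields a pathlib.Path so callers can skip
# the explicit Path(workdir) cast; the wrapper below is an assumption.
from contextlib import contextmanager
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Iterator


@contextmanager
def temporary_directory_sketch() -> Iterator[Path]:
    """Hypothetical wrapper: yield the temporary directory as a Path object."""
    with TemporaryDirectory() as workdir:
        yield Path(workdir)
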
def test_make_main_table(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Copy all test tables into the temporary directory
        copy_tables(SRC / "test" / "data", workdir)

        # Create the main table
        main_table_path = workdir / "main.csv"
        make_main_table(workdir, main_table_path)
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all tables exist
        for pipeline in get_pipelines():
            if pipeline.table in EXCLUDE_FROM_MAIN_TABLE:
                continue
            for column_name in pipeline.schema.keys():
                self.assertTrue(
                    column_name in main_table.columns,
                    f"Column {column_name} missing from main table",
                )

        # Main table should follow a lexical sort (outside of header)
        main_table_records = []
        for line in read_lines(main_table_path):
            main_table_records.append(line)
        main_table_records = main_table_records[1:]
        self.assertListEqual(main_table_records, list(sorted(main_table_records)))

        # Make the main table easier to deal with since we optimize for memory usage
        main_table.set_index("key", inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        epi_basic = ["new_confirmed", "total_confirmed", "new_deceased", "total_deceased"]

        # Spot check: Country of Andorra
        self._spot_check_subset(main_table, "AD", epi_basic, "2020-03-02", "2020-09-01")
        # Spot check: State of New South Wales
        self._spot_check_subset(main_table, "AU_NSW", epi_basic, "2020-01-25", "2020-09-01")
        # Spot check: Alachua County
        self._spot_check_subset(main_table, "US_FL_12001", epi_basic, "2020-03-10", "2020-09-01")
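
# The spot checks above call a _spot_check_subset helper that is not defined in
# this snippet. A rough sketch of what such a check could verify, assuming the
# main table is a pandas DataFrame indexed by "key" with string ISO dates, and
# that the named columns should contain some data within the given date range.
# The method name, signature, and assertions here are assumptions for illustration.
def _spot_check_subset_sketch(self, main_table, key, columns, first_date, last_date):
    """Hypothetical check: the key exists and has usable data for every column."""
    subset = main_table.loc[[key]]
    subset = subset[(subset["date"] >= first_date) & (subset["date"] <= last_date)]
    self.assertGreater(len(subset), 0, f"No records found for {key}")
    for column_name in columns:
        self.assertTrue(
            subset[column_name].notna().any(),
            f"Column {column_name} has no data for {key}",
        )
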
def test_convert_to_json(self):
    with temporary_directory() as workdir:

        # Copy all test tables into the temporary directory
        copy_tables(SRC / "test" / "data", workdir)

        # Copy test tables again but under a subpath
        subpath = workdir / "latest"
        subpath.mkdir()
        copy_tables(workdir, subpath)

        # Convert all the tables to JSON under a new path
        jsonpath = workdir / "json"
        jsonpath.mkdir()
        convert_tables_to_json(workdir, jsonpath)

        # The JSON files should maintain the same relative path
        for csv_file in workdir.glob("**/*.csv"):
            self.assertTrue((workdir / "json" / f"{csv_file.stem}.json").exists())
            self.assertTrue((workdir / "json" / "latest" / f"{csv_file.stem}.json").exists())
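
# convert_tables_to_json is exercised above but not shown. A minimal sketch,
# assuming it walks the CSV files recursively and writes records-oriented JSON
# files that keep the same relative paths; the body and the pandas usage are
# assumptions, not the repository's implementation.
from pathlib import Path
from typing import Iterator

import pandas


def convert_tables_to_json_sketch(csv_folder: Path, output_folder: Path) -> Iterator[Path]:
    """Hypothetical converter: mirror <name>.csv as <name>.json under output_folder."""
    for csv_file in sorted(csv_folder.glob("**/*.csv")):
        json_file = output_folder / csv_file.relative_to(csv_folder).with_suffix(".json")
        json_file.parent.mkdir(parents=True, exist_ok=True)
        pandas.read_csv(csv_file).to_json(json_file, orient="records")
        yield json_file
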
def publish_tables() -> Response:
    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the combined tables into our local storage
        download_folder(GCS_BUCKET_TEST, "tables", input_folder)

        # TODO: perform some validation on the outputs and report errors
        # See: https://github.com/GoogleCloudPlatform/covid-19-open-data/issues/186

        # Prepare all files for publishing and add them to the public folder
        copy_tables(input_folder, output_folder)
        logger.log_info("Output tables copied to public folder")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", output_folder)

        return Response("OK", status=200)
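
# Both publish entry points rely on download_folder and upload_folder helpers for
# moving files between GCS buckets and local disk. Below is a sketch of how they
# might be implemented with the google-cloud-storage client; the call sites above
# fix the signatures, but the bodies are assumptions rather than the actual helpers.
from pathlib import Path

from google.cloud import storage


def download_folder_sketch(bucket_name: str, remote_path: str, local_folder: Path) -> None:
    """Hypothetical helper: download every blob under remote_path into local_folder."""
    client = storage.Client()
    for blob in client.list_blobs(bucket_name, prefix=f"{remote_path}/"):
        rel_path = blob.name[len(remote_path) + 1 :]
        if not rel_path:
            continue
        local_file = local_folder / rel_path
        local_file.parent.mkdir(parents=True, exist_ok=True)
        blob.download_to_filename(str(local_file))


def upload_folder_sketch(bucket_name: str, remote_path: str, local_folder: Path) -> None:
    """Hypothetical helper: upload every file under local_folder to remote_path."""
    bucket = storage.Client().bucket(bucket_name)
    for local_file in local_folder.glob("**/*"):
        if local_file.is_file():
            rel_path = local_file.relative_to(local_folder).as_posix()
            bucket.blob(f"{remote_path}/{rel_path}").upload_from_filename(str(local_file))
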