def publish_json_tables(prod_folder: str = "v2") -> Response:
    """Convert the global CSV tables to JSON and upload them to the prod bucket."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            prod_folder,
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x) for token in forbid_tokens),
        )
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        logger.log_info("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
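# A minimal sketch (not part of the pipeline) of what the download filter
# above admits; the sample paths below are hypothetical.
from pathlib import Path

_forbid_tokens = ("/", "main.", "aggregated.")

def _csv_filter(x: Path) -> bool:
    return x.suffix == ".csv" and all(token not in str(x) for token in _forbid_tokens)

assert _csv_filter(Path("epidemiology.csv"))
assert not _csv_filter(Path("main.csv"))             # "main." is a forbidden token
assert not _csv_filter(Path("US/epidemiology.csv"))  # nested paths contain "/"
assert not _csv_filter(Path("aggregated.json"))      # wrong suffix and a forbidden token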
def publish_json_locations(
    prod_folder: str = "v2",
    location_key_from: str = None,
    location_key_until: str = None,
) -> Response:
    """Convert the per-location CSV tables to JSON and upload them to the prod bucket."""
    prod_folder = _get_request_param("prod_folder", prod_folder)
    location_key_from = _get_request_param("location_key_from", location_key_from)
    location_key_until = _get_request_param("location_key_until", location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Convert the tables to JSON for each location independently
        location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [key for key in location_keys if key >= location_key_from]
        if location_key_until is not None:
            location_keys = [key for key in location_keys if key <= location_key_until]
        logger.log_info(
            f"Converting {len(location_keys)} location subsets to JSON "
            f"from {location_keys[0]} until {location_keys[-1]}"
        )

        # Download all the processed tables into our local storage
        def match_path(table_path: Path) -> bool:
            try:
                if prod_folder == "v2":
                    location_key, table_name = str(table_path).split("/", 1)
                    return table_name == "main.csv" and location_key in location_keys
                elif prod_folder == "v3":
                    location_path, location_key = table_path.parent.name, table_path.stem
                    return location_path == "location" and location_key in location_keys
                else:
                    return False
            except ValueError:
                # Paths without the expected <location_key>/<table_name> shape are skipped
                return False

        download_folder(GCS_BUCKET_PROD, prod_folder, input_folder, match_path)
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        converted_count = sum(1 for _ in output_folder.glob("**/*.json"))
        logger.log_info(f"Converted {converted_count} files to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
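# A minimal sketch of the inclusive, lexicographic key-range filtering used
# above; the keys shown here are hypothetical examples of location keys.
location_keys = ["AD", "AE", "AF", "US", "US_CA", "ZW"]
location_key_from, location_key_until = "AF", "US_CA"
subset = [key for key in location_keys if location_key_from <= key <= location_key_until]
assert subset == ["AF", "US", "US_CA"]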
def publish_json_tables(prod_folder: str = "v2") -> Response:
    """Convert the global CSV tables to JSON and upload them to the prod bucket."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the processed tables into our local storage
        download_folder(
            GCS_BUCKET_PROD,
            prod_folder,
            input_folder,
            lambda x: all(token not in str(x) for token in ("/", "main.")),
        )

        # Convert all files to JSON
        list(convert_tables_to_json(input_folder, output_folder))
        logger.log_info("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
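# The list(...) wrapper above suggests convert_tables_to_json may yield its
# results lazily; a hypothetical sketch of why a lazy converter must be
# drained before the upload step runs:
def _convert_lazily(paths):
    for path in paths:
        yield path  # the actual conversion work would happen here

_gen = _convert_lazily(["a.csv", "b.csv"])
# Nothing has executed yet; draining the generator forces the work to happen.
_converted = list(_gen)
assert _converted == ["a.csv", "b.csv"]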
def test_convert_to_json(self):
    with temporary_directory() as workdir:

        # Copy all test tables into the temporary directory
        publish_global_tables(SRC / "test" / "data", workdir)

        # Copy test tables again but under a subpath
        subpath = workdir / "latest"
        subpath.mkdir()
        publish_global_tables(workdir, subpath)

        # Convert all the tables to JSON under a new path
        jsonpath = workdir / "json"
        jsonpath.mkdir()
        convert_tables_to_json(workdir, jsonpath)

        # The JSON files should maintain the same relative path
        for csv_file in workdir.glob("**/*.csv"):
            self.assertTrue((workdir / "json" / f"{csv_file.stem}.json").exists())
            self.assertTrue((workdir / "json" / "latest" / f"{csv_file.stem}.json").exists())
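# A sketch of the relative-path invariant the test above checks: a CSV at
# <workdir>/<subpath>/<name>.csv should map to <workdir>/json/<subpath>/<name>.json.
# The workdir shown is hypothetical.
from pathlib import Path

workdir = Path("/tmp/example")
csv_file = workdir / "latest" / "epidemiology.csv"
relative = csv_file.relative_to(workdir)
json_file = workdir / "json" / relative.with_suffix(".json")
assert json_file == workdir / "json" / "latest" / "epidemiology.json"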
def _convert_json(expr: str) -> str:
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        json_folder = workdir / "json"
        public_folder = workdir / "public"
        json_folder.mkdir(parents=True, exist_ok=True)
        public_folder.mkdir(parents=True, exist_ok=True)

        # Download all the processed tables into our local storage
        download_folder(GCS_BUCKET_PROD, "v2", public_folder, lambda x: re.match(expr, str(x)))

        # Convert all files to JSON
        list(convert_tables_to_json(public_folder, json_folder))
        print("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", json_folder)

    return "OK"
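# A hypothetical example of the expr parameter above: a regular expression
# matched against each remote path, e.g. to convert only per-location main
# tables. The path layout shown is an assumption for illustration.
import re

expr = r"^[A-Z]{2}(_[A-Z0-9]+)*/main\.csv$"
assert re.match(expr, "US_CA/main.csv")
assert not re.match(expr, "main.csv")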