Example 1
def publish_json_tables(prod_folder: str = "v2") -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            prod_folder,
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x)
                                                 for token in forbid_tokens),
        )
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        logger.log_info("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
Example 2
def publish_json_locations(prod_folder: str = "v2",
                           location_key_from: str = None,
                           location_key_until: str = None) -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)
    location_key_from = _get_request_param("location_key_from",
                                           location_key_from)
    location_key_until = _get_request_param("location_key_until",
                                            location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Convert the tables to JSON for each location independently
        location_keys = list(
            table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [
                key for key in location_keys if key >= location_key_from
            ]
        if location_key_until is not None:
            location_keys = [
                key for key in location_keys if key <= location_key_until
            ]
        logger.log_info(
            f"Converting {len(location_keys)} location subsets to JSON "
            f"from {location_keys[0]} until {location_keys[-1]}")

        # Download all the processed tables into our local storage
        def match_path(table_path: Path) -> bool:
            try:
                if prod_folder == "v2":
                    location_key, table_name = str(table_path).split("/", 1)
                    return table_name == "main.csv" and location_key in location_keys
                elif prod_folder == "v3":
                    location_path, location_key = table_path.parent.name, table_path.stem
                    return location_path == "location" and location_key in location_keys
                # Unknown prod_folder values match nothing
                return False
            except ValueError:
                # Paths without a "/" separator cannot be unpacked above
                return False

        download_folder(GCS_BUCKET_PROD, prod_folder, input_folder, match_path)
        logger.log_info(
            f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files"
        )

        # Convert all files to JSON
        convert_tables_to_json(input_folder, output_folder)
        converted_count = sum(1 for _ in output_folder.glob("**/*.json"))
        logger.log_info(f"Converted {converted_count} files to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
Example 3
def publish_json_tables(prod_folder: str = "v2") -> Response:
    prod_folder = _get_request_param("prod_folder", prod_folder)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        # Download all the processed tables into our local storage
        download_folder(
            GCS_BUCKET_PROD,
            prod_folder,
            input_folder,
            lambda x: all(token not in str(x) for token in ("/", "main.")),
        )

        # Convert all files to JSON
        list(convert_tables_to_json(input_folder, output_folder))
        logger.log_info("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, prod_folder, output_folder)

    return Response("OK", status=200)
Example 4
    def test_convert_to_json(self):
        with temporary_directory() as workdir:

            # Copy all test tables into the temporary directory
            publish_global_tables(SRC / "test" / "data", workdir)

            # Copy test tables again but under a subpath
            subpath = workdir / "latest"
            subpath.mkdir()
            publish_global_tables(workdir, subpath)

            # Convert all the tables to JSON under a new path
            jsonpath = workdir / "json"
            jsonpath.mkdir()
            convert_tables_to_json(workdir, jsonpath)

            # The JSON files should maintain the same relative path
            for csv_file in workdir.glob("**/*.csv"):
                self.assertTrue((workdir / "json" / f"{csv_file.stem}.json").exists())
                self.assertTrue((workdir / "json" / "latest" / f"{csv_file.stem}.json").exists())
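The test above pins down the contract the publish handlers rely on: every table.csv under the input folder yields a table.json at the same relative path under the output folder. A minimal stand-in with that behavior, written against the standard library only, could look like the sketch below; the real convert_tables_to_json also reshapes the records, so this only illustrates the path handling.

# Simplified stand-in for convert_tables_to_json (illustration of the
# relative-path contract only, not the repository's actual record layout).
import csv
import json
from pathlib import Path
from typing import Iterator

def convert_tables_to_json_sketch(input_folder: Path, output_folder: Path) -> Iterator[Path]:
    for csv_file in input_folder.glob("**/*.csv"):
        # Mirror the CSV file's relative path, swapping the extension for .json
        json_file = output_folder / csv_file.relative_to(input_folder).with_suffix(".json")
        json_file.parent.mkdir(parents=True, exist_ok=True)
        with open(csv_file, newline="") as fd:
            rows = list(csv.reader(fd))
        if not rows:
            continue
        # First row is assumed to be the header, remaining rows the data records
        json_file.write_text(json.dumps({"columns": rows[0], "data": rows[1:]}))
        yield json_file

If the real function is likewise a generator, that would also explain why Examples 3 and 5 wrap the call in list(): nothing runs until the results are consumed.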
Example 5
def _convert_json(expr: str) -> str:
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        json_folder = workdir / "json"
        public_folder = workdir / "public"
        json_folder.mkdir(parents=True, exist_ok=True)
        public_folder.mkdir(parents=True, exist_ok=True)

        # Download all the processed tables into our local storage
        download_folder(GCS_BUCKET_PROD, "v2", public_folder, lambda x: re.match(expr, str(x)))

        # Convert all files to JSON
        list(convert_tables_to_json(public_folder, json_folder))
        print("CSV files converted to JSON")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v2", json_folder)

    return "OK"