Example #1
def publish_versions(prod_folder: str = "v3") -> Response:
    """Writes a versions.json file mapping each blob in the bucket to its list of generations."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    # Enumerate all the versions for each of the global tables
    prefix = prod_folder + "/"
    blob_index: Dict[str, List[str]] = {}
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    for table_name in ["aggregated", "main"] + list(get_table_names()):
        blobs = bucket.list_blobs(prefix=prefix + table_name, versions=True)
        for blob in blobs:
            fname = blob.name.replace(prefix, "")
            blob_index[fname] = blob_index.get(fname, [])
            blob_index[fname].append(blob.generation)

    # Repeat the process for the intermediate tables
    bucket = get_storage_bucket(GCS_BUCKET_TEST)
    blobs = bucket.list_blobs(prefix="intermediate/", versions=True)
    for blob in blobs:
        # Keep the "intermediate/" prefix to distinguish from the tables
        fname = blob.name
        blob_index[fname] = blob_index.get(fname, [])
        blob_index[fname].append(blob.generation)

    with temporary_directory() as workdir:
        # Write it to disk
        fname = workdir / "versions.json"
        with open(fname, "w") as fh:
            json.dump(blob_index, fh)

        # Upload to root folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
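The version enumeration above relies only on standard google-cloud-storage calls. As a minimal, self-contained sketch of the same pattern (the bucket name and prefix are placeholders, and object versioning must be enabled on the bucket):

from typing import Dict, List

from google.cloud import storage


def list_blob_generations(bucket_name: str, prefix: str) -> Dict[str, List[int]]:
    """Sketch: maps each blob name under <prefix> to the list of its generations."""
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    index: Dict[str, List[int]] = {}
    for blob in bucket.list_blobs(prefix=prefix, versions=True):
        # Each generation corresponds to one archived version of the object.
        index.setdefault(blob.name, []).append(blob.generation)
    return index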
Example #2
def download_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    """Downloads all blobs under <remote_path> in the bucket into <local_folder>."""
    bucket = get_storage_bucket(bucket_name)

    def _download_blob(local_folder: Path, blob: Blob) -> None:
        # Remove the prefix from the remote path
        rel_path = blob.name.split(f"{remote_path}/", 1)[-1]
        if filter_func is None or filter_func(Path(rel_path)):
            logger.log_debug(f"Downloading {rel_path} to {local_folder}/")
            file_path = local_folder / rel_path
            file_path.parent.mkdir(parents=True, exist_ok=True)
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.download_to_filename(str(file_path))
                except Exception:
                    log_message = f"Error downloading {rel_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If the error persists, something is likely wrong with the network, so we are
            # better off crashing the App Engine server.
            error_message = f"Error downloading {rel_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_download_blob, local_folder)
    map_iter = bucket.list_blobs(prefix=remote_path)
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
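The exponential back-off in _download_blob is inlined into the retry loop. Purely as an illustration (retry_with_backoff is not part of the original code), the same pattern can be factored into a small helper:

import time
from typing import Any, Callable


def retry_with_backoff(func: Callable[[], Any], max_retries: int = 3) -> Any:
    """Sketch: retries <func>, sleeping 2 ** attempt seconds between attempts."""
    for attempt in range(max_retries):
        try:
            return func()
        except Exception:
            time.sleep(2 ** attempt)
    # Give up after exhausting all retries.
    raise IOError(f"Operation failed after {max_retries} retries")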
Example #3
def cache_build_map() -> Dict[str, List[str]]:
    """Builds a map from each cached source's file stem to the sorted list of its snapshot blobs."""
    sitemap: Dict[str, List[str]] = {}
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    for blob in bucket.list_blobs(prefix="cache"):
        filename = blob.name.split("/")[-1]
        if filename == "sitemap.json":
            continue
        sitemap_key = filename.split(".")[0]
        sitemap[sitemap_key] = sitemap.get(sitemap_key, [])
        sitemap[sitemap_key].append(blob.name)

    # Sort all the cache items
    for sitemap_key, snapshot_list in sitemap.items():
        sitemap[sitemap_key] = sorted(snapshot_list)

    return sitemap
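Given how cache_pull (Example #5) names its uploads as cache/<date-hour>/<output>, the map returned above groups every snapshot of a source under that source's file stem. A purely hypothetical result, with made-up blob names, would look like:

sitemap = {
    "example_source": [
        "cache/2020-01-01-00/example_source.json",
        "cache/2020-01-01-01/example_source.json",
    ],
}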
Example #4
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    """Uploads all files under <local_folder> to <remote_path> in the bucket."""
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            logger.log_debug(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    name, suffix = file_path.name, file_path.suffix

                    # If it's an extension we should compress, upload compressed file
                    if suffix[1:] in COMPRESS_EXTENSIONS:
                        with temporary_directory() as workdir:
                            gzipped_file = workdir / name
                            gzip_file(file_path, gzipped_file)
                            blob.content_encoding = "gzip"
                            return blob.upload_from_filename(gzipped_file)

                    # Otherwise upload the file as-is
                    else:
                        return blob.upload_from_filename(file_path)

                except Exception:
                    log_message = f"Error uploading {target_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If the error persists, something is likely wrong with the network, so we are
            # better off crashing the App Engine server.
            error_message = f"Error uploading {target_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_upload_file, remote_path)
    map_iter = local_folder.glob("**/*.*")
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
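gzip_file and COMPRESS_EXTENSIONS are defined elsewhere; assuming gzip_file simply compresses one file into another, a standard-library sketch would be:

import gzip
import shutil
from pathlib import Path


def gzip_file(input_path: Path, output_path: Path) -> None:
    """Sketch: compresses <input_path> into <output_path> using gzip."""
    with open(input_path, "rb") as fh_in, gzip.open(output_path, "wb") as fh_out:
        shutil.copyfileobj(fh_in, fh_out)

Uploading the compressed file with content_encoding set to "gzip" lets Cloud Storage transparently decompress it for clients that do not request gzip (decompressive transcoding), so readers of the bucket see the original content.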
Example #5
def cache_pull() -> Response:
    """Downloads every source listed in cache.yaml and uploads the results to the cache bucket."""
    with temporary_directory() as workdir:
        now = datetime.datetime.utcnow()
        output_folder = workdir / now.strftime("%Y-%m-%d-%H")
        output_folder.mkdir(parents=True, exist_ok=True)

        def _pull_source(cache_source: Dict[str, str]):
            url = cache_source.pop("url")
            data = cache_source.pop("data", None)
            output = cache_source.pop("output")
            logger.log_info(f"Downloading {url} into {output}")
            buffer = BytesIO()
            try:
                download(url, buffer, data=data)
                with (output_folder / output).open("wb") as fd:
                    fd.write(buffer.getvalue())
                logger.log_info(f"Downloaded {output} successfully")
            except Exception:
                logger.log_error(f"Cache pull failed for {url}.", traceback=traceback.format_exc())

        # Pull each of the sources from the cache config
        with (SRC / "cache.yaml").open("r") as fd:
            cache_list = yaml.safe_load(fd)
        list(thread_map(_pull_source, cache_list, disable=True))

        # Upload all cached data to the bucket
        upload_folder(GCS_BUCKET_PROD, "cache", workdir)

        # Build the sitemap for all cached files
        logger.log_info("Building sitemap")
        sitemap = cache_build_map()
        bucket = get_storage_bucket(GCS_BUCKET_PROD)
        blob = bucket.blob("cache/sitemap.json")
        blob.upload_from_string(json.dumps(sitemap))

    return Response("OK", status=200)
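The download helper used by _pull_source is not shown. Assuming it performs a GET (or a POST when form data is given) and writes the response body into the provided buffer, a requests-based sketch could look like this:

from io import BytesIO
from typing import Dict, Optional

import requests


def download(url: str, buffer: BytesIO, data: Optional[Dict[str, str]] = None) -> None:
    """Sketch: fetches <url> and writes the response body into <buffer>."""
    # POST when form data is provided, otherwise a plain GET.
    response = requests.post(url, data=data) if data else requests.get(url)
    response.raise_for_status()
    buffer.write(response.content)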
Example #6
def download_file(bucket_name: str, remote_path: str, local_path: str) -> None:
    """Downloads a single blob at <remote_path> from the bucket into <local_path>."""
    bucket = get_storage_bucket(bucket_name)
    return bucket.blob(remote_path).download_to_filename(str(local_path))
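All of the examples share get_storage_bucket, whose definition is not shown. Assuming it is a thin wrapper around the google-cloud-storage client, it could be as simple as:

from google.cloud import storage
from google.cloud.storage.bucket import Bucket


def get_storage_bucket(bucket_name: str) -> Bucket:
    """Sketch: returns a handle to the named Cloud Storage bucket."""
    # Credentials come from the environment (e.g. GOOGLE_APPLICATION_CREDENTIALS).
    client = storage.Client()
    return client.bucket(bucket_name)

Note that client.bucket() only builds a local reference without making an API call, which is enough for the blob operations used throughout these examples.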