Example no. 1
def run_direct_msoas(payload_dict: dict):
    logging.info(f"run_direct:: {payload_dict}")
    payload = RawDataPayload(**payload_dict["base"])
    category = payload_dict['category']
    subcategory = payload_dict.get('subcategory')
    area_type = payload_dict['area_type']
    area_code = payload_dict['area_code']
    date = payload_dict['date']

    if category == "vaccination" and subcategory == "age_demographics":
        return run_demographics(payload_dict)

    kws = dict(
        container="pipeline",
        content_type="application/octet-stream",
        cache_control="no-cache, max-age=0, must-revalidate",
        compressed=False,
        tier='Cool'
    )

    # Retrieve data chunk
    with StorageClient(**kws, path=payload.data_path) as client, TemporaryFile() as fp:
        if not client.exists():
            raise RuntimeError(f"Blob not found: {payload.data_path}")

        client.download().readinto(fp)
        fp.seek(0)
        data = read_feather(fp)

    # Process chunk
    # These must be done in a specific order.
    result = (
        data
        .pipe(homogenise_dates)
        .pipe(normalise_records, zero_filled=FILL_WITH_ZEROS, cumulative=START_WITH_ZERO)
    )

    # Store chunk for deployment to DB
    result_path = f"daily_chunks/{category}/{date}/{area_type}_{area_code}.ft"
    with TemporaryFile() as fp:
        result.reset_index(drop=True).to_feather(fp)
        fp.seek(0)

        with StorageClient(**kws, path=result_path) as cli:
            cli.upload(fp.read())

    response_payload = {
        "path": result_path,
        "area_code": area_code,
        "area_type": area_type,
        "date": date,
        "environment": payload.environment,
        "category": category,
        "subcategory": subcategory
    }

    logging.info(response_payload)

    return response_payload
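
A minimal invocation sketch for the function above. The payload keys mirror the look-ups in the body, but the concrete values, the blob path, and the assumption that RawDataPayload only needs data_path and environment are illustrative, not taken from the source.

# Hypothetical payload; values and the RawDataPayload fields are illustrative only.
payload_dict = {
    "base": {
        "data_path": "daily_chunks/raw/2022-01-10/msoa_E02000001.ft",  # made-up blob path
        "environment": "PRODUCTION"
    },
    "category": "cases",
    "subcategory": None,
    "area_type": "msoa",
    "area_code": "E02000001",
    "date": "2022-01-10"
}

response = run_direct_msoas(payload_dict)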
Example no. 2
async def process_and_upload_data(path: str, get_file_data: FileFetcherType,
                                  container: str, base_path: str) -> NoReturn:
    """
    Uploads processed files to the storage using the correct
    caching and ``content-type`` specs.

    Parameters
    ----------
    path: str
        Path (within the storage container) in which the
        file is to be stored.

    get_file_data: FileFetcherType
        Asynchronous callable used to retrieve the raw contents of ``path``
        relative to ``base_path``.

    base_path: str
        Base path against which ``path`` is resolved by ``get_file_data``.

    container: str
        Storage container in which the file is to be stored.

    Returns
    -------
    NoReturn
    """
    _, file_name = split_path(path)
    # Files are stored as JSON - the extension must be updated:
    file_name, _ = splitext(file_name)
    json_name = f"{file_name}.json"
    yaml_name = f"{file_name}.yaml"

    json_path = str.join(processor_settings.URL_SEPARATOR,
                         [STORAGE_PATH, json_name])
    yaml_path = str.join(processor_settings.URL_SEPARATOR,
                         [STORAGE_PATH, yaml_name])

    if ".github" in path:
        return None

    raw_data = await get_file_data(path, base_path)
    data = await prepare_data(raw_data)

    # Uploading the data
    with StorageClient(container=container, path=json_path) as client:
        async with Lock():
            client.upload(data=data.json_data)

    with StorageClient(container=container,
                       path=yaml_path,
                       content_type="application/x-yaml") as client:
        async with Lock():
            client.upload(data=data.yaml_data)
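
A hedged driver sketch for the coroutine above. The fetcher shape is inferred from the get_file_data(path, base_path) call site; the container name, paths and on-disk layout are invented.

# Hypothetical driver; fetcher, container and paths are illustrative only.
import asyncio

async def fetch_from_disk(path: str, base_path: str) -> str:
    # Illustrative fetcher reading the raw file from a local checkout.
    with open(f"{base_path}/{path}") as fh:
        return fh.read()

asyncio.run(process_and_upload_data(
    path="metrics/newCasesBySpecimenDate.yaml",
    get_file_data=fetch_from_disk,
    container="metadata",
    base_path="/tmp/repo"
))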
def store_data():
    kws = dict(container="pipeline",
               content_type="application/octet-stream",
               cache_control="no-cache, max-age=0, must-revalidate",
               compressed=False,
               tier='Cool')

    client = StorageClient(**kws)
    container = client.get_container()
    content_settings = getattr(client, '_content_settings')

    def upload(data: DataFrame, category: str, subcategory: Union[str, None],
               date: str):

        area_type = data.iloc[0].areaType
        area_code = data.iloc[0].areaCode

        if subcategory:
            path = f"etl/{category}/{subcategory}/{date}/{area_type}_{area_code}.ft"
        else:
            path = f"etl/{category}/{date}/{area_type}_{area_code}.ft"

        with TemporaryFile() as fp:
            _ = (
                data
                .sort_values(["areaType", "areaCode", "date"],
                             ascending=[True, True, False])
                .dropna(how='all', axis=1)
                .reset_index(drop=True)
                .to_feather(fp)
            )
            fp.seek(0)

            container.upload_blob(data=fp,
                                  name=path,
                                  content_settings=content_settings,
                                  overwrite=True,
                                  standard_blob_tier=StandardBlobTier.Cool,
                                  timeout=60,
                                  max_concurrency=10)

        response = {
            "path": path,
            "area_type": area_type,
            "area_code": area_code,
            "category": category,
            "subcategory": subcategory,
            "date": date
        }

        return response

    return upload
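
The factory above returns an upload closure bound to a single storage client. A hedged usage sketch follows; the DataFrame columns match the sort keys used in the body, but the values and the metric column are invented.

# Hypothetical usage; the DataFrame content is illustrative only.
upload = store_data()
chunk = DataFrame({
    "areaType": ["ltla"],
    "areaCode": ["E06000001"],
    "date": ["2022-01-10"],
    "newCases": [42]          # made-up metric column
})
meta = upload(chunk, category="cases", subcategory=None, date="2022-01-10")
# meta["path"] -> "etl/cases/2022-01-10/ltla_E06000001.ft"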
Example no. 5
def upload_chunk_feather(*, data: DataFrame, container: str, dir_path: str,
                         filename: str) -> NoReturn:
    """
    Requires keyword arguments.

    Parameters
    ----------
    data: DataFrame
        Data to be stored as a Feather file.

    container: str
        Storage container name.

    dir_path: str
        Storage directory path.

    filename: str
        Name of the file to be stored.

    Returns
    -------
    NoReturn
    """
    file_obj = BytesIO()
    data.reset_index(drop=True).to_feather(file_obj)
    file_obj.seek(0)

    bin_data = file_obj.read()

    with StorageClient(container=container, path=f"{dir_path}/{filename}", **UPLOAD_KWS) as cli:
        cli.upload(bin_data)
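
Since the function above requires keyword arguments, a call might look like the following sketch; the container name, directory path and DataFrame are illustrative only.

# Hypothetical call; container, paths and data are illustrative only.
upload_chunk_feather(
    data=DataFrame({"areaCode": ["E06000001"], "newCases": [42]}),  # made-up frame
    container="pipeline",
    dir_path="daily_chunks/cases/2022-01-10",
    filename="ltla_E06000001.ft"
)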
Example no. 6
def get_dataset(payload: MSOAPayload) -> DataFrame:
    with TemporaryFile() as fp, StorageClient(**payload.data_path) as client:
        client.download().readinto(fp)
        fp.seek(0)

        result = read_parquet(fp, columns=["areaCode", "date", payload.metric])

    max_date = result.date.max()
    # Copy the slice so the date conversion below does not mutate a view of ``result``.
    area_data = result.loc[result.areaCode == payload.area_code, :].copy()
    area_data.date = area_data.date.astype("datetime64").dt.strftime("%Y-%m-%d")

    # Membership must be checked against the column values; ``in`` on a Series
    # would test the index labels instead.
    if max_date in area_data.date.values:
        return area_data

    dates = date_range(
        start=datetime.strptime(area_data.date.max(), "%Y-%m-%d") + timedelta(days=1),
        end=max_date,
        freq='1D'
    )

    missing_values = [
        {"areaCode": payload.area_code, "date": f"{date:%Y-%m-%d}", payload.metric: 0}
        for date in dates
    ]

    return area_data.append(missing_values)
def main(payload: GenericPayload) -> DisposerResponse:
    """
    Removes blobs from the storage.

    Parameters
    ----------
    payload: GenericPayload

    Returns
    -------
    DisposerResponse
        Message confirming that the process is done.
    """
    logging.info(f"triggered with manifest: {payload['manifest']}")
    logging.info(f"- total blobs to remove: {len(payload['tasks'])}")

    payload_content: List[ArtefactPayload] = payload['tasks']

    first_path = payload_content[0]['from_path']
    with StorageClient(container=payload['manifest']['container'],
                       path=first_path) as cli:
        container = cli.get_container()

        for artefact in payload_content:
            container.delete_blob(artefact['from_path'])

        container.close()

    logging.info(f"done: {payload['timestamp']}")

    return DisposerResponse(total_processed=len(payload['tasks']))
Example no. 8
class ArchiveStorage:
    def __init__(self, container):
        kws = dict(container=container,
                   content_type="application/octet-stream",
                   cache_control="no-cache, max-age=0, must-revalidate",
                   compressed=True,
                   tier='Archive')

        self.client = StorageClient(**kws)
        self.container = self.client.get_container()
        self.content_settings = getattr(self.client, '_content_settings')

    def upload(self, path: str, data: Union[Iterable[AnyStr], IO[AnyStr]]):
        self.container.upload_blob(data=data,
                                   name=path,
                                   content_settings=self.content_settings,
                                   overwrite=True,
                                   standard_blob_tier=StandardBlobTier.Archive,
                                   timeout=60,
                                   max_concurrency=10)

    def download(self, path: str) -> StorageStreamDownloader:
        return self.container.download_blob(path)

    def ls_of(self, prefix):
        return self.container.walk_blobs(name_starts_with=prefix)

    def __enter__(self) -> 'ArchiveStorage':
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.container.close()
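
ArchiveStorage supports the context-manager protocol, so a round trip might look like the sketch below; the container name and blob paths are invented.

# Hypothetical usage; container name and blob paths are illustrative only.
with ArchiveStorage("archives") as storage:
    storage.upload("2022-01-10/artefacts.tar.bz2", b"...archive bytes...")

    # List blobs under a prefix, then stream one of them back.
    for blob in storage.ls_of("2022-01-10/"):
        print(blob.name)

    payload = storage.download("2022-01-10/artefacts.tar.bz2").readall()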
Example no. 9
def get_timestamp(datestamp) -> datetime:
    with StorageClient(container="publicdata", path="assets/dispatch/dates.json") as client:
        timestamp = loads(client.download().readall().decode())[datestamp]

    ts = datetime.fromisoformat(timestamp.replace("5Z", ""))
    logging.info(f"> timestamp extracted {ts}")
    return ts
Example no. 10
def main(payload):
    kws = {**UPLOAD_KWS, **payload}
    value = kws.pop("value")

    with StorageClient(**kws) as client:
        client.upload(value)

    return f"DONE: {payload}"
def get_published_timestamp(raw=False):
    with StorageClient("pipeline", "info/latest_published") as client:
        dt = client.download().readall().decode()

    if raw:
        return dt

    return datetime.strptime(dt[:24] + "Z", "%Y-%m-%dT%H:%M:%S.%fZ")
def get_latest_csv():
    with StorageClient(container="pipeline", path=f"archive/processed") as cli:
        filtered_names = filter(lambda x: x['name'].endswith("csv"), cli)
        name = max(filtered_names, key=lambda x: x['last_modified'])['name']
        cli.path = name
        data = cli.download().readall().decode()

    return StringIO(data)
Example no. 13
def store_html(data, slug_id):
    kws = dict(container="ondemand",
               path=f"prerelease/{slug_id}.html",
               content_type="text/html; charset=UTF-8",
               cache_control="public, max-age=120, must-revalidate")

    with StorageClient(**kws) as cli:
        cli.upload(data)
def get_latest_breakdowns_by_specimen_date():
    with StorageClient(container="rawbreakdowndata",
                       path="specimen_date/daily_cases_") as cli:
        filtered_names = filter(lambda x: x['name'].endswith("zip"), cli)
        name = max(filtered_names, key=lambda x: x['last_modified'])['name']
        cli.path = name
        data = cli.download().readall()

    return BytesIO(data)
Example no. 15
async def download_file(container: str, path: str, lock: Lock) -> bytes:
    logging.info(f"> Downloading data from '{container}/{path}'")

    with StorageClient(container=container, path=path) as client:
        data = client.download()

    logging.info(f"> Download complete")

    return data.readall()
def get_latest_breakdown():
    with StorageClient(
            container="rawbreakdowndata",
            path="publish_date/daily_cases_by_pub_with_demography_") as cli:
        filtered_names = filter(lambda x: x['name'].endswith("csv"), cli)
        name = max(filtered_names, key=lambda x: x['last_modified'])['name']
        cli.path = name
        data = cli.download().readall().decode()

    return StringIO(data)
Example no. 17
def store_image(image: bytes):
    with StorageClient(
            container="publicdata",
            path=f"assets/frontpage/images/map.png",
            content_type="image/png",
            cache_control="max-age=300, stale-while-revalidate=30",
            content_language=None,
            compressed=False
    ) as client:
        client.upload(image)
def get_population_data() -> PopulationData:
    try:
        from __app__.population import get_population_data as process_data
    except ImportError:
        from population import get_population_data as process_data

    with StorageClient(container="pipeline",
                       path="assets/population.json") as cli:
        data = cli.download().readall().decode()

    return process_data(data)
def store_data(geo_data, container, path):
    payload = dumps(geo_data).decode().replace("NaN", "null")

    with StorageClient(
            container=container,
            path=path,
            content_type="application/json; charset=utf-8",
            cache_control="public, stale-while-revalidate=60, max-age=90",
            compressed=False,
            content_language=None) as cli:
        cli.upload(payload)
Example no. 20
def store_graph(data: BytesIO, category, timestamp):
    with StorageClient("static",
                       re.sub(r"[:\s'\"&]+", "_",
                              f"admin/releases/{category}/{timestamp}.png"),
                       content_type="image/png",
                       cache_control="max-age=60, must-revalidate",
                       content_language=None,
                       compressed=False) as client:
        client.upload(data.read())

    return True
def store_graph(data: BytesIO, area_type, area_code):
    with StorageClient("publicdata",
                       f"assets/frontpage/scales/{area_type}/{area_code}.jpg",
                       content_type="image/jpeg",
                       cache_control="max-age=60, must-revalidate",
                       content_language=None,
                       compressed=False) as client:
        client.upload(data.read())

    return True
def get_release_timestamp(date_only=True, raw=False):
    with StorageClient("pipeline", "info/latest_available") as client:
        dt = client.download().readall().decode()

    if raw:
        return dt

    if date_only:
        return datetime.strptime(dt[:24] + "Z", "%Y-%m-%dT%H:%M:%S.%fZ").date()

    return datetime.strptime(dt[:24] + "Z", "%Y-%m-%dT%H:%M:%S.%fZ")
Example no. 23
def download_file(container: str, path: str) -> BytesIO:
    logging.info(f"> Downloading data from '{container}/{path}'")

    with StorageClient(container=container, path=path) as client:
        data = client.download()

    logging.info(f"> Download complete")

    fp = BytesIO(data.readall())
    fp.seek(0)

    return fp
Example no. 24
def store_png(filename, svg_image):
    with open(temp_svg_path, "w") as tmp_file:
        print(svg_image, file=tmp_file)

    drawing = svg2rlg(temp_svg_path)
    png_img = BytesIO()
    renderPM.drawToFile(drawing, png_img, fmt="PNG")
    png_img.seek(0)
    date = datetime.now().strftime("%Y%m%d")
    name = f"{filename}_{date}"

    with StorageClient(path=f"og-images/{name}.png", **storage_kws) as cli:
        cli.upload(png_img.read())
def get_latest_msoa_data() -> DataFrame:
    with StorageClient("pipeline", "assets/msoa_pop2019.csv") as client:
        population_io = StringIO(client.download().readall().decode())

    # ``rename`` does not accept a list of labels and, with ``inplace=True``, the
    # chained calls would operate on ``None``; label the columns and set the index
    # in a chain instead.
    msoa_population = (
        read_csv(population_io, low_memory=False)
        .set_axis(["areaCode", "population"], axis=1)
        .set_index("areaCode")
    )

    with StorageClient(container="rawsoadata", path="daily_") as cli:
        csv_files = filter(lambda x: x['name'].endswith("csv"), cli)
        cli.path = max(csv_files, key=lambda x: x['name'])['name']
        data_io = StringIO(cli.download().readall().decode().replace(
            "nhs-msoa", "msoa").replace("-99", ""))

    raw_data = read_csv(data_io,
                        low_memory=False,
                        usecols=lambda x: x != "areaName")

    raw_data = raw_data.join(msoa_population, on=["areaCode"])

    return raw_data
Example no. 26
def upload_tarfile(archive_path: Path, storage_dir: str, filename: str,
                   date: str, total_archived: int) -> NoReturn:
    """
    Uploads archived artefacts as a `tar.bz2` blob in the storage
    under a "Cool" tier.

    Parameters
    ----------
    archive_path: Path
        Path to the temp archive file.

    storage_dir: str
        Path to the directory in the `ARCHIVE_CONTAINER` where the archived
        artefacts are to be stored.

    filename: str
        Name by which to store the file in the storage.

    date: str
        Archive date - i.e. the date on which archived data were generated.

    total_archived: int
        Total number of artefacts included in the archive.

    Returns
    -------
    NoReturn
    """
    logging.info("uploading the Tar archive")

    storage_kws = dict(container=ARCHIVE_CONTAINER,
                       path=f"{storage_dir}/{filename}",
                       compressed=False,
                       tier='Cool')

    with StorageClient(**storage_kws) as cli, \
            open(archive_path, 'rb') as fp:
        cli.upload(fp.read())
        cli.client.set_blob_metadata({
            "date": date,
            "generated_on": datetime.utcnow().isoformat(),
            "total_artefacts": str(total_archived)
        })

    logging.info(f"Tar archive uploaded: {storage_kws}")

    return None
async def download_file(container: str, path: str) -> DataFrame:
    logging.info(f"> Downloading data from '{container}/{path}'")

    with StorageClient(container=container, path=path) as client:
        if not client.exists():
            return DataFrame([], columns=["areaType", "areaCode", "date"])

        data = client.download()

    logging.info(f"> Download complete")

    data_io = BytesIO(data.readall())
    data_io.seek(0)
    return read_feather(data_io)
def get_timestamp_for(container, path, raw=True, date_only=False):
    with StorageClient(container=container, path=path) as cli:
        timestamp = max(cli, key=lambda x: x['last_modified'])['last_modified']

    if raw and not date_only:
        return timestamp.strftime(r"%Y-%m-%dT%H:%M:%S.%fZ")

    if raw and date_only:
        return timestamp.strftime(r"%Y-%m-%d")

    if not raw and not date_only:
        return timestamp

    if not raw and date_only:
        return timestamp.date()
Example no. 29
def load_data(etl_data: Dict[str, str]) -> DataFrame:
    path = etl_data['path']
    logging.info(f"> Downloading data from '{path}'")

    with StorageClient(**PROCESSED_FILES_KWS, path=path) as client:
        if not client.exists():
            raise RuntimeError(f"Blob does not exist: {path}")

        data = client.download()

        logging.info(f"> Download complete")

        data_io = BytesIO(data.readall())
        data_io.seek(0)

    return read_feather(data_io, use_threads=False)
Example no. 30
def store_data(date: str,
               metric: str,
               svg: str,
               area_type: str = None,
               area_code: str = None):
    kws = dict(
        container="downloads",
        content_type="image/svg+xml",
        cache_control="public, max-age=30, s-maxage=90, must-revalidate",
        compressed=False)

    path = f"homepage/{date}/thumbnail_{metric}.svg"

    if area_code is not None:
        path = f"homepage/{date}/{metric}/{area_type}/{area_code}_thumbnail.svg"

    with StorageClient(path=path, **kws) as cli:
        cli.upload(svg)
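
Depending on whether an area is supplied, the SVG above lands under the generic or the area-specific path. A sketch with invented values:

# Hypothetical calls; metric, dates and area codes are illustrative only.
# National thumbnail -> "homepage/2022-01-10/thumbnail_newCasesBySpecimenDate.svg"
store_data("2022-01-10", "newCasesBySpecimenDate", "<svg>...</svg>")

# Area-specific thumbnail ->
# "homepage/2022-01-10/newCasesBySpecimenDate/ltla/E06000001_thumbnail.svg"
store_data("2022-01-10", "newCasesBySpecimenDate", "<svg>...</svg>",
           area_type="ltla", area_code="E06000001")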