def run_direct_msoas(payload_dict: dict):
    logging.info(f"run_direct:: {payload_dict}")

    payload = RawDataPayload(**payload_dict["base"])
    category = payload_dict['category']
    subcategory = payload_dict.get('subcategory')
    area_type = payload_dict['area_type']
    area_code = payload_dict['area_code']
    date = payload_dict['date']

    if category == "vaccination" and subcategory == "age_demographics":
        return run_demographics(payload_dict)

    kws = dict(
        container="pipeline",
        content_type="application/octet-stream",
        cache_control="no-cache, max-age=0, must-revalidate",
        compressed=False,
        tier='Cool'
    )

    # Retrieve data chunk
    with StorageClient(**kws, path=payload.data_path) as client, TemporaryFile() as fp:
        if not client.exists():
            raise RuntimeError(f"Blob not found: {payload.data_path}")

        client.download().readinto(fp)
        fp.seek(0)

        data = read_feather(fp)

    # Process chunk
    # These must be done in a specific order.
    result = (
        data
        .pipe(homogenise_dates)
        .pipe(
            normalise_records,
            zero_filled=FILL_WITH_ZEROS,
            cumulative=START_WITH_ZERO
        )
    )

    # Store chunk for deployment to DB
    result_path = f"daily_chunks/{category}/{date}/{area_type}_{area_code}.ft"
    with TemporaryFile() as fp:
        result.reset_index(drop=True).to_feather(fp)
        fp.seek(0)

        with StorageClient(**kws, path=result_path) as cli:
            cli.upload(fp.read())

    response_payload = {
        "path": result_path,
        "area_code": area_code,
        "area_type": area_type,
        "date": date,
        "environment": payload.environment,
        "category": category,
        "subcategory": subcategory
    }

    logging.info(response_payload)

    return response_payload
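
# Usage sketch for `run_direct_msoas`. All values below are hypothetical; the
# expected keys are inferred from the lookups above, and "base" must be a
# valid `RawDataPayload` mapping (it is read for `data_path` and `environment`).
def _example_run_direct_msoas():
    response = run_direct_msoas({
        "base": {
            # Hypothetical RawDataPayload fields.
            "data_path": "daily_chunks/main/2021-11-01/msoa_E02000001.ft",
            "environment": "PRODUCTION",
        },
        "category": "cases",
        "subcategory": None,
        "area_type": "msoa",
        "area_code": "E02000001",
        "date": "2021-11-01",
    })
    logging.info(response["path"])
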
async def process_and_upload_data(path: str, get_file_data: FileFetcherType,
                                  container: str, base_path: str) -> NoReturn:
    """
    Uploads processed files to the storage using the correct caching
    and ``content-type`` specs.

    Parameters
    ----------
    path: str
        Path (within the storage container) in which the file is to be stored.

    get_file_data: FileFetcherType
        Callable used to fetch the raw file content.

    container: str
        Storage container in which the file is to be stored.

    base_path: str
        Base path passed to ``get_file_data``.

    Returns
    -------
    NoReturn
    """
    _, file_name = split_path(path)

    # Files are stored as JSON - the extension must be updated:
    file_name, _ = splitext(file_name)
    json_name = f"{file_name}.json"
    yaml_name = f"{file_name}.yaml"

    json_path = str.join(processor_settings.URL_SEPARATOR, [STORAGE_PATH, json_name])
    yaml_path = str.join(processor_settings.URL_SEPARATOR, [STORAGE_PATH, yaml_name])

    if ".github" in path:
        return None

    raw_data = await get_file_data(path, base_path)
    data = await prepare_data(raw_data)

    # Uploading the data
    with StorageClient(container=container, path=json_path) as client:
        async with Lock():
            client.upload(data=data.json_data)

    with StorageClient(container=container, path=yaml_path,
                       content_type="application/x-yaml") as client:
        async with Lock():
            client.upload(data=data.yaml_data)
def store_data():
    kws = dict(
        container="pipeline",
        content_type="application/octet-stream",
        cache_control="no-cache, max-age=0, must-revalidate",
        compressed=False,
        tier='Cool'
    )

    client = StorageClient(**kws)
    container = client.get_container()
    content_settings = getattr(client, '_content_settings')

    def upload(data: DataFrame, category: str, subcategory: Union[str, None], date: str):
        area_type = data.iloc[0].areaType
        area_code = data.iloc[0].areaCode

        if subcategory:
            path = f"etl/{category}/{subcategory}/{date}/{area_type}_{area_code}.ft"
        else:
            path = f"etl/{category}/{date}/{area_type}_{area_code}.ft"

        with TemporaryFile() as fp:
            _ = (
                data
                .sort_values(["areaType", "areaCode", "date"], ascending=[True, True, False])
                .dropna(how='all', axis=1)
                .reset_index(drop=True)
                .to_feather(fp)
            )
            fp.seek(0)

            container.upload_blob(
                data=fp,
                name=path,
                content_settings=content_settings,
                overwrite=True,
                standard_blob_tier=StandardBlobTier.Cool,
                timeout=60,
                max_concurrency=10
            )

        response = {
            "path": path,
            "area_type": area_type,
            "area_code": area_code,
            "category": category,
            "subcategory": subcategory,
            "date": date
        }

        return response

    return upload
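
# `store_data` is a factory: it opens the storage container once and returns
# an `upload` closure that reuses the connection across calls. A hedged usage
# sketch - `chunks` and the category values below are hypothetical:
def _example_store_data(chunks):
    upload = store_data()
    for chunk in chunks:
        response = upload(chunk, category="cases", subcategory=None, date="2021-11-01")
        logging.info(response["path"])
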
def upload_chunk_feather(*, data: DataFrame, container: str,
                         dir_path: str, filename: str) -> NoReturn:
    """
    Requires keyword arguments.

    Parameters
    ----------
    data: DataFrame

    container: str
        Storage container name.

    dir_path: str
        Storage directory path.

    filename: str
        Name of the file to be stored.

    Returns
    -------
    NoReturn
    """
    file_obj = BytesIO()

    data.reset_index(drop=True).to_feather(file_obj)
    file_obj.seek(0)
    bin_data = file_obj.read()

    with StorageClient(container=container, path=f"{dir_path}/{filename}", **UPLOAD_KWS) as cli:
        cli.upload(bin_data)
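
# Because of the bare `*`, every argument to `upload_chunk_feather` must be
# passed by keyword. A minimal sketch (the path components are hypothetical):
def _example_upload_chunk_feather(frame: DataFrame):
    upload_chunk_feather(
        data=frame,
        container="pipeline",
        dir_path="daily_chunks/main/2021-11-01",
        filename="msoa_E02000001.ft",
    )
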
def get_dataset(payload: MSOAPayload) -> DataFrame:
    with TemporaryFile() as fp, StorageClient(**payload.data_path) as client:
        client.download().readinto(fp)
        fp.seek(0)
        result = read_parquet(fp, columns=["areaCode", "date", payload.metric])

    # Normalise the dates first, so that the latest date and the per-area
    # dates share one format and can be compared directly.
    result.date = result.date.astype("datetime64").dt.strftime("%Y-%m-%d")
    max_date = result.date.max()

    area_data = result.loc[result.areaCode == payload.area_code, :]

    # NB: `in` on a Series tests the index, not the values.
    if max_date in area_data.date.values:
        return area_data

    # Pad the tail of the series with zeros, up to the latest date in the data.
    dates = date_range(
        start=datetime.strptime(area_data.date.max(), "%Y-%m-%d") + timedelta(days=1),
        end=max_date,
        freq='1D'
    )

    missing_values = [
        {"areaCode": payload.area_code, "date": f"{date:%Y-%m-%d}", payload.metric: 0}
        for date in dates
    ]

    return area_data.append(missing_values)
def main(payload: GenericPayload) -> DisposerResponse:
    """
    Removes blobs from the storage.

    Parameters
    ----------
    payload: GenericPayload

    Returns
    -------
    DisposerResponse
        Message confirming that the process is done.
    """
    logging.info(f"triggered with manifest: {payload['manifest']}")
    logging.info(f"- total blobs to remove: {len(payload['tasks'])}")

    payload_content: List[ArtefactPayload] = payload['tasks']

    first_path = payload_content[0]['from_path']
    with StorageClient(container=payload['manifest']['container'], path=first_path) as cli:
        container = cli.get_container()

        for artefact in payload_content:
            container.delete_blob(artefact['from_path'])

        container.close()

    logging.info(f"done: {payload['timestamp']}")

    return DisposerResponse(total_processed=len(payload['tasks']))
class ArchiveStorage:
    def __init__(self, container):
        kws = dict(
            container=container,
            content_type="application/octet-stream",
            cache_control="no-cache, max-age=0, must-revalidate",
            compressed=True,
            tier='Archive'
        )

        self.client = StorageClient(**kws)
        self.container = self.client.get_container()
        self.content_settings = getattr(self.client, '_content_settings')

    def upload(self, path: str, data: Union[Iterable[AnyStr], IO[AnyStr]]):
        self.container.upload_blob(
            data=data,
            name=path,
            content_settings=self.content_settings,
            overwrite=True,
            standard_blob_tier=StandardBlobTier.Archive,
            timeout=60,
            max_concurrency=10
        )

    def download(self, path: str) -> StorageStreamDownloader:
        return self.container.download_blob(path)

    def ls_of(self, prefix):
        return self.container.walk_blobs(name_starts_with=prefix)

    def __enter__(self) -> 'ArchiveStorage':
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.container.close()
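
# `ArchiveStorage` implements the context-manager protocol, so the container
# connection is closed automatically on exit. A hedged sketch - the container
# name and blob paths below are hypothetical:
def _example_archive_storage():
    with ArchiveStorage("archives") as storage:
        storage.upload("2021-11-01/artefacts.tar.bz2", b"...archive bytes...")

        for blob in storage.ls_of("2021-11-01/"):
            logging.info(blob.name)
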
def get_timestamp(datestamp) -> datetime:
    with StorageClient(container="publicdata", path="assets/dispatch/dates.json") as client:
        timestamp = loads(client.download().readall().decode())[datestamp]

    # Strip the trailing "5Z" so that `fromisoformat` can parse the string.
    ts = datetime.fromisoformat(timestamp.replace("5Z", ""))
    logging.info(f"> timestamp extracted {ts}")

    return ts
def main(payload):
    kws = {**UPLOAD_KWS, **payload}
    value = kws.pop("value")

    with StorageClient(**kws) as client:
        client.upload(value)

    return f"DONE: {payload}"
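
# The payload overrides any overlapping keys in `UPLOAD_KWS`, and "value" is
# popped before the remainder is forwarded to `StorageClient`. A hedged call
# with hypothetical container, path, and value:
def _example_main():
    result = main({
        "container": "pipeline",
        "path": "info/latest_published",
        "value": "2021-11-01T16:00:00.000000Z",
    })
    logging.info(result)
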
def get_published_timestamp(raw=False):
    with StorageClient("pipeline", "info/latest_published") as client:
        dt = client.download().readall().decode()

    if raw:
        return dt

    return datetime.strptime(dt[:24] + "Z", "%Y-%m-%dT%H:%M:%S.%fZ")
def get_latest_csv():
    with StorageClient(container="pipeline", path="archive/processed") as cli:
        filtered_names = filter(lambda x: x['name'].endswith("csv"), cli)
        name = max(filtered_names, key=lambda x: x['last_modified'])['name']

        cli.path = name
        data = cli.download().readall().decode()

    return StringIO(data)
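
# Iterating a `StorageClient` yields blob listings under the configured path
# prefix; the most recently modified CSV is then re-targeted via `cli.path`
# and downloaded. A hedged sketch of consuming the result, assuming
# `pandas.read_csv` is in scope:
def _example_get_latest_csv():
    frame = read_csv(get_latest_csv())
    logging.info(f"rows: {len(frame.index)}")
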
def store_html(data, slug_id):
    kws = dict(
        container="ondemand",
        path=f"prerelease/{slug_id}.html",
        content_type="text/html; charset=UTF-8",
        cache_control="public, max-age=120, must-revalidate"
    )

    with StorageClient(**kws) as cli:
        cli.upload(data)
def get_latest_breakdowns_by_specimen_date():
    with StorageClient(container="rawbreakdowndata", path="specimen_date/daily_cases_") as cli:
        filtered_names = filter(lambda x: x['name'].endswith("zip"), cli)
        name = max(filtered_names, key=lambda x: x['last_modified'])['name']

        cli.path = name
        data = cli.download().readall()

    return BytesIO(data)
async def download_file(container: str, path: str, lock: Lock) -> bytes:
    logging.info(f"> Downloading data from '{container}/{path}'")

    with StorageClient(container=container, path=path) as client:
        data = client.download()
        logging.info("> Download complete")

        return data.readall()
def get_latest_breakdown():
    with StorageClient(
            container="rawbreakdowndata",
            path="publish_date/daily_cases_by_pub_with_demography_") as cli:
        filtered_names = filter(lambda x: x['name'].endswith("csv"), cli)
        name = max(filtered_names, key=lambda x: x['last_modified'])['name']

        cli.path = name
        data = cli.download().readall().decode()

    return StringIO(data)
def store_image(image: bytes):
    with StorageClient(
            container="publicdata",
            path="assets/frontpage/images/map.png",
            content_type="image/png",
            cache_control="max-age=300, stale-while-revalidate=30",
            content_language=None,
            compressed=False
    ) as client:
        client.upload(image)
def get_population_data() -> PopulationData:
    try:
        from __app__.population import get_population_data as process_data
    except ImportError:
        from population import get_population_data as process_data

    with StorageClient(container="pipeline", path="assets/population.json") as cli:
        data = cli.download().readall().decode()

    return process_data(data)
def store_data(geo_data, container, path):
    # NB: `dumps` here returns bytes (e.g. the `orjson` implementation),
    # hence the `.decode()`.
    payload = dumps(geo_data).decode().replace("NaN", "null")

    with StorageClient(
            container=container,
            path=path,
            content_type="application/json; charset=utf-8",
            cache_control="public, stale-while-revalidate=60, max-age=90",
            compressed=False,
            content_language=None
    ) as cli:
        cli.upload(payload)
def store_graph(data: BytesIO, category, timestamp):
    with StorageClient(
            "static",
            re.sub(r"[:\s'\"&]+", "_", f"admin/releases/{category}/{timestamp}.png"),
            content_type="image/png",
            cache_control="max-age=60, must-revalidate",
            content_language=None,
            compressed=False
    ) as client:
        client.upload(data.read())

    return True
def store_graph(data: BytesIO, area_type, area_code):
    with StorageClient(
            "publicdata",
            f"assets/frontpage/scales/{area_type}/{area_code}.jpg",
            content_type="image/jpeg",
            cache_control="max-age=60, must-revalidate",
            content_language=None,
            compressed=False
    ) as client:
        client.upload(data.read())

    return True
def get_release_timestamp(date_only=True, raw=False):
    with StorageClient("pipeline", "info/latest_available") as client:
        dt = client.download().readall().decode()

    if raw:
        return dt

    if date_only:
        return datetime.strptime(dt[:24] + "Z", "%Y-%m-%dT%H:%M:%S.%fZ").date()

    return datetime.strptime(dt[:24] + "Z", "%Y-%m-%dT%H:%M:%S.%fZ")
def download_file(container: str, path: str) -> BytesIO:
    logging.info(f"> Downloading data from '{container}/{path}'")

    with StorageClient(container=container, path=path) as client:
        data = client.download()
        logging.info("> Download complete")

        fp = BytesIO(data.readall())

    fp.seek(0)
    return fp
def store_png(filename, svg_image):
    with open(temp_svg_path, "w") as tmp_file:
        print(svg_image, file=tmp_file)

    drawing = svg2rlg(temp_svg_path)

    png_img = BytesIO()
    renderPM.drawToFile(drawing, png_img, fmt="PNG")
    png_img.seek(0)

    date = datetime.now().strftime("%Y%m%d")
    name = f"{filename}_{date}"

    with StorageClient(path=f"og-images/{name}.png", **storage_kws) as cli:
        cli.upload(png_img.read())
def get_latest_msoa_data() -> DataFrame:
    with StorageClient("pipeline", "assets/msoa_pop2019.csv") as client:
        population_io = StringIO(client.download().readall().decode())

    # NB: in-place operations return `None` and cannot be chained - the
    # original `.rename(..., inplace=True).set_index(..., inplace=True)` would
    # fail. `set_axis` names the columns instead, assuming the population CSV
    # holds exactly these two columns in this order.
    msoa_population = (
        read_csv(population_io, low_memory=False)
        .set_axis(["areaCode", "population"], axis=1)
        .set_index("areaCode")
    )

    with StorageClient(container="rawsoadata", path="daily_") as cli:
        csv_files = filter(lambda x: x['name'].endswith("csv"), cli)
        cli.path = max(csv_files, key=lambda x: x['name'])['name']

        data_io = StringIO(
            cli.download().readall().decode()
            .replace("nhs-msoa", "msoa")
            .replace("-99", "")
        )

    raw_data = read_csv(data_io, low_memory=False, usecols=lambda x: x != "areaName")
    raw_data = raw_data.join(msoa_population, on=["areaCode"])

    return raw_data
def upload_tarfile(archive_path: Path, storage_dir: str, filename: str,
                   date: str, total_archived: int) -> NoReturn:
    """
    Uploads archived artefacts as a `tar.bz2` blob in the storage
    under a "Cool" tier.

    Parameters
    ----------
    archive_path: Path
        Path to the temp archive file.

    storage_dir: str
        Path to the directory in the `ARCHIVE_CONTAINER` where the archived
        artefacts are to be stored.

    filename: str
        Name by which to store the file in the storage.

    date: str
        Archive date - i.e. the date on which the archived data were generated.

    total_archived: int
        Total number of artefacts included in the archive.

    Returns
    -------
    NoReturn
    """
    logging.info("uploading the Tar archive")

    storage_kws = dict(
        container=ARCHIVE_CONTAINER,
        path=f"{storage_dir}/{filename}",
        compressed=False,
        tier='Cool'
    )

    with StorageClient(**storage_kws) as cli, \
            open(archive_path, 'rb') as fp:
        cli.upload(fp.read())

        cli.client.set_blob_metadata({
            "date": date,
            "generated_on": datetime.utcnow().isoformat(),
            "total_artefacts": str(total_archived)
        })

    logging.info(f"Tar archive uploaded: {storage_kws}")

    return None
async def download_file(container: str, path: str) -> DataFrame:
    logging.info(f"> Downloading data from '{container}/{path}'")

    with StorageClient(container=container, path=path) as client:
        if not client.exists():
            return DataFrame([], columns=["areaType", "areaCode", "date"])

        data = client.download()
        logging.info("> Download complete")

        data_io = BytesIO(data.readall())

    data_io.seek(0)
    return read_feather(data_io)
def get_timestamp_for(container, path, raw=True, date_only=False):
    with StorageClient(container=container, path=path) as cli:
        timestamp = max(cli, key=lambda x: x['last_modified'])['last_modified']

    if raw and not date_only:
        return timestamp.strftime(r"%Y-%m-%dT%H:%M:%S.%fZ")

    if raw and date_only:
        return timestamp.strftime(r"%Y-%m-%d")

    if not raw and not date_only:
        return timestamp

    return timestamp.date()
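
# The two flags select among four return shapes: ISO string, date string,
# `datetime`, or `date`. A hedged sketch with hypothetical container/path:
def _example_get_timestamp_for():
    iso_string = get_timestamp_for("pipeline", "info/", raw=True, date_only=False)
    datestamp = get_timestamp_for("pipeline", "info/", raw=True, date_only=True)
    dt_object = get_timestamp_for("pipeline", "info/", raw=False, date_only=False)
    date_obj = get_timestamp_for("pipeline", "info/", raw=False, date_only=True)
    logging.info(f"{iso_string} | {datestamp} | {dt_object} | {date_obj}")
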
def load_data(etl_data: Dict[str, str]) -> DataFrame:
    path = etl_data['path']
    logging.info(f"> Downloading data from '{path}'")

    with StorageClient(**PROCESSED_FILES_KWS, path=path) as client:
        if not client.exists():
            raise RuntimeError(f"Blob does not exist: {path}")

        data = client.download()
        logging.info("> Download complete")

        data_io = BytesIO(data.readall())

    data_io.seek(0)
    return read_feather(data_io, use_threads=False)
def store_data(date: str, metric: str, svg: str,
               area_type: str = None, area_code: str = None):
    kws = dict(
        container="downloads",
        content_type="image/svg+xml",
        cache_control="public, max-age=30, s-maxage=90, must-revalidate",
        compressed=False
    )

    path = f"homepage/{date}/thumbnail_{metric}.svg"
    if area_code is not None:
        path = f"homepage/{date}/{metric}/{area_type}/{area_code}_thumbnail.svg"

    with StorageClient(path=path, **kws) as cli:
        cli.upload(svg)
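
# Path selection sketch: without an area code the thumbnail is stored at the
# national level; with one, it is nested by metric and area. The metric and
# area values below are hypothetical:
def _example_store_data_svg(svg: str):
    # -> homepage/2021-11-01/thumbnail_newCasesByPublishDate.svg
    store_data("2021-11-01", "newCasesByPublishDate", svg)

    # -> homepage/2021-11-01/newCasesByPublishDate/ltla/E06000001_thumbnail.svg
    store_data("2021-11-01", "newCasesByPublishDate", svg,
               area_type="ltla", area_code="E06000001")
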