def get_orphans():
    """
    Collect orphan scene paths from the latest Landsat gap reports
    """
    s3 = s3_client(region_name=DEAFRICA_AWS_REGION)

    print("Finding Orphans")
    report_files = list(s3_ls_dir(uri=DEAFRICA_GAP_REPORT_S3_PATH, s3=s3))
    report_files_json = [
        report_file for report_file in report_files if report_file.endswith(".json")
    ]

    # Fetch the latest report for each of Landsat 5, Landsat 7 and Landsat 8
    report_files_json.sort()
    landsat_8_report = [
        report_file for report_file in report_files_json if "landsat_8" in report_file
    ][-1]
    landsat_7_report = [
        report_file for report_file in report_files_json if "landsat_7" in report_file
    ][-1]
    landsat_5_report = [
        report_file for report_file in report_files_json if "landsat_5" in report_file
    ][-1]

    # Collect orphan paths from each report
    list_orphan_paths = []
    for report in [landsat_5_report, landsat_7_report, landsat_8_report]:
        file = s3_fetch(url=report, s3=s3)
        dict_file = json.loads(file)
        orphans = set(dict_file.get("orphan"))
        print(f"Collected orphan scenes from {report}: {len(orphans)}")
        list_orphan_paths.extend(orphans)

    return list_orphan_paths
def prepare_message(scene_paths: list, log: Optional[logging.Logger] = None):
    """
    Prepare a single message for each STAC file
    """
    s3 = s3_client(region_name=SOURCE_REGION)

    message_id = 0
    for s3_path in scene_paths:
        try:
            contents = s3_fetch(url=s3_path, s3=s3)
            contents_dict = json.loads(contents)

            attributes = get_common_message_attributes(contents_dict)

            message = {
                "Id": str(message_id),
                "MessageBody": json.dumps(
                    {
                        "Message": json.dumps(contents_dict),
                        "MessageAttributes": attributes,
                    }
                ),
            }
            message_id += 1
            yield message
        except Exception as exc:
            if log:
                log.error(f"{s3_path} does not exist - {exc}")
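# Usage sketch (not part of the original module): prepare_message() yields dicts that
# already match the SQS batch-entry shape ("Id" + "MessageBody"), so they can be
# grouped into batches of up to 10 and pushed with boto3's send_message_batch.
# The queue URL below is a placeholder.
import boto3


def send_messages_sketch(
    scene_paths: list,
    queue_url: str = "https://sqs.af-south-1.amazonaws.com/000000000000/example-queue",
):
    sqs = boto3.client("sqs")
    batch = []
    for message in prepare_message(scene_paths):
        batch.append(message)
        if len(batch) == 10:  # SQS accepts at most 10 entries per batch request
            sqs.send_message_batch(QueueUrl=queue_url, Entries=batch)
            batch = []
    if batch:
        sqs.send_message_batch(QueueUrl=queue_url, Entries=batch)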
def get_and_filter_cogs_keys():
    """
    Retrieve the key list from an inventory bucket and filter it

    :return: set of S3 keys for scenes within the Africa extent
    """
    s3 = s3_client(region_name=SOURCE_REGION)
    source_keys = list_inventory(
        manifest=f"{SOURCE_INVENTORY_PATH}",
        s3=s3,
        prefix=BASE_FOLDER_NAME,
        contains=".json",
        n_threads=200,
    )

    africa_tile_ids = set(
        pd.read_csv(
            "https://raw.githubusercontent.com/digitalearthafrica/deafrica-extent/master/deafrica-mgrs-tiles.csv.gz",
            header=None,
        ).values.ravel()
    )

    return set(
        key.Key
        for key in source_keys
        if (
            key.Key.split("/")[-2].split("_")[1] in africa_tile_ids
            # We need to ensure we're ignoring the old format data
            and re.match(r"sentinel-s2-l2a-cogs/\d{4}/", key.Key) is None
        )
    )
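# Worked example (illustrative only) of the filter above, assuming the usual
# sentinel-cogs key layout "<collection>/<utm>/<band>/<square>/<year>/<month>/<scene>/<file>":
# the second-to-last path segment is the scene ID, and its second "_" field is the MGRS
# tile that gets checked against africa_tile_ids; the regex excludes the old
# "<collection>/<year>/" layout.
example_key = (
    "sentinel-s2-l2a-cogs/30/U/XD/2021/5/"
    "S2A_30UXD_20210510_0_L2A/S2A_30UXD_20210510_0_L2A.json"
)
scene_id = example_key.split("/")[-2]  # "S2A_30UXD_20210510_0_L2A"
tile_id = scene_id.split("_")[1]       # "30UXD"
assert re.match(r"sentinel-s2-l2a-cogs/\d{4}/", example_key) is None  # not old-format data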
def cli(file_list, no_sign_request=None):
    global s3
    s3 = s3_client(aws_unsigned=no_sign_request)

    urls = [line.rstrip() for line in file_list.readlines()]

    for url in tqdm(urls):
        if not url:
            continue
        tqdm.write(f"Updating {url}", end="")
        replace_in_s3_obj(url)
def check_scene_exist_in_source(path: str):
    """
    Check whether the scene exists in the USGS source bucket
    """
    s3 = s3_client(region_name=USGS_AWS_REGION)

    usgs_path = path.replace(
        f"s3://{DEAFRICA_LANDSAT_BUCKET_NAME}", f"s3://{USGS_S3_BUCKET_NAME}"
    )

    returned = set(s3_ls(usgs_path, s3=s3, **{"RequestPayer": "requester"}))
    return bool(returned)
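# Usage sketch (illustrative, not from the original scripts): keep only the orphan
# paths collected by get_orphans() that genuinely have no counterpart in the USGS
# source bucket before acting on them.
def confirm_orphans_sketch():
    confirmed_orphans = [
        path for path in get_orphans() if not check_scene_exist_in_source(path)
    ]
    print(f"{len(confirmed_orphans)} orphans have no matching scene in the USGS bucket")
    return confirmed_orphans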
def cli(
    inventory,
    prefix,
    regex,
    glob,
    aws_profile,
    no_sign_request=None,
    request_payer=False,
):
    """List S3 inventory entries.

    prefix can be combined with regex or glob pattern, but supplying both
    regex and glob doesn't make sense.

    \b
    Example:
       s3-inventory s3://my-inventory-bucket/path-to-inventory/ '*yaml'
    """

    def entry_to_url(entry):
        return "s3://{e.Bucket}/{e.Key}".format(e=entry)

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    flush_freq = 100
    s3 = s3_client(profile=aws_profile, aws_unsigned=no_sign_request)

    if glob == "":
        glob = None

    if glob is not None and regex is not None:
        click.echo("Can not mix regex and shell patterns")
        sys.exit(1)

    if inventory is None:
        # TODO: read from config file
        inventory = "s3://dea-public-data-inventory/dea-public-data/dea-public-data-csv-inventory/"

    predicate = build_predicate(glob=glob, regex=regex, prefix=prefix)
    to_str = entry_to_url

    for i, entry in enumerate(list_inventory(inventory, s3=s3, **opts)):
        if predicate(entry):
            print(to_str(entry), flush=(i % flush_freq) == 0)
def publish_to_s3(data: list, output_filename: str, content_type: str = "text/plain"):
    """
    Write the report to S3
    """
    s3 = s3_client(region_name=DEAFRICA_AWS_REGION)

    s3_dump(
        data=data,
        url=str(DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename),
        s3=s3,
        ContentType=content_type,
    )
    print(
        f"Report can be accessed from {DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename}"
    )
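# Usage sketch (illustrative): dump a confirmed orphan list as a dated plain-text
# report using publish_to_s3() above; the filename pattern is an assumption.
def publish_orphans_sketch(orphan_paths: list):
    output_filename = f"{datetime.now().strftime('%Y-%m-%d')}_orphaned_scenes.txt"
    publish_to_s3(
        data="\n".join(orphan_paths).encode("utf-8"),
        output_filename=output_filename,
    )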
def main(s3_urls, workers):
    """
    Script to sync Sentinel-2 data from the NCI to an AWS S3 bucket.

    Pass in a file containing the destination S3 urls that need to be uploaded.
    """
    setup_logging()

    global S3
    S3 = s3_client()

    urls_to_upload = [url.strip() for url in s3_urls.readlines()]
    _LOG.info(f"{len(urls_to_upload)} datasets to upload.")

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(upload_dataset, s3_url) for s3_url in urls_to_upload
        ]

        for future in tqdm(
            as_completed(futures),
            total=len(urls_to_upload),
            unit="datasets",
            disable=None,
        ):
            _LOG.info(f"Completed upload: {future.result()}")
def read_report_missing_scenes(report_path: str, limit=None):
    """
    Read the gap report and return the missing scene paths
    """
    s3 = s3_client(region_name="af-south-1")

    report_json = s3_fetch(url=report_path, s3=s3)
    report_dict = json.loads(report_json)

    if report_dict.get("missing", None) is None:
        raise Exception("Missing scenes not found")

    missing_scene_paths = [
        scene_path.strip() for scene_path in report_dict["missing"] if scene_path
    ]

    if limit:
        missing_scene_paths = missing_scene_paths[: int(limit)]

    return missing_scene_paths
def find_latest_report(
    report_folder_path: str, contains: str = None, not_contains: str = None
) -> str:
    """
    Find the latest gap report

    :return: (str) the latest report file name
    """
    s3 = s3_client(region_name="af-south-1")

    report_files = list(s3_ls_dir(uri=report_folder_path, s3=s3))

    if contains is not None:
        report_files = [report for report in report_files if contains in report]

    if not_contains is not None:
        report_files = [
            report for report in report_files if not_contains not in report
        ]

    report_files.sort()

    if not report_files:
        raise RuntimeError("Report not found!")

    return report_files[-1]
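# Usage sketch (illustrative; the bucket path is a placeholder): pick the newest gap
# report that is not an "update" report, then load a capped list of missing scenes
# with read_report_missing_scenes() above.
def latest_missing_scenes_sketch(limit: int = 100):
    report_path = find_latest_report(
        report_folder_path="s3://example-landsat-bucket/status-report/",
        not_contains="update",
    )
    return read_report_missing_scenes(report_path, limit=limit)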
from odc.aws.inventory import find_latest_manifest, list_inventory
from odc.aws import s3_head_object, s3_client

INVENTORY_BUCKET = "deafrica-sentinel-2-inventory"
PREFIX = "deafrica-sentinel-2/deafrica-sentinel-2-inventory/"
DO_FIX = False

if DO_FIX:
    client = s3_client(region_name="af-south-1")
else:
    client = s3_client(aws_unsigned=True, region_name="af-south-1")

manifest = find_latest_manifest(
    f"s3://{INVENTORY_BUCKET}/{PREFIX}",
    client,
)
inventory = list_inventory(manifest, s3=client)

report_every = 10000
count = 0
json_docs = 0
to_fix = 0

for obj in inventory:
    count += 1
    if count % report_every == 0:
        print(f"Processing {count}")

    if obj.Key.endswith(".json"):
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = not (not exists or overwrite)
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")
        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = not (not exists or overwrite)
            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(
                    f"File exists, and overwrite is False. Not writing {out_file}"
                )
            else:
                log.info(f"Finished writing: {asset.href}")
            # Aggressively heavy handed, but we get memory leaks otherwise
            client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    client = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=client,
    )
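# Usage sketch (illustrative only): the product, band and output names below are
# placeholders, not values taken from the original script.
def run_mosaic_sketch():
    dc = Datacube()
    create_mosaic(
        dc=dc,
        product="example_product",
        out_product="example_product_120m",
        time=("2021-01-01", "2021-12-31"),
        time_str="2021",
        bands=("red", "green", "blue"),
        s3_output_root="s3://example-bucket/mosaics/",
        split_bands=True,
        resolution=120,
        overwrite=False,
    )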
def list_inventory(
    manifest,
    s3=None,
    prefix: str = "",
    suffix: str = "",
    contains: str = "",
    multiple_contains: tuple[str, str] = None,
    n_threads: int = None,
    **kw,
):
    """
    Returns a generator of inventory records

    manifest -- s3:// url to manifest.json or a folder, in which case the latest one is chosen.

    :param manifest: (str)
    :param s3: (aws client)
    :param prefix: (str)
    :param suffix: (str)
    :param contains: (str)
    :param multiple_contains: (tuple(str, str)) allow multiple substrings to be matched
    :param n_threads: (int) number of threads; if not set, threads are not used
    :return: SimpleNamespace
    """
    s3 = s3 or s3_client()

    if manifest.endswith("/"):
        manifest = find_latest_manifest(manifest, s3, **kw)

    info = s3_fetch(manifest, s3=s3, **kw)
    info = json.loads(info)

    must_have_keys = {"fileFormat", "fileSchema", "files", "destinationBucket"}
    missing_keys = must_have_keys - set(info)
    if missing_keys:
        raise ValueError("Manifest file hasn't been parsed correctly")

    if info["fileFormat"].upper() != "CSV":
        raise ValueError("Data is not in CSV format")

    s3_prefix = "s3://" + info["destinationBucket"].split(":")[-1] + "/"
    data_urls = [s3_prefix + f["key"] for f in info["files"]]
    schema = tuple(info["fileSchema"].split(", "))

    if n_threads:
        with ThreadPoolExecutor(max_workers=n_threads) as executor:
            tasks = [
                executor.submit(retrieve_manifest_files, key, s3, schema)
                for key in data_urls
            ]

            for future in as_completed(tasks):
                for namespace in future.result():
                    key = namespace.Key
                    if test_key(
                        key,
                        prefix=prefix,
                        suffix=suffix,
                        contains=contains,
                        multiple_contains=multiple_contains,
                    ):
                        yield namespace
    else:
        for u in data_urls:
            for namespace in retrieve_manifest_files(u, s3, schema):
                key = namespace.Key
                if test_key(
                    key,
                    prefix=prefix,
                    suffix=suffix,
                    contains=contains,
                    multiple_contains=multiple_contains,
                ):
                    yield namespace
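# Usage sketch (illustrative; the manifest URL is a placeholder): stream inventory
# records through list_inventory() and count the JSON documents under a prefix.
def count_json_keys_sketch():
    records = list_inventory(
        manifest="s3://example-inventory-bucket/example-prefix/",
        prefix="sentinel-s2-l2a-cogs/",
        contains=".json",
        n_threads=50,
    )
    return sum(1 for _ in records)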
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences
    A report containing missing keys will be written to s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is
    :param update_stac: (bool) Define if the report will contain all scenes from the source for an update
    :param notification_url: (str) Optional Slack URL in case you want to send a Slack notification
    """
    log = setup_logging()

    log.info("Task started")

    # Defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from the inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys)
        orphaned_keys = set()
    else:
        destination_keys = set(
            ns.Key
            for ns in list_inventory(
                manifest=f"{SENTINEL_2_INVENTORY_PATH}",
                prefix=BASE_FOLDER_NAME,
                contains=".json",
                n_threads=200,
            )
        )

        # Missing keys: they are in the source but not in the destination bucket
        missing_scenes = set(
            f"s3://sentinel-cogs/{key}"
            for key in source_keys
            if key not in destination_keys
        )

        # Orphaned keys: they are in the destination bucket but not found in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (
            f"{date_string}_gap_report.json"
            if not update_stac
            else URL(f"{date_string}_gap_report_update.json")
        )

        log.info(f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps(
            {"orphan": list(orphaned_keys), "missing": list(missing_scenes)}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/status-report/{output_filename}"
    message = dedent(
        f"*SENTINEL 2 GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_keys)}\n"
        f"Report: {report_http_link}\n"
    )

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
def generate_buckets_diff(
    bucket_name: str,
    satellites: str,
    file_name: str,
    update_stac: bool = False,
    notification_url: str = None,
):
    """
    Compare USGS bulk files and the Africa inventory bucket, detecting differences.
    A report containing missing keys will be written to AFRICA_S3_BUCKET_PATH.
    """
    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/"
    )
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Create connection to the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)

    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")
    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download the bulk file
    log.info("Download Bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL), file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)

    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 First {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []
    else:
        # Collect missing scenes:
        # keys that are in the bulk file but missing in the PDS sync bucket and/or in the source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # Collect orphan scenes:
        # keys that are in the PDS sync bucket but missing in the bulk file and/or in the source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

        log.info(f"Found {len(missing_scenes)} missing scenes")
        log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
        log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
        log.info(f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        output_filename = (
            (
                f"{title}_{date_string}_gap_report.json"
                if not update_stac
                else URL(f"{date_string}_gap_report_update.json")
            )
            .replace(" ", "_")
            .replace("_&", "")
        )

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps(
            {"orphan": orphaned_scenes, "missing": missing_scenes}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (
        str(landsat_status_report_url / output_filename)
        if len(missing_scenes) > 0 or len(orphaned_scenes) > 0
        else output_filename
    )

    message = dedent(
        f"*{title} GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_scenes)}\n"
        f"Report: {report_output}\n"
    )

    log.info(message)

    log.info(
        f"File {file_name} processed and sent in {time_process(start=start_timer)}"
    )

    if not update_stac and (
        len(missing_scenes) > 200 or len(orphaned_scenes) > 200
    ):
        if notification_url is not None:
            send_slack_notification(
                notification_url, f"{satellites} Gap Report", message
            )
        raise Exception(f"More than 200 scenes were found \n {message}")