def check_deadletter_queues(
    slack_url: Optional[str] = None, log: Optional[logging.Logger] = None
):
    bad_queue_messages = []
    dead_queues = get_queues(contains="deadletter")
    environment = "Unknown"
    for dead_queue in dead_queues:
        queue_size = int(dead_queue.attributes.get("ApproximateNumberOfMessages", 0))
        if queue_size > 0:
            queue_name = dead_queue.url.split("/")[-1]
            try:
                environment = queue_name.split("-")[1].upper()
            except Exception:
                pass
            bad_queue_messages.append(f"Queue `{queue_name}` has {queue_size} items")

    if len(bad_queue_messages) > 0:
        bad_queues_str = "\n".join(f" - {q}" for q in bad_queue_messages)
        message = dedent(
            f"*Environment*: {environment}\n"
            f"Found {len(bad_queue_messages)} dead letter queues with messages:\n"
            f"{bad_queues_str}"
        )
        if log is not None:
            log.error(message)
        # Send a Slack message
        if slack_url is not None:
            send_slack_notification(slack_url, "Dead Letter Checker", message)
        sys.exit(1)

    # Exit with 0 if no errors
    sys.exit(0)
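# `get_queues` is a repo helper. Below is a minimal, hypothetical sketch of what
# it might look like, assuming it wraps the boto3 SQS resource and filters queue
# names by substring; the real helper's signature may differ. Each boto3 Queue
# resource exposes `.url` and lazily loaded `.attributes`, which is where
# `check_deadletter_queues` reads ApproximateNumberOfMessages from.
def _get_queues_sketch(contains: str = ""):
    """Hypothetical sketch: yield SQS Queue resources whose name contains `contains`."""
    import boto3  # assumed dependency

    sqs = boto3.resource("sqs")
    for queue in sqs.queues.all():
        if contains in queue.url.split("/")[-1]:
            yield queue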
def send_messages(
    idx: int,
    queue_name: str,
    max_workers: int = 2,
    limit: Optional[int] = None,
    slack_url: Optional[str] = None,
) -> None:
    """
    Publish a list of missing scenes to a specific queue and, once finished,
    optionally notify Slack of the result.

    :param idx: (int) sequential index used to define the range of scenes this POD will work on
    :param queue_name: (str) queue to send the messages to
    :param max_workers: (int) total number of PODs used for the task; used to split the scenes equally among them
    :param limit: (int) optional limit on the number of messages read from the report
    :param slack_url: (str) optional Slack URL if you want to send a Slack notification
    """
    log = setup_logging()

    latest_report = find_latest_report(
        report_folder_path=S3_BUCKET_PATH, not_contains="orphaned"
    )

    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")

    log.info(f"Limit: {int(limit) if limit else 'No limit'}")
    log.info(f"Number of workers: {max_workers}")

    files = read_report_missing_scenes(report_path=latest_report, limit=limit)

    log.info(f"Number of scenes found {len(files)}")
    log.info(f"Example scenes: {files[0:10]}")

    # Split scenes equally among the workers
    split_list_scenes = split_list_equally(
        list_to_split=files, num_inter_lists=int(max_workers)
    )

    # If the index is beyond the number of sub-lists, this extra POD isn't necessary
    if len(split_list_scenes) <= idx:
        log.warning(f"Worker {idx} Skipped!")
        sys.exit(0)

    log.info(f"Executing worker {idx}")
    messages = prepare_message(scene_paths=split_list_scenes[idx], log=log)

    queue = get_queue(queue_name=queue_name)

    batch = []
    failed = 0
    sent = 0
    error_list = []
    for message in messages:
        batch.append(message)
        # SQS batches are limited to 10 messages per request
        if len(batch) == 10:
            try:
                publish_messages(queue=queue, messages=batch)
                sent += len(batch)
            except Exception as exc:
                failed += len(batch)
                error_list.append(exc)
            batch = []

    # Flush the remaining partial batch
    if len(batch) > 0:
        try:
            publish_messages(queue=queue, messages=batch)
            sent += len(batch)
        except Exception as exc:
            failed += len(batch)
            error_list.append(exc)

    environment = "DEV" if "dev" in queue_name else "PDS"
    error_flag = ":red_circle:" if failed > 0 else ""

    message = dedent(
        f"{error_flag}*Sentinel 2 GAP Filler - {environment}*\n"
        f"Sent Messages: {sent}\n"
        f"Failed Messages: {failed}\n"
    )
    if slack_url is not None:
        send_slack_notification(slack_url, "S2 Gap Filler", message)
    log.info(message)
    if failed > 0:
        sys.exit(1)
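# Batching by 10 above matches the SQS SendMessageBatch limit of 10 entries per
# call. A minimal, hypothetical sketch of what `publish_messages` might do with
# a boto3 Queue resource (the repo helper may differ); message bodies are
# assumed to be strings prepared by `prepare_message`.
def _publish_messages_sketch(queue, messages) -> None:
    """Hypothetical sketch: send up to 10 message bodies as one SQS batch."""
    import uuid

    entries = [{"Id": str(uuid.uuid4()), "MessageBody": str(body)} for body in messages]
    response = queue.send_messages(Entries=entries)
    # Surface partial failures so the caller's sent/failed accounting stays honest
    if response.get("Failed"):
        raise RuntimeError(f"Failed to publish: {response['Failed']}")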
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences.
    A report containing missing keys will be written to s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) bucket where the gap report is stored
    :param update_stac: (bool) if True, the report will contain all scenes from the source, for a forced update
    :param notification_url: (str) optional Slack URL if you want to send a Slack notification
    """
    log = setup_logging()

    log.info("Task started")

    # Defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from the inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys)
        orphaned_keys = set()
    else:
        destination_keys = set(
            ns.Key
            for ns in list_inventory(
                manifest=f"{SENTINEL_2_INVENTORY_PATH}",
                prefix=BASE_FOLDER_NAME,
                contains=".json",
                n_threads=200,
            )
        )

        # Missing keys: present in the source but not in the destination bucket
        missing_scenes = set(
            f"s3://sentinel-cogs/{key}"
            for key in source_keys
            if key not in destination_keys
        )

        # Orphaned keys: present in the destination bucket but not found in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (
            f"{date_string}_gap_report.json"
            if not update_stac
            else f"{date_string}_gap_report_update.json"
        )

        log.info(f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps(
            {"orphan": list(orphaned_keys), "missing": list(missing_scenes)}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(s2_status_report_path / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = (
        f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com"
        f"/status-report/{output_filename}"
    )
    message = dedent(
        f"*SENTINEL 2 GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_keys)}\n"
        f"Report: {report_http_link}\n"
    )

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
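# The missing/orphan split above is plain set arithmetic. A toy illustration of
# the same logic (the keys are made up):
def _gap_split_example() -> None:
    source_keys = {"scenes/a.json", "scenes/b.json", "scenes/c.json"}
    destination_keys = {"scenes/b.json", "scenes/c.json", "scenes/d.json"}

    missing = source_keys.difference(destination_keys)   # in source only -> copy over
    orphaned = destination_keys.difference(source_keys)  # in destination only -> investigate
    assert missing == {"scenes/a.json"} and orphaned == {"scenes/d.json"}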
def fill_the_gap(
    landsat: str,
    sync_queue_name: str,
    scenes_limit: Optional[int] = None,
    notification_url: str = None,
) -> None:
    """
    Retrieve the latest gap report and send messages to the queue that feeds the filter process.

    :param landsat: (str) satellite name
    :param sync_queue_name: (str) queue name
    :param scenes_limit: (int) limit on how many scenes will be filled
    :param notification_url: (str) Slack notification URL
    :return: (None)
    """
    log = setup_logging()

    log.info(f"Satellite: {landsat}")
    log.info(f"Queue: {sync_queue_name}")
    log.info(f"Limit: {scenes_limit if scenes_limit else 'No limit'}")
    log.info(f"Notification URL: {notification_url}")

    environment = "DEV" if "dev" in sync_queue_name else "PDS"

    latest_report = find_latest_report(
        report_folder_path=S3_BUCKET_PATH, contains=landsat
    )

    if not latest_report:
        raise RuntimeError("Report not found!")

    update_stac = False
    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")
        update_stac = True

    log.info(f"Reading missing scenes from the report {latest_report}")

    missing_scene_paths = read_report_missing_scenes(
        report_path=latest_report, limit=scenes_limit
    )

    log.info(f"Number of scenes found {len(missing_scene_paths)}")
    log.info(f"Example scenes: {missing_scene_paths[0:10]}")

    returned = build_messages(
        missing_scene_paths=missing_scene_paths, update_stac=update_stac
    )

    messages_to_send = returned["message_list"]

    log.info("Publishing messages")
    result = post_messages(message_list=messages_to_send, queue_name=sync_queue_name)

    error_flag = (
        ":red_circle:" if result["failed"] > 0 or len(returned["failed"]) > 0 else ""
    )
    extra_issues = "\n".join(returned["failed"])

    message = dedent(
        f"{error_flag}*Landsat GAP Filler - {environment}*\n"
        f"Sent Messages: {result['sent']}\n"
        f"Failed Messages: {int(result['failed']) + len(returned['failed'])}\n"
        f"Failed sending: {int(result['failed'])}\n"
        f"Other issues presented: {extra_issues}"
    )

    log.info(message)

    if notification_url is not None and result["sent"] > 0:
        send_slack_notification(notification_url, "Landsat Gap Filler", message)

    if (int(result["failed"]) + len(returned["failed"])) > 0:
        sys.exit(1)
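# `build_messages` and `post_messages` are repo helpers. From how their results
# are consumed above, the return shapes must look roughly like this hypothetical
# contract (not the actual implementations; the payload fields are illustrative):
def _build_messages_contract(missing_scene_paths, update_stac) -> dict:
    """Hypothetical: turn scene paths into queue payloads, collecting per-scene failures."""
    message_list, failed = [], []
    for path in missing_scene_paths:
        try:
            message_list.append({"Message": {"url": str(path), "update_stac": update_stac}})
        except Exception as exc:
            failed.append(f"{path} {exc}")
    return {"message_list": message_list, "failed": failed}


def _post_messages_contract(message_list, queue_name) -> dict:
    """Hypothetical: publish payloads to `queue_name`, returning sent/failed counts."""
    return {"sent": len(message_list), "failed": 0}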
def gmw_download_stac_cog(year: str, s3_dst: str, slack_url: str = None) -> None:
    """
    Mangrove download, COG and STAC process
    """
    gmw_shp = ""
    try:
        if year not in VALID_YEARS:
            raise ValueError(
                f"Chosen year {year} is not valid, please choose from one of {VALID_YEARS}"
            )
        log.info(f"Starting GMW downloader for year {year}")

        log.info("Downloading extents if needed")
        gmw_shp = f"GMW_001_GlobalMangroveWatch_{year}/01_Data/GMW_{year}_v2.shp"
        local_filename = FILE_NAME.format(year=year)
        if not os.path.exists(LOCAL_DIR / gmw_shp):
            gmw_shp = download_and_unzip_gmw(local_filename=local_filename)

        local_extracted_file_path = LOCAL_DIR / gmw_shp
        output_file = LOCAL_DIR / gmw_shp.replace(".shp", ".tif")

        log.info(f"Output TIF file is {output_file}")
        log.info(f"Extracted SHP file is {local_extracted_file_path}")

        log.info("Starting gdal_rasterize")
        cmd = (
            "gdal_rasterize "
            "-a_nodata 0 "
            "-ot Byte "
            "-a pxlval "
            "-of GTiff "
            "-tr 0.0002 0.0002 "
            "-te -26.36 -47.97 64.50 38.35 "
            f"{local_extracted_file_path} {output_file}"
        )
        check_output(cmd, stderr=STDOUT, shell=True)
        log.info(f"File {output_file} rasterized successfully")

        # Create cloud optimised GeoTIFF
        cloud_optimised_file = LOCAL_DIR / f"deafrica_gmw_{year}.tif"
        cmd = (
            f"rio cogeo create --overview-resampling nearest "
            f"{output_file} {cloud_optimised_file}"
        )
        check_output(cmd, stderr=STDOUT, shell=True)
        log.info(f"File {cloud_optimised_file} cloud optimised successfully")

        create_and_upload_stac(cog_file=cloud_optimised_file, s3_dst=s3_dst, year=year)

        # All done!
        log.info(f"Completed work on {s3_dst}/{year}")

    except Exception as e:
        message = f"Failed to handle GMW {gmw_shp} with error {e}"

        if slack_url is not None:
            send_slack_notification(slack_url, "GMW", message)

        log.exception(message)
        sys.exit(1)
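# `download_and_unzip_gmw` is a repo helper. A minimal sketch of the idea,
# assuming a hypothetical GMW_URL base and that the archive extracts under
# LOCAL_DIR (the URL template and return value are illustrative only):
def _download_and_unzip_gmw_sketch(local_filename: str) -> str:
    """Hypothetical sketch: fetch the GMW zip, extract it, return the .shp member path."""
    import shutil
    import zipfile
    from urllib.request import urlopen

    zip_path = LOCAL_DIR / local_filename
    # GMW_URL is an assumed constant, not the repo's
    with urlopen(f"{GMW_URL}/{local_filename}") as resp, open(zip_path, "wb") as out:
        shutil.copyfileobj(resp, out)
    with zipfile.ZipFile(zip_path) as z:
        z.extractall(LOCAL_DIR)
        return next(name for name in z.namelist() if name.endswith(".shp"))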
def generate_buckets_diff(
    bucket_name: str,
    satellites: list,
    file_name: str,
    update_stac: bool = False,
    notification_url: str = None,
):
    """
    Compare the USGS bulk files and the Africa inventory bucket, detecting differences.
    A report containing missing keys will be written to AFRICA_S3_BUCKET_PATH
    """
    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/"
    )
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Retrieve keys from the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)
    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download bulk file
    log.info("Downloading bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL), file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)
    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 first {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []
    else:
        # Missing scenes: keys in the bulk file but absent from the PDS sync bucket and/or the source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # Orphan scenes: keys in the PDS sync bucket but absent from the bulk file and/or the source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

        log.info(f"Found {len(missing_scenes)} missing scenes")
        log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
        log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
        log.info(f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        output_filename = (
            (
                f"{title}_{date_string}_gap_report.json"
                if not update_stac
                else f"{date_string}_gap_report_update.json"
            )
            .replace(" ", "_")
            .replace("_&", "")
        )

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps(
            {"orphan": orphaned_scenes, "missing": missing_scenes}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (
        str(landsat_status_report_url / output_filename)
        if len(missing_scenes) > 0 or len(orphaned_scenes) > 0
        else output_filename
    )

    message = dedent(
        f"*{title} GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_scenes)}\n"
        f"Report: {report_output}\n"
    )

    log.info(message)
    log.info(f"File {file_name} processed and sent in {time_process(start=start_timer)}")

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_scenes) > 200):
        if notification_url is not None:
            send_slack_notification(
                notification_url, f"{satellites} Gap Report", message
            )
        raise Exception(f"More than 200 scenes were found \n {message}")
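# `time_process` is a repo helper. A plausible sketch, assuming it formats the
# elapsed wall-clock time since `start` for the log line above:
def _time_process_sketch(start: float) -> str:
    """Hypothetical sketch: elapsed seconds since `start`, formatted as H:MM:SS."""
    from datetime import timedelta

    return str(timedelta(seconds=round(time.time() - start)))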
def download_and_cog_chirps(
    year: str,
    month: str,
    s3_dst: str,
    day: str = None,
    overwrite: bool = False,
    slack_url: str = None,
):
    # Cleaning and sanity checks
    s3_dst = s3_dst.rstrip("/")

    # Set up file strings
    if day is not None:
        # Set up a daily process
        in_file = f"chirps-v2.0.{year}.{month}.{day}.tif.gz"
        in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.{day}.tif"
            in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
            in_data = f"/vsicurl/{in_href}"
            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)
        file_base = f"{s3_dst}/{year}/{month}/chirps-v2.0_{year}.{month}.{day}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        start_datetime = f"{year}-{month}-{day}T00:00:00Z"
        end_datetime = f"{year}-{month}-{day}T23:59:59Z"
        product_name = "rainfall_chirps_daily"
    else:
        # Set up a monthly process
        in_file = f"chirps-v2.0.{year}.{month}.tif.gz"
        in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.tif"
            in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
            in_data = f"/vsicurl/{in_href}"
            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)
        file_base = f"{s3_dst}/chirps-v2.0_{year}.{month}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        _, end = calendar.monthrange(int(year), int(month))
        start_datetime = f"{year}-{month}-01T00:00:00Z"
        end_datetime = f"{year}-{month}-{end}T23:59:59Z"
        product_name = "rainfall_chirps_monthly"

        # Set to the 15th for the nominal datetime in the STAC metadata
        day = 15

    try:
        # Check if the file already exists
        log.info(f"Working on {in_file}")
        if not overwrite and s3_head_object(out_stac) is not None:
            log.warning(f"File {out_stac} already exists. Skipping.")
            return

        # COG and STAC
        with MemoryFile() as mem_dst:
            # Create the COG with an in-memory cache and no download. Shiny.
            cog_translate(
                in_data,
                mem_dst.name,
                cog_profiles.get("deflate"),
                in_memory=True,
                nodata=-9999,
            )
            # Create the STAC document with the appropriate date range
            item = create_stac_item(
                mem_dst,
                id=str(odc_uuid("chirps", "2.0", [in_file])),
                with_proj=True,
                input_datetime=datetime(int(year), int(month), int(day)),
                properties={
                    "odc:processing_datetime": datetime_to_str(datetime.now()),
                    "odc:product": product_name,
                    "start_datetime": start_datetime,
                    "end_datetime": end_datetime,
                },
            )
            item.set_self_href(out_stac)
            # Manually redo the asset
            del item.assets["asset"]
            item.assets["rainfall"] = pystac.Asset(
                href=out_data,
                title="CHIRPS-v2.0",
                media_type=pystac.MediaType.COG,
                roles=["data"],
            )
            # Add a link to the source file
            item.add_links(
                [
                    pystac.Link(
                        target=in_href,
                        title="Source file",
                        rel=pystac.RelType.DERIVED_FROM,
                        media_type="application/gzip",
                    )
                ]
            )

            # Dump the data to S3
            mem_dst.seek(0)
            log.info(f"Writing DATA to: {out_data}")
            s3_dump(mem_dst, out_data, ACL="bucket-owner-full-control")
            # Write STAC to S3
            log.info(f"Writing STAC to: {out_stac}")
            s3_dump(
                json.dumps(item.to_dict(), indent=2),
                out_stac,
                ContentType="application/json",
                ACL="bucket-owner-full-control",
            )
            # All done!
            log.info(f"Completed work on {in_file}")

    except Exception as e:
        message = f"Failed to handle {in_file} with error {e}"

        if slack_url is not None:
            send_slack_notification(slack_url, "Chirps Rainfall Monthly", message)

        log.exception(message)
        sys.exit(1)