Code Example #1
def check_deadletter_queues(
    slack_url: Optional[str] = None, log: Optional[logging.Logger] = None
):
    """
    Check every SQS queue whose name contains "deadletter" and, if any of them
    holds messages, log the details, optionally notify Slack and exit with 1.
    """
    bad_queue_messages = []
    dead_queues = get_queues(contains="deadletter")
    environment = "Unknown"
    for dead_queue in dead_queues:
        queue_size = int(dead_queue.attributes.get("ApproximateNumberOfMessages", 0))
        if queue_size > 0:
            queue_name = dead_queue.url.split("/")[-1]
            try:
                environment = queue_name.split("-")[1].upper()
            except Exception:
                pass
            bad_queue_messages.append(f"Queue `{queue_name}` has {queue_size} items")

    if len(bad_queue_messages) > 0:
        bad_queues_str = "\n".join(f" - {q}" for q in bad_queue_messages)
        message = dedent(
            f"*Environment*: {environment}\n "
            f"Found {len(bad_queue_messages)} dead queues with messages:\n"
            f"{bad_queues_str}"
        )
        if log is not None:
            log.error(message)
        # Send a Slack message
        if slack_url is not None:
            send_slack_notification(slack_url, "Dead Letter Checker", message)
        sys.exit(1)

    # Exit with 0 if no errors
    sys.exit(0)
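
A minimal usage sketch for the checker above, assuming it runs as a standalone script; the environment variable name and logger name below are illustrative, not part of the original:

import logging
import os

if __name__ == "__main__":
    # Illustrative wiring only: the environment variable name and logger name
    # are assumptions; check_deadletter_queues() exits with 0 or 1 itself.
    logging.basicConfig(level=logging.INFO)
    check_deadletter_queues(
        slack_url=os.environ.get("SLACK_WEBHOOK_URL"),  # assumed variable name
        log=logging.getLogger("deadletter-check"),
    )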
Code Example #2
def send_messages(
    idx: int,
    queue_name: str,
    max_workers: int = 2,
    limit: int = None,
    slack_url: str = None,
) -> None:
    """
    Publish a list of missing scenes to an specific queue and by the end of that it's able to notify slack the result

    :param limit: (int) optional limit of messages to be read from the report
    :param max_workers: (int) total number of pods used for the task. This number is used to split the number of scenes
    equally among the PODS
    :param idx: (int) sequential index which will be used to define the range of scenes that the POD will work with
    :param queue_name: (str) queue to be sens to
    :param slack_url: (str) Optional slack URL in case of you want to send a slack notification
    """
    log = setup_logging()

    latest_report = find_latest_report(
        report_folder_path=S3_BUCKET_PATH, not_contains="orphaned"
    )

    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")

    log.info(f"Limited: {int(limit) if limit else 'No limit'}")
    log.info(f"Number of workers: {max_workers}")

    files = read_report_missing_scenes(report_path=latest_report, limit=limit)

    log.info(f"Number of scenes found {len(files)}")
    log.info(f"Example scenes: {files[0:10]}")

    # Split scenes equally among the workers
    split_list_scenes = split_list_equally(
        list_to_split=files, num_inter_lists=int(max_workers)
    )

    # If the index is larger than the number of chunks, this extra POD isn't needed
    if len(split_list_scenes) <= idx:
        log.warning(f"Worker {idx} Skipped!")
        sys.exit(0)

    log.info(f"Executing worker {idx}")
    messages = prepare_message(scene_paths=split_list_scenes[idx], log=log)

    queue = get_queue(queue_name=queue_name)

    batch = []
    failed = 0
    sent = 0
    error_list = []
    for message in messages:
        batch.append(message)
        # SQS batch sends are limited to 10 messages per request
        if len(batch) == 10:
            try:
                publish_messages(queue=queue, messages=batch)
                sent += len(batch)
            except Exception as exc:
                failed += len(batch)
                error_list.append(exc)
            batch = []

    if len(batch) > 0:
        try:
            publish_messages(queue=queue, messages=batch)
            sent += len(batch)
        except Exception as exc:
            failed += len(batch)
            error_list.append(exc)

    environment = "DEV" if "dev" in queue_name else "PDS"
    error_flag = ":red_circle:" if failed > 0 else ""

    message = dedent(
        f"{error_flag}*Sentinel 2 GAP Filler - {environment}*\n"
        f"Sent Messages: {sent}\n"
        f"Failed Messages: {failed}\n"
    )
    if slack_url is not None:
        send_slack_notification(slack_url, "S2 Gap Filler", message)

    log.info(message)

    if failed > 0:
        sys.exit(1)
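
send_messages relies on a split_list_equally helper that is not shown here; the sketch below is one plausible minimal implementation, assumed for illustration only, showing how the scenes could be divided among max_workers and then selected by idx:

from typing import List


def split_list_equally(list_to_split: List[str], num_inter_lists: int) -> List[List[str]]:
    """Hypothetical sketch: distribute items round-robin so chunk sizes differ by at most one."""
    chunks: List[List[str]] = [[] for _ in range(num_inter_lists)]
    for position, item in enumerate(list_to_split):
        chunks[position % num_inter_lists].append(item)
    # Drop empty chunks so a surplus worker (idx beyond the last chunk) simply skips
    return [chunk for chunk in chunks if chunk]


# Example: 7 scenes split across 3 workers
# -> [[s0, s3, s6], [s1, s4], [s2, s5]]; worker idx picks its own chunk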
Code Example #3
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences
    A report containing missing keys will be written to s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is
    :param update_stac: (bool) Define if the report will contain all scenes from the source for an update
    :param notification_url: (str) Optional slack URL in case of you want to send a slack notification
    """

    log = setup_logging()

    log.info("Task started")

    # defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}"
                             for key in source_keys)
        orphaned_keys = set()

    else:

        destination_keys = set(ns.Key for ns in list_inventory(
            manifest=f"{SENTINEL_2_INVENTORY_PATH}",
            prefix=BASE_FOLDER_NAME,
            contains=".json",
            n_threads=200,
        ))

        # Missing keys: present in the source but not in the destination bucket
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys
                             if key not in destination_keys)

        # Orphaned keys: present in the destination bucket but not in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (f"{date_string}_gap_report.json" if not update_stac
                           else URL(f"{date_string}_gap_report_update.json"))

        log.info(
            f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps({
            "orphan": list(orphaned_keys),
            "missing": list(missing_scenes)
        })

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/status-report/{output_filename}"
    message = dedent(f"*SENTINEL 2 GAP REPORT - {environment}*\n"
                     f"Missing Scenes: {len(missing_scenes)}\n"
                     f"Orphan Scenes: {len(orphaned_keys)}\n"
                     f"Report: {report_http_link}\n")

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200
                            or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
Code Example #4
def fill_the_gap(
    landsat: str,
    sync_queue_name: str,
    scenes_limit: Optional[int] = None,
    notification_url: str = None,
) -> None:
    """
    Function to retrieve the latest gap report and create messages to the filter queue process.

    :param landsat:(str) satellite name
    :param sync_queue_name:(str) Queue name
    :param scenes_limit:(int) limit of how many scenes will be filled
    :param notification_url:(str) Slack notification URL
    :return:(None)
    """
    log = setup_logging()

    log.info(f"Satellite: {landsat}")
    log.info(f"Queue: {sync_queue_name}")
    log.info(f"Limit: {scenes_limit if scenes_limit else 'No limit'}")
    log.info(f"Notification URL: {notification_url}")

    environment = "DEV" if "dev" in sync_queue_name else "PDS"

    latest_report = find_latest_report(report_folder_path=S3_BUCKET_PATH,
                                       contains=landsat)

    if not latest_report:
        raise RuntimeError("Report not found!")

    update_stac = False
    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")
        update_stac = True

    log.info(f"Reading missing scenes from the report {latest_report}")

    missing_scene_paths = read_report_missing_scenes(report_path=latest_report,
                                                     limit=scenes_limit)

    log.info(f"Number of scenes found {len(missing_scene_paths)}")
    log.info(f"Example scenes: {missing_scene_paths[0:10]}")

    returned = build_messages(missing_scene_paths=missing_scene_paths,
                              update_stac=update_stac)

    messages_to_send = returned["message_list"]

    log.info("Publishing messages")
    result = post_messages(message_list=messages_to_send,
                           queue_name=sync_queue_name)

    error_flag = (":red_circle:" if result["failed"] > 0
                  or len(returned["failed"]) > 0 else "")

    extra_issues = "\n".join(returned["failed"])
    message = dedent(
        f"{error_flag}*Landsat GAP Filler - {environment}*\n"
        f"Sent Messages: {result['sent']}\n"
        f"Failed Messages: {int(result['failed']) + len(returned['failed'])}\n"
        f"Failed sending: {int(result['failed'])}\n"
        f"Other issues presented: {extra_issues}")

    log.info(message)
    if notification_url is not None and result["sent"] > 0:
        send_slack_notification(notification_url, "Landsat Gap Filler",
                                message)

    if (int(result["failed"]) + len(returned["failed"])) > 0:
        sys.exit(1)
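
The final summary combines failures from message building and from publishing; a toy illustration with made-up values shaped like the dictionaries the function expects:

# Made-up return values shaped like those of build_messages() and post_messages().
returned = {"message_list": ["msg-1", "msg-2", "msg-3"], "failed": ["bad/path"]}
result = {"sent": 2, "failed": 1}

total_failed = int(result["failed"]) + len(returned["failed"])
error_flag = ":red_circle:" if total_failed > 0 else ""

assert total_failed == 2
assert error_flag == ":red_circle:"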
Code Example #5
File: gmw.py  Project: digitalearthafrica/scripts
def gmw_download_stac_cog(year: str,
                          s3_dst: str,
                          slack_url: str = None) -> None:
    """
    Mangrove download, COG and STAC process

    """

    gmw_shp = ""

    try:
        if year not in VALID_YEARS:
            raise ValueError(
                f"Chosen year {year} is not valid, please choose from one of {VALID_YEARS}"
            )

        log.info(f"Starting GMW downloader for year {year}")

        log.info("download extents if needed")
        gmw_shp = f"GMW_001_GlobalMangroveWatch_{year}/01_Data/GMW_{year}_v2.shp"
        local_filename = FILE_NAME.format(year=year)
        if not os.path.exists(gmw_shp):
            gmw_shp = download_and_unzip_gmw(local_filename=local_filename)

        local_extracted_file_path = LOCAL_DIR / gmw_shp

        output_file = LOCAL_DIR / gmw_shp.replace(".shp", ".tif")
        log.info(f"Output TIF file is {output_file}")
        log.info(f"Extracted SHP file is {local_extracted_file_path}")
        log.info("Start gdal_rasterize")
        cmd = ("gdal_rasterize "
               "-a_nodata 0 "
               "-ot Byte "
               "-a pxlval "
               "-of GTiff "
               "-tr 0.0002 0.0002 "
               f"{local_extracted_file_path} {output_file} "
               "-te -26.36 -47.97 64.50 38.35")
        check_output(cmd, stderr=STDOUT, shell=True)

        log.info(f"File {output_file} rasterized successfully")

        # Create cloud optimised GeoTIFF
        cloud_optimised_file = LOCAL_DIR / f"deafrica_gmw_{year}.tif"
        cmd = f"rio cogeo create --overview-resampling nearest {output_file} {cloud_optimised_file}"
        check_output(cmd, stderr=STDOUT, shell=True)

        log.info(f"File {cloud_optimised_file} cloud optimised successfully")

        create_and_upload_stac(cog_file=cloud_optimised_file,
                               s3_dst=s3_dst,
                               year=year)

        # All done!
        log.info(f"Completed work on {s3_dst}/{year}")

    except Exception as e:
        message = f"Failed to handle GMW {gmw_shp} with error {e}"

        if slack_url is not None:
            send_slack_notification(slack_url, "GMW", message)
        log.exception(message)

        exit(1)
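
The external tools above are invoked through check_output with shell=True; below is a hedged sketch of the same gdal_rasterize call expressed as an argument list, which avoids shell quoting issues (a design alternative, not the project's code):

from subprocess import STDOUT, check_output


def rasterize_shapefile(shp_path: str, tif_path: str) -> None:
    # Same gdal_rasterize options as above, passed as a list so the file
    # paths never go through a shell.
    cmd = [
        "gdal_rasterize",
        "-a_nodata", "0",
        "-ot", "Byte",
        "-a", "pxlval",
        "-of", "GTiff",
        "-tr", "0.0002", "0.0002",
        str(shp_path), str(tif_path),
        "-te", "-26.36", "-47.97", "64.50", "38.35",
    ]
    check_output(cmd, stderr=STDOUT)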
Code Example #6
def generate_buckets_diff(
    bucket_name: str,
    satellites: str,
    file_name: str,
    update_stac: bool = False,
    notification_url: str = None,
):
    """
    Compare USGS bulk files and Africa inventory bucket detecting differences
    A report containing missing keys will be written to AFRICA_S3_BUCKET_PATH
    """

    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/")
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Create connection to the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)

    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")
    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download bulk file
    log.info("Download Bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL),
                                     file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)

    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 First {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []

    else:
        # collect missing scenes
        # missing scenes = keys that are in the bulk file but missing in PDS sync bucket and/or in source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # collect orphan scenes
        # orphan scenes = keys that are in PDS sync bucket but missing in the bulk file and/or in source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

        log.info(f"Found {len(missing_scenes)} missing scenes")
        log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
        log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
        log.info(
            f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        output_filename = (
            f"{title}_{date_string}_gap_report.json" if not update_stac
            else f"{date_string}_gap_report_update.json"
        ).replace(" ", "_").replace("_&", "")

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps({
            "orphan": orphaned_scenes,
            "missing": missing_scenes
        })

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (str(landsat_status_report_url /
                         output_filename) if len(missing_scenes) > 0
                     or len(orphaned_scenes) > 0 else output_filename)

    message = dedent(f"*{title} GAP REPORT - {environment}*\n "
                     f"Missing Scenes: {len(missing_scenes)}\n"
                     f"Orphan Scenes: {len(orphaned_scenes)}\n"
                     f"Report: {report_output}\n")

    log.info(message)

    log.info(
        f"File {file_name} processed and sent in {time_process(start=start_timer)}"
    )

    if not update_stac and (len(missing_scenes) > 200
                            or len(orphaned_scenes) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url,
                                    f"{satellites} Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
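
The report name in this variant folds the satellite codes into the title and then strips characters that are awkward in object keys; a short illustration with made-up satellite codes (the join also shows that satellites is treated as a sequence of codes):

# Toy illustration of the title and report-name normalisation above.
satellites = ("ls8", "ls7")  # made-up satellite codes
title = " & ".join(satellites).replace("ls", "Landsat ")
assert title == "Landsat 8 & Landsat 7"

output_filename = (
    f"{title}_2021-06-01_gap_report.json"
    .replace(" ", "_")
    .replace("_&", "")
)
assert output_filename == "Landsat_8_Landsat_7_2021-06-01_gap_report.json"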
Code Example #7
def download_and_cog_chirps(
    year: str,
    month: str,
    s3_dst: str,
    day: str = None,
    overwrite: bool = False,
    slack_url: str = None,
):
    """
    Download a CHIRPS v2.0 rainfall GeoTIFF (daily when a day is given, monthly
    otherwise), convert it to a COG and write the COG plus a STAC item to s3_dst,
    optionally notifying Slack on failure.
    """
    # Cleaning and sanity checks
    s3_dst = s3_dst.rstrip("/")

    # Set up file strings
    if day is not None:
        # Set up a daily process
        in_file = f"chirps-v2.0.{year}.{month}.{day}.tif.gz"
        in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.{day}.tif"
            in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/{year}/{month}/chirps-v2.0_{year}.{month}.{day}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        start_datetime = f"{year}-{month}-{day}T00:00:00Z"
        end_datetime = f"{year}-{month}-{day}T23:59:59Z"
        product_name = "rainfall_chirps_daily"
    else:
        # Set up a monthly process
        in_file = f"chirps-v2.0.{year}.{month}.tif.gz"
        in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.tif"
            in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/chirps-v2.0_{year}.{month}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        _, end = calendar.monthrange(int(year), int(month))
        start_datetime = f"{year}-{month}-01T00:00:00Z"
        end_datetime = f"{year}-{month}-{end}T23:59:59Z"
        product_name = "rainfall_chirps_monthly"

        # Set to 15 for the STAC metadata
        day = 15

    try:
        # Check if file already exists
        log.info(f"Working on {in_file}")
        if not overwrite and s3_head_object(out_stac) is not None:
            log.warning(f"File {out_stac} already exists. Skipping.")
            return

        # COG and STAC
        with MemoryFile() as mem_dst:
            # Creating the COG, with a memory cache and no download. Shiny.
            cog_translate(
                in_data,
                mem_dst.name,
                cog_profiles.get("deflate"),
                in_memory=True,
                nodata=-9999,
            )
            # Creating the STAC document with appropriate date range
            _, end = calendar.monthrange(int(year), int(month))
            item = create_stac_item(
                mem_dst,
                id=str(odc_uuid("chirps", "2.0", [in_file])),
                with_proj=True,
                input_datetime=datetime(int(year), int(month), int(day)),
                properties={
                    "odc:processing_datetime": datetime_to_str(datetime.now()),
                    "odc:product": product_name,
                    "start_datetime": start_datetime,
                    "end_datetime": end_datetime,
                },
            )
            item.set_self_href(out_stac)
            # Manually redo the asset
            del item.assets["asset"]
            item.assets["rainfall"] = pystac.Asset(
                href=out_data,
                title="CHIRPS-v2.0",
                media_type=pystac.MediaType.COG,
                roles=["data"],
            )
            # Let's add a link to the source
            item.add_links([
                pystac.Link(
                    target=in_href,
                    title="Source file",
                    rel=pystac.RelType.DERIVED_FROM,
                    media_type="application/gzip",
                )
            ])

            # Dump the data to S3
            mem_dst.seek(0)
            log.info(f"Writing DATA to: {out_data}")
            s3_dump(mem_dst, out_data, ACL="bucket-owner-full-control")
            # Write STAC to S3
            log.info(f"Writing STAC to: {out_stac}")
            s3_dump(
                json.dumps(item.to_dict(), indent=2),
                out_stac,
                ContentType="application/json",
                ACL="bucket-owner-full-control",
            )
            # All done!
            log.info(f"Completed work on {in_file}")

    except Exception as e:
        message = f"Failed to handle {in_file} with error {e}"

        if slack_url is not None:
            send_slack_notification(slack_url, "Chirps Rainfall Monthly",
                                    message)
        log.exception(message)

        exit(1)
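
A usage sketch for the CHIRPS task, assuming a daily run; the destination prefix and environment variable name below are placeholders, not values from the original:

import os

if __name__ == "__main__":
    # Illustrative invocation only: the destination prefix and environment
    # variable name are placeholders; omit `day` to run the monthly variant.
    download_and_cog_chirps(
        year="2021",
        month="06",
        day="15",
        s3_dst="s3://example-bucket/rainfall_chirps_daily",  # placeholder
        overwrite=False,
        slack_url=os.environ.get("SLACK_WEBHOOK_URL"),  # assumed variable name
    )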