Example #1
def cli(slack_url, version: bool = False):
    """
    Check all dead queues which the user is allowed to
    """

    if version:
        click.echo(__version__)
        return  # print the version and exit without running the check

    log = setup_logging()
    check_deadletter_queues(slack_url=slack_url, log=log)
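For context, a command like this is typically registered with Click decorators. The sketch below is a guess at that wiring; the command and option names are assumptions inferred from the function signature, not taken from the source.

# Hypothetical Click wiring (names assumed from the function signature)
import click

@click.command("check-dead-queues")
@click.option("--slack-url", default=None, help="Slack webhook URL for notifications")
@click.option("--version", is_flag=True, default=False, help="Print the version and exit")
def check_dead_queues(slack_url, version):
    ...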
Example #2
def cli(tile_string, workdir, s3_bucket, update_metadata, s3_path):
    """
    Example command:

    download-alos-palsar --tile-string 2020/N10E010 -w /tmp/download -s example-bucket -p alos_palsar_mosaic
    """
    log = setup_logging()

    # str.lstrip("s3://") strips a *character set* and can eat the start of
    # the bucket name; removeprefix strips the literal scheme instead.
    s3_destination = (
        s3_bucket.rstrip("/").removeprefix("s3://") + "/" + s3_path.rstrip("/")
    )

    run_one(tile_string, Path(workdir), s3_destination, update_metadata, log)
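The docstring's example command implies a set of Click options. Below is a hedged sketch of that wiring; the short flags -w/-s/-p are assumptions read off the example invocation.

# Hypothetical Click options matching the documented command
import click

@click.command("download-alos-palsar")
@click.option("--tile-string", required=True, help="Tile to process, e.g. 2020/N10E010")
@click.option("--workdir", "-w", default="/tmp/download", help="Local working directory")
@click.option("--s3-bucket", "-s", required=True, help="Destination S3 bucket")
@click.option("--s3-path", "-p", required=True, help="Key prefix inside the bucket")
@click.option("--update-metadata", is_flag=True, default=False)
def download_alos_palsar(tile_string, workdir, s3_bucket, s3_path, update_metadata):
    ...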
Example #3
def send_messages(
    idx: int,
    queue_name: str,
    max_workers: int = 2,
    limit: Optional[int] = None,
    slack_url: Optional[str] = None,
) -> None:
    """
    Publish a list of missing scenes to a specific queue and, once done,
    optionally notify Slack of the result.

    :param limit: (int) optional limit of messages to be read from the report
    :param max_workers: (int) total number of pods used for the task; used to split
        the scenes equally among the pods
    :param idx: (int) sequential index defining the range of scenes this pod works on
    :param queue_name: (str) name of the queue to send the messages to
    :param slack_url: (str) optional Slack webhook URL for sending a notification
    """
    log = setup_logging()

    latest_report = find_latest_report(
        report_folder_path=S3_BUCKET_PATH, not_contains="orphaned"
    )

    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")

    log.info(f"Limited: {int(limit) if limit else 'No limit'}")
    log.info(f"Number of workers: {max_workers}")

    files = read_report_missing_scenes(report_path=latest_report, limit=limit)

    log.info(f"Number of scenes found {len(files)}")
    log.info(f"Example scenes: {files[0:10]}")

    # Split scenes equally among the workers
    split_list_scenes = split_list_equally(
        list_to_split=files, num_inter_lists=int(max_workers)
    )

    # If the index is beyond the end of the split list, this extra pod isn't needed
    if len(split_list_scenes) <= idx:
        log.warning(f"Worker {idx} Skipped!")
        sys.exit(0)

    log.info(f"Executing worker {idx}")
    messages = prepare_message(scene_paths=split_list_scenes[idx], log=log)

    queue = get_queue(queue_name=queue_name)

    batch = []
    failed = 0
    sent = 0
    error_list = []
    for message in messages:
        batch.append(message)
        if len(batch) == 10:  # SQS batch requests accept at most 10 messages
            try:
                publish_messages(queue=queue, messages=batch)
                sent += len(batch)
            except Exception as exc:
                # Count the whole failed batch, not just one message
                failed += len(batch)
                error_list.append(exc)
            batch = []

    if len(batch) > 0:
        try:
            publish_messages(queue=queue, messages=batch)
            sent += len(batch)
        except Exception as exc:
            failed += len(batch)
            error_list.append(exc)

    environment = "DEV" if "dev" in queue_name else "PDS"
    error_flag = ":red_circle:" if failed > 0 else ""

    message = dedent(
        f"{error_flag}*Sentinel 2 GAP Filler - {environment}*\n"
        f"Sent Messages: {sent}\n"
        f"Failed Messages: {failed}\n"
    )
    if slack_url is not None:
        send_slack_notification(slack_url, "S2 Gap Filler", message)

    log.info(message)

    if failed > 0:
        sys.exit(1)
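The split_list_equally helper is not shown in the source. One possible implementation, assuming a round-robin deal, which keeps the sub-list sizes within one element of each other:

# Assumed implementation of split_list_equally (the real helper is not shown)
from typing import List

def split_list_equally(list_to_split: List[str], num_inter_lists: int) -> List[List[str]]:
    sublists: List[List[str]] = [[] for _ in range(num_inter_lists)]
    for i, item in enumerate(list_to_split):
        sublists[i % num_inter_lists].append(item)
    return sublists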
Example #4
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str, ...],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # Don't persist the data here; doing so runs the workers out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = exists and not overwrite
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")

        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = exists and not overwrite

            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")
                # Aggressively heavy handed, but we get memory leaks otherwise
                client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    client = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=client,
    )
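The _get_path helper is also not shown. A plausible sketch of its shape, assuming it composes the S3 key from the product, the period string, an optional band, and the extension:

# Assumed shape of _get_path (the real helper is not shown)
from typing import Optional

def _get_path(
    s3_output_root: str,
    out_product: str,
    time_str: str,
    extension: str,
    band: Optional[str] = None,
) -> str:
    name = f"{out_product}_{time_str}" + (f"_{band}" if band else "")
    return f"{s3_output_root.rstrip('/')}/{out_product}/{name}.{extension}"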
Example #5
def download_gls(year: str, s3_dst: str, workdir: Path, overwrite: bool = False):
    log = setup_logging()
    assets = {}
    out_stac = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    # Download the files
    for name, file in FILES.items():
        # Create a temporary directory to work with (the prefix must be a string)
        with TemporaryDirectory(prefix=f"{workdir}/") as tmpdir:
            log.info(f"Working on {file}")
            url = URL(
                BASE_URL.format(
                    record_id=YEARS[year][1], year_key=YEARS[year][0], file=file
                )
            )

            dest_url = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}_{name}.tif"

            if s3_head_object(str(dest_url)) is None or overwrite:
                log.info(f"Downloading {url}")

                try:
                    local_file = Path(tmpdir) / str(url.name)
                    # Download the file
                    download_file(url, local_file)

                    log.info(f"Downloaded file to {local_file}")
                    local_file_small = translate_file_deafrica_extent(local_file)
                    log.info(f"Clipped Africa out and saved to {local_file_small}")
                    resampling = "nearest" if name in DO_NEAREST else "bilinear"

                    # Create a COG in memory and upload to S3
                    with MemoryFile() as mem_dst:
                        # Creating the COG, with a memory cache and no download. Shiny.
                        cog_translate(
                            local_file_small,
                            mem_dst.name,
                            cog_profiles.get("deflate"),
                            in_memory=True,
                            nodata=255,
                            overview_resampling=resampling,
                        )
                        mem_dst.seek(0)
                        s3_dump(mem_dst, str(dest_url), ACL="bucket-owner-full-control")
                        log.info(f"File written to {dest_url}")
                except Exception:
                    log.exception(f"Failed to process {url}")
                    exit(1)
            else:
                log.info(f"{dest_url} exists, skipping")

            assets[name] = pystac.Asset(
                href=str(dest_url), roles=["data"], media_type=pystac.MediaType.COG
            )

    # Write STAC document from the last-written file
    source_doc = f"https://zenodo.org/record/{YEARS[year][1]}"
    item = create_stac_item(
        str(dest_url),
        id=str(odc_uuid("Copernicus Global Land Cover", "3.0.1", [source_doc])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
Example #6
import boto3
import click
from datetime import datetime
from datetime import timedelta
from deafrica.utils import setup_logging
from kubernetes import config, client

# Set log level to info
log = setup_logging()


def delete_volumes(cluster_name, dryrun):
    """
    Cleanup sandbox unused Volumes, k8s PVs & PVCs by
    looking into CloudTrail "AttachVolume" events over the past 90 days
    """
    # configure kubernetes API client
    try:
        config.load_incluster_config()
    except config.ConfigException:
        try:
            config.load_kube_config()
        except config.ConfigException:
            log.exception("Could not configure kubernetes python client")
    k8s_api = client.CoreV1Api()
    k8s_namespace = "sandbox"

    # configure boto3 client
    ec2_resource = boto3.resource("ec2")
    ct_client = boto3.client("cloudtrail")
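    # The listing is truncated here. A hedged sketch of how the cleanup
    # could continue, assuming CloudTrail's lookup_events API is used to
    # find volumes attached within the past 90 days and everything still
    # "available" besides those is deleted; the exact logic is an assumption.
    recently_attached = set()
    paginator = ct_client.get_paginator("lookup_events")
    for page in paginator.paginate(
        LookupAttributes=[
            {"AttributeKey": "EventName", "AttributeValue": "AttachVolume"}
        ],
        StartTime=datetime.now() - timedelta(days=90),
        EndTime=datetime.now(),
    ):
        for event in page["Events"]:
            for resource in event.get("Resources", []):
                if resource.get("ResourceType") == "AWS::EC2::Volume":
                    recently_attached.add(resource["ResourceName"])

    for volume in ec2_resource.volumes.filter(
        Filters=[{"Name": "status", "Values": ["available"]}]
    ):
        if volume.id not in recently_attached:
            log.info(f"Deleting unused volume {volume.id} (dryrun={dryrun})")
            if not dryrun:
                volume.delete()
    # Cleanup of the matching k8s PVs/PVCs via k8s_api would follow the same pattern.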
Example #7
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: Optional[str] = None,
) -> None:
    """
    Compare the Sentinel-2 buckets in US and Africa and detect differences.
    A report containing the missing keys is written to s3://deafrica-sentinel-2/status-report.

    :param bucket_name: (str) bucket where the gap report is stored
    :param update_stac: (bool) if True, the report contains all source scenes, for a forced update
    :param notification_url: (str) optional Slack webhook URL for sending a notification
    """

    log = setup_logging()

    log.info("Task started")

    # defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}"
                             for key in source_keys)
        orphaned_keys = set()

    else:

        destination_keys = set(ns.Key for ns in list_inventory(
            manifest=f"{SENTINEL_2_INVENTORY_PATH}",
            prefix=BASE_FOLDER_NAME,
            contains=".json",
            n_threads=200,
        ))

        # Missing keys: present in the source but not in the destination bucket
        missing_scenes = set(
            f"s3://sentinel-cogs/{key}"
            for key in source_keys
            if key not in destination_keys
        )

        # Orphaned keys: present in the destination bucket but not in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (f"{date_string}_gap_report.json" if not update_stac
                           else URL(f"{date_string}_gap_report_update.json"))

        log.info(
            f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps({
            "orphan": list(orphaned_keys),
            "missing": list(missing_scenes)
        })

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(s2_status_report_path / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/status-report/{output_filename}"
    message = dedent(f"*SENTINEL 2 GAP REPORT - {environment}*\n"
                     f"Missing Scenes: {len(missing_scenes)}\n"
                     f"Orphan Scenes: {len(orphaned_keys)}\n"
                     f"Report: {report_http_link}\n")

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200
                            or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
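The send_slack_notification helper is shared by several of these commands but never shown. A minimal sketch, assuming a standard Slack incoming-webhook POST:

# Assumed implementation of send_slack_notification
import requests

def send_slack_notification(url: str, title: str, message: str) -> None:
    response = requests.post(url, json={"text": f"*{title}*\n{message}"})
    response.raise_for_status()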
Example #8
def download_cci_lc(year: str,
                    s3_dst: str,
                    workdir: str,
                    overwrite: bool = False):
    log = setup_logging()
    assets = {}

    cci_lc_version = get_version_from_year(year)
    name = f"{PRODUCT_NAME}_{year}_{cci_lc_version}"

    out_cog = URL(s3_dst) / year / f"{name}.tif"
    out_stac = URL(s3_dst) / year / f"{name}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    workdir = Path(workdir)
    if not workdir.exists():
        workdir.mkdir(parents=True, exist_ok=True)

    # Create a temporary directory to work with
    tmpdir = mkdtemp(prefix=f"{workdir}/")
    log.info(f"Working on {year} in the path {tmpdir}")

    if s3_head_object(str(out_cog)) is None or overwrite:
        log.info(f"Downloading {year}")
        try:
            local_file = Path(tmpdir) / f"{name}.zip"
            if not local_file.exists():
                # Download the file
                c = cdsapi.Client()

                # We could also retrieve the object metadata from the CDS.
                # e.g. f = c.retrieve("series",{params}) | f.location = URL to download
                c.retrieve(
                    "satellite-land-cover",
                    {
                        "format": "zip",
                        "variable": "all",
                        "version": cci_lc_version,
                        "year": str(year),
                    },
                    local_file,
                )

                log.info(f"Downloaded file to {local_file}")
            else:
                log.info(
                    f"File {local_file} exists, continuing without downloading"
                )

            # Unzip the file
            log.info(f"Unzipping {local_file}")
            unzipped = None
            with zipfile.ZipFile(local_file, "r") as zip_ref:
                unzipped = local_file.parent / zip_ref.namelist()[0]
                zip_ref.extractall(tmpdir)

            # Process data
            ds = xr.open_dataset(unzipped)
            # Subset to Africa
            ulx, uly, lrx, lry = AFRICA_BBOX
            # Note: lats are upside down!
            ds_small = ds.sel(lat=slice(uly, lry), lon=slice(ulx, lrx))
            ds_small = assign_crs(ds_small, crs="epsg:4326")

            # Create cog (in memory - :mem: returns bytes object)
            mem_dst = write_cog(
                ds_small.lccs_class,
                ":mem:",
                nodata=0,
                overview_resampling="nearest",
            )

            # Write to s3
            s3_dump(mem_dst, str(out_cog), ACL="bucket-owner-full-control")
            log.info(f"File written to {out_cog}")

        except Exception:
            log.exception(f"Failed to process {name}")
            exit(1)
    else:
        log.info(f"{out_cog} exists, skipping")

    assets["classification"] = pystac.Asset(href=str(out_cog),
                                            roles=["data"],
                                            media_type=pystac.MediaType.COG)

    # Write STAC document
    source_doc = (
        "https://cds.climate.copernicus.eu/cdsapp#!/dataset/satellite-land-cover"
    )
    item = create_stac_item(
        str(out_cog),
        id=str(
            odc_uuid("Copernicus Land Cover", cci_lc_version,
                     [source_doc, name])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links([
        pystac.Link(
            target=source_doc,
            title="Source",
            rel=pystac.RelType.DERIVED_FROM,
            media_type="text/html",
        )
    ])
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
Example #9
def fill_the_gap(
    landsat: str,
    sync_queue_name: str,
    scenes_limit: Optional[int] = None,
    notification_url: Optional[str] = None,
) -> None:
    """
    Retrieve the latest gap report and create messages for the sync queue process.

    :param landsat: (str) satellite name
    :param sync_queue_name: (str) queue name
    :param scenes_limit: (int) limit of how many scenes will be filled
    :param notification_url: (str) Slack notification URL
    :return: (None)
    """
    log = setup_logging()

    log.info(f"Satellite: {landsat}")
    log.info(f"Queue: {sync_queue_name}")
    log.info(f"Limit: {scenes_limit if scenes_limit else 'No limit'}")
    log.info(f"Notification URL: {notification_url}")

    environment = "DEV" if "dev" in sync_queue_name else "PDS"

    latest_report = find_latest_report(report_folder_path=S3_BUCKET_PATH,
                                       contains=landsat)

    if not latest_report:
        raise RuntimeError("Report not found!")

    update_stac = False
    if "update" in latest_report:
        log.info("FORCED UPDATE FLAGGED!")
        update_stac = True

    log.info(f"Reading missing scenes from the report {latest_report}")

    missing_scene_paths = read_report_missing_scenes(report_path=latest_report,
                                                     limit=scenes_limit)

    log.info(f"Number of scenes found {len(missing_scene_paths)}")
    log.info(f"Example scenes: {missing_scene_paths[0:10]}")

    returned = build_messages(missing_scene_paths=missing_scene_paths,
                              update_stac=update_stac)

    messages_to_send = returned["message_list"]

    log.info("Publishing messages")
    result = post_messages(message_list=messages_to_send,
                           queue_name=sync_queue_name)

    error_flag = (":red_circle:" if result["failed"] > 0
                  or len(returned["failed"]) > 0 else "")

    extra_issues = "\n".join(returned["failed"])
    message = dedent(
        f"{error_flag}*Landsat GAP Filler - {environment}*\n"
        f"Sent Messages: {result['sent']}\n"
        f"Failed Messages: {int(result['failed']) + len(returned['failed'])}\n"
        f"Failed sending: {int(result['failed'])}\n"
        f"Other issues presented: {extra_issues}")

    log.info(message)
    if notification_url is not None and result["sent"] > 0:
        send_slack_notification(notification_url, "Landsat Gap Filler",
                                message)

    if (int(result["failed"]) + len(returned["failed"])) > 0:
        sys.exit(1)
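The find_latest_report helper used here (and in the gap-filler above) is not shown. A plausible sketch, assuming report names start with a date so the lexicographically largest key is the most recent; list_report_keys is a hypothetical lister:

# Assumed implementation of find_latest_report; list_report_keys is hypothetical
from typing import Optional

def find_latest_report(
    report_folder_path: str,
    contains: Optional[str] = None,
    not_contains: Optional[str] = None,
) -> str:
    keys = sorted(list_report_keys(report_folder_path))  # hypothetical helper
    if contains is not None:
        keys = [key for key in keys if contains in key]
    if not_contains is not None:
        keys = [key for key in keys if not_contains not in key]
    return keys[-1] if keys else ""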
Example #10
def generate_buckets_diff(
    bucket_name: str,
    satellites: list,
    file_name: str,
    update_stac: bool = False,
    notification_url: Optional[str] = None,
):
    """
    Compare USGS bulk files and Africa inventory bucket detecting differences
    A report containing missing keys will be written to AFRICA_S3_BUCKET_PATH
    """

    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/")
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Create connection to the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)

    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")
    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download bulk file
    log.info("Download Bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL),
                                     file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)

    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 First {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []

    else:
        # Collect missing scenes:
        # keys in the bulk file but missing from the PDS sync bucket and/or the source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # Collect orphaned scenes:
        # keys in the PDS sync bucket but missing from the bulk file and/or the source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

        log.info(f"Found {len(missing_scenes)} missing scenes")
        log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
        log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
        log.info(
            f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        # Both branches must be plain strings: the original wrapped the update
        # filename in URL(), which has no .replace method
        output_filename = (
            f"{title}_{date_string}_gap_report.json"
            if not update_stac
            else f"{date_string}_gap_report_update.json"
        ).replace(" ", "_").replace("_&", "")

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps({
            "orphan": orphaned_scenes,
            "missing": missing_scenes
        })

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (str(landsat_status_report_url /
                         output_filename) if len(missing_scenes) > 0
                     or len(orphaned_scenes) > 0 else output_filename)

    message = dedent(f"*{title} GAP REPORT - {environment}*\n "
                     f"Missing Scenes: {len(missing_scenes)}\n"
                     f"Orphan Scenes: {len(orphaned_scenes)}\n"
                     f"Report: {report_output}\n")

    log.info(message)

    log.info(
        f"File {file_name} processed and sent in {time_process(start=start_timer)}"
    )

    if not update_stac and (len(missing_scenes) > 200
                            or len(orphaned_scenes) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url,
                                    f"{satellites} Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
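The time_process helper is not shown. A minimal sketch, assuming it formats the elapsed wall-clock time since start:

# Assumed implementation of time_process
import time
from datetime import timedelta

def time_process(start: float) -> str:
    return str(timedelta(seconds=round(time.time() - start)))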