def get_orphans():
    s3 = s3_client(region_name=DEAFRICA_AWS_REGION)

    print("Finding Orphans")
    report_files = list(s3_ls_dir(uri=DEAFRICA_GAP_REPORT_S3_PATH, s3=s3))
    report_files_json = [
        report_file for report_file in report_files
        if report_file.endswith(".json")
    ]

    # fetch the latest report: Landsat 5, Landsat 7 and Landsat 8
    report_files_json.sort()
    landsat_8_report = [
        report_file for report_file in report_files_json
        if "landsat_8" in report_file
    ][-1]
    landsat_7_report = [
        report_file for report_file in report_files_json
        if "landsat_7" in report_file
    ][-1]
    landsat_5_report = [
        report_file for report_file in report_files_json
        if "landsat_5" in report_file
    ][-1]

    # collect orphan paths
    list_orphan_paths = []
    for report in [landsat_5_report, landsat_7_report, landsat_8_report]:
        file = s3_fetch(url=report, s3=s3)
        dict_file = json.loads(file)
        orphans = set(dict_file.get("orphan", []))
        print(f"collected orphan scenes from {report}: {len(orphans)}")
        list_orphan_paths.extend(orphans)

    return list_orphan_paths
def prepare_message(scene_paths: list, log: Optional[logging.Logger] = None):
    """
    Prepare a single message for each STAC file
    """

    s3 = s3_client(region_name=SOURCE_REGION)

    message_id = 0
    for s3_path in scene_paths:
        try:
            contents = s3_fetch(url=s3_path, s3=s3)
            contents_dict = json.loads(contents)

            attributes = get_common_message_attributes(contents_dict)

            message = {
                "Id": str(message_id),
                "MessageBody": json.dumps(
                    {
                        "Message": json.dumps(contents_dict),
                        "MessageAttributes": attributes,
                    }
                ),
            }
            message_id += 1
            yield message
        except Exception as exc:
            if log:
                log.error(f"{s3_path} does not exist - {exc}")
Example #3
def get_and_filter_cogs_keys():
    """
    Retrieve the key list from an inventory bucket and filter it
    :return: (set) filtered keys
    """

    s3 = s3_client(region_name=SOURCE_REGION)
    source_keys = list_inventory(
        manifest=f"{SOURCE_INVENTORY_PATH}",
        s3=s3,
        prefix=BASE_FOLDER_NAME,
        contains=".json",
        n_threads=200,
    )

    africa_tile_ids = set(
        pd.read_csv(
            "https://raw.githubusercontent.com/digitalearthafrica/deafrica-extent/master/deafrica-mgrs-tiles.csv.gz",
            header=None,
        ).values.ravel())

    return set(
        key.Key for key in source_keys
        if (key.Key.split("/")[-2].split("_")[1] in africa_tile_ids
            # We need to ensure we're ignoring the old format data
            and re.match(r"sentinel-s2-l2a-cogs/\d{4}/", key.Key) is None))
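The comprehension above pulls the MGRS tile id out of the second-to-last path segment and drops keys in the old year-first layout. A quick illustration, using a hypothetical key that follows the sentinel-cogs naming the filter assumes:

import re

sample_key = (
    "sentinel-s2-l2a-cogs/36/M/TD/2021/6/"
    "S2A_36MTD_20210612_0_L2A/S2A_36MTD_20210612_0_L2A.json"
)
tile_id = sample_key.split("/")[-2].split("_")[1]  # "36MTD", checked against africa_tile_ids
old_format = re.match(r"sentinel-s2-l2a-cogs/\d{4}/", sample_key) is not None  # False, so the key is kept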
Example #4
def cli(file_list, no_sign_request=None):
    global s3
    s3 = s3_client(aws_unsigned=no_sign_request)

    urls = [line.rstrip() for line in file_list.readlines()]
    for url in tqdm(urls):
        if not url:
            continue
        tqdm.write(f"Updating {url}", end='')
        replace_in_s3_obj(url)
def check_scene_exist_in_source(path: str):
    """
    Check whether the scene exists in the USGS source bucket
    """
    s3 = s3_client(region_name=USGS_AWS_REGION)
    usgs_path = path.replace(f"s3://{DEAFRICA_LANDSAT_BUCKET_NAME}",
                             f"s3://{USGS_S3_BUCKET_NAME}")

    returned = set(s3_ls(usgs_path, s3=s3, **{"RequestPayer": "requester"}))
    if returned:
        return True

    return False
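A usage sketch, assuming the orphan candidates come from get_orphans above: keep only scenes that are genuinely absent from the requester-pays USGS bucket (hence the RequestPayer option passed to s3_ls).

confirmed_orphans = [
    path for path in get_orphans()
    if not check_scene_exist_in_source(path)
]
print(f"{len(confirmed_orphans)} orphan scenes are not present in the USGS source bucket")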
Example #6
def cli(
    inventory,
    prefix,
    regex,
    glob,
    aws_profile,
    no_sign_request=None,
    request_payer=False,
):
    """List S3 inventory entries.

        prefix can be combined with regex or glob pattern, but supplying both
        regex and glob doesn't make sense.

    \b
    Example:
       s3-inventory s3://my-inventory-bucket/path-to-inventory/ '*yaml'

    """

    def entry_to_url(entry):
        return "s3://{e.Bucket}/{e.Key}".format(e=entry)

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    flush_freq = 100
    s3 = s3_client(profile=aws_profile, aws_unsigned=no_sign_request)

    if glob == "":
        glob = None

    if glob is not None and regex is not None:
        click.echo("Can not mix regex and shell patterns")
        sys.exit(1)

    if inventory is None:
        # TODO: read from config file
        inventory = "s3://dea-public-data-inventory/dea-public-data/dea-public-data-csv-inventory/"

    predicate = build_predicate(glob=glob, regex=regex, prefix=prefix)

    to_str = entry_to_url

    for i, entry in enumerate(list_inventory(inventory, s3=s3, **opts)):
        if predicate(entry):
            print(to_str(entry), flush=(i % flush_freq) == 0)
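build_predicate is not shown in this example; a minimal sketch of what it could look like, assuming the prefix is applied first and glob/regex are mutually exclusive (as the CLI enforces):

import re
from fnmatch import fnmatch

def build_predicate(glob=None, regex=None, prefix=""):
    # Hypothetical reimplementation for illustration only.
    compiled = re.compile(regex) if regex else None

    def predicate(entry):
        if prefix and not entry.Key.startswith(prefix):
            return False
        if glob is not None:
            return fnmatch(entry.Key, glob)
        if compiled is not None:
            return compiled.search(entry.Key) is not None
        return True

    return predicate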
def publish_to_s3(data: list,
                  output_filename: str,
                  content_type: str = "text/plain"):
    """
    Write the report to S3
    """
    s3 = s3_client(region_name=DEAFRICA_AWS_REGION)
    s3_dump(
        data=data,
        url=str(DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename),
        s3=s3,
        ContentType=content_type,
    )
    print(
        f"Report can be accessed from {DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename}"
    )
Example #8
def main(s3_urls, workers):
    """
    Script to sync Sentinel-2 data from NCI to AWS S3 bucket

    Pass in a file containing the destination S3 URLs that need to be uploaded.

    """
    setup_logging()

    global S3
    S3 = s3_client()
    urls_to_upload = [url.strip() for url in s3_urls.readlines()]

    _LOG.info(f"{len(urls_to_upload)} datasets to upload.")
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(upload_dataset, s3_url) for s3_url in urls_to_upload]

        for future in tqdm(as_completed(futures), total=len(urls_to_upload), unit='datasets', disable=None):
            _LOG.info(f"Completed uploaded: {future.result()}")
Example #9
def read_report_missing_scenes(report_path: str, limit=None):
    """
    read the gap report
    """

    s3 = s3_client(region_name="af-south-1")
    report_json = s3_fetch(url=report_path, s3=s3)
    report_dict = json.loads(report_json)

    if report_dict.get("missing", None) is None:
        raise Exception("Missing scenes not found")

    missing_scene_paths = [
        scene_path.strip() for scene_path in report_dict["missing"] if scene_path
    ]

    if limit:
        missing_scene_paths = missing_scene_paths[: int(limit)]

    return missing_scene_paths
Example #10
def find_latest_report(
    report_folder_path: str, contains: str = None, not_contains: str = None
) -> str:
    """
    Function to find the latest gap report
    :param report_folder_path: (str) S3 folder containing the gap reports
    :param contains: (str) keep only report names containing this substring
    :param not_contains: (str) drop report names containing this substring
    :return: (str) the latest report file name
    """

    s3 = s3_client(region_name="af-south-1")

    report_files = list(s3_ls_dir(uri=report_folder_path, s3=s3))

    if contains is not None:
        report_files = [report for report in report_files if contains in report]

    if not_contains is not None:
        report_files = [report for report in report_files if not_contains not in report]

    report_files.sort()

    if not report_files:
        raise RuntimeError("Report not found!")

    return report_files[-1]
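A usage sketch chaining this helper with read_report_missing_scenes above; the report folder is a hypothetical path:

latest_report = find_latest_report(
    report_folder_path="s3://example-landsat-bucket/status-report/",
    not_contains="update",
)
missing_scene_paths = read_report_missing_scenes(latest_report, limit=10)
print(f"{latest_report}: {len(missing_scene_paths)} missing scenes")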
from odc.aws.inventory import find_latest_manifest, list_inventory
from odc.aws import s3_head_object, s3_client

INVENTORY_BUCKET = "deafrica-sentinel-2-inventory"
PREFIX = "deafrica-sentinel-2/deafrica-sentinel-2-inventory/"

DO_FIX = False

if DO_FIX:
    client = s3_client(region_name="af-south-1")
else:
    client = s3_client(aws_unsigned=True, region_name="af-south-1")

manifest = find_latest_manifest(
    f"s3://{INVENTORY_BUCKET}/{PREFIX}",
    client,
)

inventory = list_inventory(manifest, s3=client)

report_every = 10000
count = 0

json_docs = 0
to_fix = 0

for obj in inventory:
    count += 1
    if count % report_every == 0:
        print(f"Processing {count}")
    if obj.Key.endswith(".json"):
        # The original snippet is truncated here; counting JSON documents is an
        # assumed minimal body matching the json_docs counter defined above.
        json_docs += 1
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = not (not exists or overwrite)
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")

        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = not (not exists or overwrite)

            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")
                # Aggressively heavy handed, but we get memory leaks otherwise
                client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    client = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=client,
    )
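An invocation sketch; product names, bands and the output root are hypothetical, and the function expects a single timestep within the requested period:

dc = Datacube(app="mosaic")
create_mosaic(
    dc=dc,
    product="gm_s2_annual",
    out_product="gm_s2_annual_lowres",
    time=("2020-01-01", "2020-12-31"),
    time_str="2020",
    bands=("red", "green", "blue"),
    s3_output_root="s3://example-bucket/mosaics",
    split_bands=True,
    resolution=120,
    overwrite=False,
)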
Example #13
def list_inventory(
    manifest,
    s3=None,
    prefix: str = "",
    suffix: str = "",
    contains: str = "",
    multiple_contains: tuple[str, str] = None,
    n_threads: int = None,
    **kw,
):
    """
    Returns a generator of inventory records.

    :param manifest: (str) s3:// URL of a manifest.json, or a folder, in which case the latest manifest is chosen
    :param s3: (AWS client) S3 client; a default one is created if not supplied
    :param prefix: (str) keep only keys starting with this prefix
    :param suffix: (str) keep only keys ending with this suffix
    :param contains: (str) keep only keys containing this substring
    :param multiple_contains: (tuple(str, str)) extra substring filters forwarded to test_key
    :param n_threads: (int) number of threads; if not set, the manifest files are read sequentially
    :return: generator of SimpleNamespace records
    """
    s3 = s3 or s3_client()

    if manifest.endswith("/"):
        manifest = find_latest_manifest(manifest, s3, **kw)

    info = s3_fetch(manifest, s3=s3, **kw)
    info = json.loads(info)

    must_have_keys = {"fileFormat", "fileSchema", "files", "destinationBucket"}
    missing_keys = must_have_keys - set(info)
    if missing_keys:
        raise ValueError("Manifest file haven't parsed correctly")

    if info["fileFormat"].upper() != "CSV":
        raise ValueError("Data is not in CSV format")

    s3_prefix = "s3://" + info["destinationBucket"].split(":")[-1] + "/"
    data_urls = [s3_prefix + f["key"] for f in info["files"]]
    schema = tuple(info["fileSchema"].split(", "))

    if n_threads:
        with ThreadPoolExecutor(max_workers=n_threads) as executor:
            tasks = [
                executor.submit(retrieve_manifest_files, key, s3, schema)
                for key in data_urls
            ]

            for future in as_completed(tasks):
                for namespace in future.result():
                    key = namespace.Key
                    if test_key(
                        key,
                        prefix=prefix,
                        suffix=suffix,
                        contains=contains,
                        multiple_contains=multiple_contains,
                    ):
                        yield namespace
    else:
        for u in data_urls:
            for namespace in retrieve_manifest_files(u, s3, schema):
                key = namespace.Key
                if test_key(
                    key,
                    prefix=prefix,
                    suffix=suffix,
                    contains=contains,
                    multiple_contains=multiple_contains,
                ):
                    yield namespace
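A usage sketch (the inventory location is hypothetical): stream the records and keep only JSON documents under a prefix.

s3 = s3_client(region_name="af-south-1")
for record in list_inventory(
    "s3://example-inventory-bucket/example-prefix/",  # a folder, so the latest manifest is picked
    s3=s3,
    prefix="sentinel-s2-l2a-cogs/",
    suffix=".json",
    n_threads=50,
):
    print(record.Key)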
Example #14
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences
    A report containing missing keys will be written to s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is
    :param update_stac: (bool) whether the report should list every scene from the source, forcing a full update
    :param notification_url: (str) optional Slack webhook URL used to send a notification
    """

    log = setup_logging()

    log.info("Task started")

    # defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}"
                             for key in source_keys)
        orphaned_keys = set()

    else:

        destination_keys = set(ns.Key for ns in list_inventory(
            manifest=f"{SENTINEL_2_INVENTORY_PATH}",
            prefix=BASE_FOLDER_NAME,
            contains=".json",
            n_threads=200,
        ))

        # Keys that are missing, they are in the source but not in the bucket
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys
                             if key not in destination_keys)

        # Keys that are lost, they are in the bucket but not found in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (f"{date_string}_gap_report.json" if not update_stac
                           else URL(f"{date_string}_gap_report_update.json"))

        log.info(
            f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps({
            "orphan": list(orphaned_keys),
            "missing": list(missing_scenes)
        })

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/status-report/{output_filename}"
    message = dedent(f"*SENTINEL 2 GAP REPORT - {environment}*\n"
                     f"Missing Scenes: {len(missing_scenes)}\n"
                     f"Orphan Scenes: {len(orphaned_keys)}\n"
                     f"Report: {report_http_link}\n")

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200
                            or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
def generate_buckets_diff(
    bucket_name: str,
    satellites: str,
    file_name: str,
    update_stac: bool = False,
    notification_url: str = None,
):
    """
    Compare the USGS bulk files with the Africa inventory bucket and detect differences.
    A report containing the missing keys will be written to AFRICA_S3_BUCKET_PATH.
    """

    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/")
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Create connection to the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)

    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")
    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download bulk file
    log.info("Download Bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL),
                                     file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)

    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 First {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []

    else:
        # collect missing scenes
        # missing scenes = keys that are in the bulk file but missing in PDS sync bucket and/or in source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # collect orphan scenes
        # orphan scenes = keys that are in PDS sync bucket but missing in the bulk file and/or in source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

        log.info(f"Found {len(missing_scenes)} missing scenes")
        log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
        log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
        log.info(
            f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        output_filename = (
            (f"{title}_{date_string}_gap_report.json" if not update_stac
             else URL(f"{date_string}_gap_report_update.json")).replace(
                 " ", "_").replace("_&", ""))

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps({
            "orphan": orphaned_scenes,
            "missing": missing_scenes
        })

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (str(landsat_status_report_url /
                         output_filename) if len(missing_scenes) > 0
                     or len(orphaned_scenes) > 0 else output_filename)

    message = dedent(f"*{title} GAP REPORT - {environment}*\n "
                     f"Missing Scenes: {len(missing_scenes)}\n"
                     f"Orphan Scenes: {len(orphaned_scenes)}\n"
                     f"Report: {report_output}\n")

    log.info(message)

    log.info(
        f"File {file_name} processed and sent in {time_process(start=start_timer)}"
    )

    if not update_stac and (len(missing_scenes) > 200
                            or len(orphaned_scenes) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url,
                                    f"{satellites} Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")