def upload_metadata(granule_id):
    """
    Creates and uploads metadata in STAC and EO3 formats.
    :param granule_id: the id of the granule in format 'date/tile_id'
    :return: serialised STAC metadata and its destination S3 path
    """
    local_path = Path(NCI_DIR) / granule_id
    granule_s3_path = get_granule_s3_path(granule_id)

    s3_path = f"s3://{S3_BUCKET}/{granule_s3_path}/"
    s3_eo3_path = f"{s3_path}eo3-ARD-METADATA.yaml"
    s3_stac_path = f"{s3_path}stac-ARD-METADATA.json"

    eo3 = create_eo3(local_path, granule_id)
    stac = to_stac_item(
        eo3,
        stac_item_destination_url=s3_stac_path,
        odc_dataset_metadata_url=s3_eo3_path,
        dataset_location=s3_path,
    )
    stac_dump = json.dumps(stac, default=json_fallback, indent=4)

    s3_dump(
        yaml.safe_dump(serialise.to_doc(eo3), default_flow_style=False),
        s3_eo3_path,
        ACL="bucket-owner-full-control",
        ContentType="text/vnd.yaml",
    )

    return stac_dump, s3_stac_path
def upload_granule(granule_id, sns_topic_arn):
    """
    Syncs a granule to S3, uploads its metadata and notifies SNS.
    :param granule_id: the id of the granule in format 'date/tile_id'
    """
    session = boto3.session.Session()
    bucket_stac_path = f"{get_granule_s3_path(granule_id)}/stac-ARD-METADATA.json"

    if not check_granule_exists(S3_BUCKET, bucket_stac_path, session=session):
        sync_granule(
            granule_id,
            NCI_DIR,
            Path(get_granule_s3_path(granule_id)).parent.parent,
            S3_BUCKET,
            exclude=["NBAR/*", "ARD-METADATA.yaml", "*NBAR_CONTIGUITY.TIF"],
            cross_account=True,
        )
        stac_dump, s3_stac_path = upload_metadata(granule_id)

        message_attributes = get_common_message_attributes(json.loads(stac_dump))
        message_attributes.update(
            {"action": {"DataType": "String", "StringValue": "ADDED"}}
        )

        _LOG.info(f"Sending SNS. Granule id: {granule_id}")
        try:
            publish_sns(sns_topic_arn, stac_dump, message_attributes, session=session)
        except Exception as e:
            _LOG.error(f"SNS send failed: {e}. Granule id: {granule_id}")

        # Upload the STAC document last, so its presence marks a complete upload
        _LOG.info(f"Uploading STAC: {granule_id}")
        s3_dump(
            stac_dump,
            s3_stac_path,
            ACL="bucket-owner-full-control",
            ContentType="application/json",
        )
    else:
        _LOG.info(f"Granule {granule_id} already uploaded, skipping.")
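# Usage sketch (assumptions): iterate over granule ids and upload each one.
# "list_nci_granules" and the topic ARN below are hypothetical stand-ins, not part of
# the functions above; substitute whatever discovery mechanism and ARN you actually use.
def upload_all_granules(sns_topic_arn="arn:aws:sns:ap-southeast-2:123456789012:example-topic"):
    for granule_id in list_nci_granules():  # hypothetical helper returning 'date/tile_id' strings
        try:
            upload_granule(granule_id, sns_topic_arn)
        except Exception:
            # Keep going; one bad granule should not stop the whole run
            _LOG.exception(f"Failed to upload granule {granule_id}")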
def create_and_upload_stac(cog_file: Path, s3_dst: str, year) -> Item:
    out_path = URL(f"{s3_dst}/{year}/")

    log.info("Item base creation")
    item = create_stac_item(
        str(cog_file),
        id=str(odc_uuid("gmw", "2.0", [cog_file.name.replace("tif", "")])),
        with_proj=True,
        input_datetime=datetime(int(year), 12, 31),
        properties={
            "odc:product": "gmw",
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )

    log.info("links creation")
    item.set_self_href(str(out_path / f"gmw_{year}_stac-item.json"))
    item.add_links(
        [
            pystac.Link(
                target=str(SOURCE_URL_PATH / FILE_NAME.format(year=year)),
                title="Source file",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="application/zip",
            )
        ]
    )

    out_data = out_path / cog_file.name
    # Remove asset created by create_stac_item and add our own
    del item.assets["asset"]
    item.assets["mangrove"] = pystac.Asset(
        href=str(out_data),
        title="gmw-v1.0",
        media_type=pystac.MediaType.COG,
        roles=["data"],
    )
    log.info(f"Item created {item.to_dict()}")
    log.info(f"Item validated {item.validate()}")

    log.info(f"Dump the data to S3 {str(cog_file)}")
    s3_dump(
        data=open(str(cog_file), "rb").read(),
        url=str(out_data),
        ACL="bucket-owner-full-control",
        ContentType="image/tiff",
    )
    log.info(f"File written to {out_data}")

    log.info("Write STAC to S3")
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
    )
    log.info(f"STAC written to {item.self_href}")

    return item
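# Usage sketch (assumptions): run create_and_upload_stac over a local directory of yearly
# GMW COGs. The directory layout, file pattern and bucket name are assumptions; the
# function above only needs a COG path, a destination prefix and a year.
def upload_gmw_directory(local_dir: str, s3_dst: str = "s3://example-bucket/gmw"):
    for cog_file in sorted(Path(local_dir).glob("gmw_*.tif")):
        # Assumes file names like "gmw_2016.tif", so the year is the last underscore field
        year = cog_file.stem.split("_")[-1]
        item = create_and_upload_stac(cog_file, s3_dst, year)
        log.info(f"Uploaded {cog_file.name} as STAC item {item.id}")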
def process_dataset(s3_obj):
    s3_eo3_path = s3_obj.url
    s3_stac_path = s3_eo3_path.replace("eo3", "stac")
    s3_stac_path = s3_stac_path.replace("yaml", "json")
    s3_path = s3_eo3_path.replace("eo3-ARD-METADATA.yaml", "")
    granule = os.path.join(*s3_eo3_path.split("/")[5:-1])
    nci_path = os.path.join(NCI_DIR, *s3_eo3_path.split("/")[5:-1], "ARD-METADATA.yaml")

    if "S2A_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2A"
    elif "S2B_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2B"
    else:
        raise ValueError(
            f"Expected granule id to contain either 'S2A_OPER_MSI_ARD' or 'S2B_OPER_MSI_ARD', found '{granule}'"
        )

    with open(nci_path) as fin:
        eo_metadata = yaml.safe_load(fin)
    eo3_metadata = yaml.safe_load(s3_obj.data)

    eo3_metadata["properties"]["odc:region_code"] = eo_metadata["provider"]["reference_code"]
    eo3_metadata["properties"]["gqa:cep90"] = eo_metadata["gqa"]["residual"]["cep90"]
    eo3_metadata["properties"]["gqa:error_message"] = eo_metadata["gqa"]["error_message"]
    eo3_metadata["properties"]["gqa:final_gcp_count"] = eo_metadata["gqa"]["final_gcp_count"]
    eo3_metadata["properties"]["gqa:ref_source"] = eo_metadata["gqa"]["ref_source"]
    eo3_metadata["properties"]["sentinel:datatake_start_datetime"] = granule.split("_")[-4]
    eo3_metadata["properties"]["eo:platform"] = platform
    eo3_metadata["properties"]["eo:instrument"] = "MSI"

    for key in [
        "abs_iterative_mean",
        "abs",
        "iterative_mean",
        "iterative_stddev",
        "mean",
        "stddev",
    ]:
        eo3_metadata["properties"][f"gqa:{key}_xy"] = eo_metadata["gqa"]["residual"][key]["xy"]

    eo3 = serialise.from_doc(eo3_metadata)
    stac = to_stac_item(
        eo3,
        stac_item_destination_url=s3_stac_path,
        odc_dataset_metadata_url=s3_eo3_path,
        dataset_location=s3_path,
    )
    stac_dump = json.dumps(stac, default=json_fallback, indent=4)
    eo3_dump = yaml.safe_dump(eo3_metadata, default_flow_style=False)

    s3_dump(
        eo3_dump,
        s3_eo3_path,
        ACL="bucket-owner-full-control",
        ContentType="text/vnd.yaml",
    )
    s3_dump(
        stac_dump,
        s3_stac_path,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
    )
def upload_dataset_doc(src_yaml, s3_url):
    """
    Replace metadata with additional info and upload it to S3.
    :param src_yaml: metadata file on the NCI
    :param s3_url: S3 path to upload the metadata to
    """
    with open(src_yaml) as fin:
        nci_dataset = yaml.safe_load(fin)

    metadata_to_upload = munge_metadata(nci_dataset)

    s3_dump(yaml.safe_dump(metadata_to_upload, default_flow_style=False), s3_url, S3)
def replace_in_s3_obj(s3_url):
    try:
        original = s3_fetch(s3_url, s3)
    except ValueError as e:
        tqdm.write(str(e))
        return

    contents = original.replace(b"LANDSAT_8", b"LANDSAT_7")
    contents = contents.replace(b"OLI", b"ETM")

    if original != contents:
        s3_dump(contents, s3_url, s3)
        tqdm.write(".")
    else:
        tqdm.write(" - Skipped.")
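# Usage sketch (assumptions): apply replace_in_s3_obj to a batch of dataset documents.
# The list of URLs would in practice come from an S3 listing; a thread pool is used here
# because each call is a small fetch/replace/put round-trip, and tqdm is assumed to be
# imported as "from tqdm import tqdm" (consistent with the tqdm.write calls above).
from concurrent.futures import ThreadPoolExecutor

def replace_in_many(s3_urls):
    with ThreadPoolExecutor(max_workers=16) as pool:
        # Force the lazy map so all replacements run, with a progress bar
        list(tqdm(pool.map(replace_in_s3_obj, s3_urls), total=len(s3_urls)))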
def publish_to_s3(data: list, output_filename: str, content_type: str = "text/plain"):
    """
    Write report to S3.
    """
    s3 = s3_client(region_name=DEAFRICA_AWS_REGION)
    s3_dump(
        data=data,
        url=str(DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename),
        s3=s3,
        ContentType=content_type,
    )
    print(
        f"Report can be accessed from {DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename}"
    )
def upload_to_s3(s3_destination, files, log):
    log.info(f"Uploading to {s3_destination}")

    # Upload data
    for out_file in files:
        out_name = os.path.basename(out_file)
        dest = f"s3://{s3_destination}/{out_name}"
        log.info(f"Uploading file to {dest}")
        if "yaml" in out_name:
            content_type = "text/yaml"
        else:
            content_type = "image/tiff"
        s3_dump(
            data=open(out_file, "rb").read(),
            url=dest,
            ACL="bucket-owner-full-control",
            ContentType=content_type,
        )
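# Usage sketch (assumptions): push one tile's COG and metadata document with upload_to_s3.
# The destination prefix and local file names are illustrative only; setup_logging is
# assumed to be available here, as it is in the other functions in this section.
def upload_example_tile():
    tile_log = setup_logging()
    upload_to_s3(
        s3_destination="example-bucket/alos_palsar_mosaic/2020",
        files=["/tmp/work/N10E010_20_sl_HH.tif", "/tmp/work/N10E010_20.odc-metadata.yaml"],
        log=tile_log,
    )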
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = not (not exists or overwrite)
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")
        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = not (not exists or overwrite)
            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")
            # Aggressively heavy handed, but we get memory leaks otherwise
            client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    s3 = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=s3,
    )
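# Usage sketch (assumptions): build an annual low-resolution mosaic. The product name,
# band list and output root are placeholders; only the call shape matches create_mosaic above.
def run_example_mosaic():
    dc = Datacube(app="mosaic-example")
    create_mosaic(
        dc=dc,
        product="gm_s2_annual",             # assumed input product name
        out_product="gm_s2_annual_lowres",  # assumed output product name
        time=("2020-01-01", "2020-12-31"),
        time_str="2020",
        bands=("red", "green", "blue"),
        s3_output_root="s3://example-bucket/mosaics/",
        split_bands=True,
        resolution=120,
        overwrite=False,
    )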
def download_gls(year: str, s3_dst: str, workdir: Path, overwrite: bool = False):
    log = setup_logging()
    assets = {}
    out_stac = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    # Download the files
    for name, file in FILES.items():
        # Create a temporary directory to work with
        with TemporaryDirectory(prefix=workdir) as tmpdir:
            log.info(f"Working on {file}")
            url = URL(
                BASE_URL.format(
                    record_id=YEARS[year][1], year_key=YEARS[year][0], file=file
                )
            )

            dest_url = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}_{name}.tif"

            if s3_head_object(str(dest_url)) is None or overwrite:
                log.info(f"Downloading {url}")
                try:
                    local_file = Path(tmpdir) / str(url.name)
                    # Download the file
                    download_file(url, local_file)
                    log.info(f"Downloaded file to {local_file}")
                    local_file_small = translate_file_deafrica_extent(local_file)
                    log.info(f"Clipped Africa out and saved to {local_file_small}")
                    resampling = "nearest" if name in DO_NEAREST else "bilinear"

                    # Create a COG in memory and upload to S3
                    with MemoryFile() as mem_dst:
                        # Creating the COG, with a memory cache and no download. Shiny.
                        cog_translate(
                            local_file_small,
                            mem_dst.name,
                            cog_profiles.get("deflate"),
                            in_memory=True,
                            nodata=255,
                            overview_resampling=resampling,
                        )
                        mem_dst.seek(0)
                        s3_dump(mem_dst, str(dest_url), ACL="bucket-owner-full-control")
                        log.info(f"File written to {dest_url}")
                except Exception:
                    log.exception(f"Failed to process {url}")
                    exit(1)
            else:
                log.info(f"{dest_url} exists, skipping")

            assets[name] = pystac.Asset(
                href=str(dest_url), roles=["data"], media_type=pystac.MediaType.COG
            )

    # Write STAC document from the last-written file
    source_doc = f"https://zenodo.org/record/{YEARS[year][1]}"
    item = create_stac_item(
        str(dest_url),
        id=str(odc_uuid("Copernicus Global Land Cover", "3.0.1", [source_doc])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences.
    A report containing missing keys will be written to
    s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is
    :param update_stac: (bool) Define if the report will contain all scenes
        from the source for an update
    :param notification_url: (str) Optional Slack URL in case you want to send
        a Slack notification
    """
    log = setup_logging()

    log.info("Task started")

    # Defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from the inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys)
        orphaned_keys = set()
    else:
        destination_keys = set(
            ns.Key
            for ns in list_inventory(
                manifest=f"{SENTINEL_2_INVENTORY_PATH}",
                prefix=BASE_FOLDER_NAME,
                contains=".json",
                n_threads=200,
            )
        )

        # Keys that are missing: in the source but not in the destination bucket
        missing_scenes = set(
            f"s3://sentinel-cogs/{key}"
            for key in source_keys
            if key not in destination_keys
        )

        # Keys that are orphaned: in the destination bucket but not found in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (
            f"{date_string}_gap_report.json"
            if not update_stac
            else URL(f"{date_string}_gap_report_update.json")
        )

        log.info(f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps(
            {"orphan": list(orphaned_keys), "missing": list(missing_scenes)}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = (
        f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/"
        f"status-report/{output_filename}"
    )
    message = dedent(
        f"*SENTINEL 2 GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_keys)}\n"
        f"Report: {report_http_link}\n"
    )

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
def download_cci_lc(year: str, s3_dst: str, workdir: str, overwrite: bool = False):
    log = setup_logging()
    assets = {}

    cci_lc_version = get_version_from_year(year)
    name = f"{PRODUCT_NAME}_{year}_{cci_lc_version}"

    out_cog = URL(s3_dst) / year / f"{name}.tif"
    out_stac = URL(s3_dst) / year / f"{name}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    workdir = Path(workdir)
    if not workdir.exists():
        workdir.mkdir(parents=True, exist_ok=True)

    # Create a temporary directory to work with
    tmpdir = mkdtemp(prefix=str(f"{workdir}/"))
    log.info(f"Working on {year} in the path {tmpdir}")

    if s3_head_object(str(out_cog)) is None or overwrite:
        log.info(f"Downloading {year}")
        try:
            local_file = Path(tmpdir) / f"{name}.zip"
            if not local_file.exists():
                # Download the file
                c = cdsapi.Client()
                # We could also retrieve the object metadata from the CDS.
                # e.g. f = c.retrieve("series", {params}) | f.location = URL to download
                c.retrieve(
                    "satellite-land-cover",
                    {
                        "format": "zip",
                        "variable": "all",
                        "version": cci_lc_version,
                        "year": str(year),
                    },
                    local_file,
                )
                log.info(f"Downloaded file to {local_file}")
            else:
                log.info(f"File {local_file} exists, continuing without downloading")

            # Unzip the file
            log.info(f"Unzipping {local_file}")
            unzipped = None
            with zipfile.ZipFile(local_file, "r") as zip_ref:
                unzipped = local_file.parent / zip_ref.namelist()[0]
                zip_ref.extractall(tmpdir)

            # Process data
            ds = xr.open_dataset(unzipped)
            # Subset to Africa
            ulx, uly, lrx, lry = AFRICA_BBOX
            # Note: lats are upside down!
            ds_small = ds.sel(lat=slice(uly, lry), lon=slice(ulx, lrx))
            ds_small = assign_crs(ds_small, crs="epsg:4326")

            # Create cog (in memory - :mem: returns bytes object)
            mem_dst = write_cog(
                ds_small.lccs_class,
                ":mem:",
                nodata=0,
                overview_resampling="nearest",
            )

            # Write to s3
            s3_dump(mem_dst, str(out_cog), ACL="bucket-owner-full-control")
            log.info(f"File written to {out_cog}")
        except Exception:
            log.exception(f"Failed to process {name}")
            exit(1)
    else:
        log.info(f"{out_cog} exists, skipping")

    assets["classification"] = pystac.Asset(
        href=str(out_cog), roles=["data"], media_type=pystac.MediaType.COG
    )

    # Write STAC document
    source_doc = (
        "https://cds.climate.copernicus.eu/cdsapp#!/dataset/satellite-land-cover"
    )
    item = create_stac_item(
        str(out_cog),
        id=str(odc_uuid("Copernicus Land Cover", cci_lc_version, [source_doc, name])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
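# Usage sketch (assumptions): process several ESA CCI land cover years in sequence.
# The destination prefix, working directory and year list are placeholders.
def backfill_cci_lc(s3_dst="s3://example-bucket/cci_landcover", workdir="/tmp/cci-lc"):
    for year in ["2017", "2018", "2019"]:
        download_cci_lc(year=year, s3_dst=s3_dst, workdir=workdir, overwrite=False)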
def generate_buckets_diff(
    bucket_name: str,
    satellites: str,
    file_name: str,
    update_stac: bool = False,
    notification_url: str = None,
):
    """
    Compare USGS bulk files and the Africa inventory bucket, detecting differences.
    A report containing missing keys will be written to AFRICA_S3_BUCKET_PATH.
    """
    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/"
    )
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Create connection to the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)

    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download bulk file
    log.info("Download Bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL), file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)

    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 First {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []
    else:
        # Missing scenes = keys that are in the bulk file but missing in the
        # PDS sync bucket and/or in the source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # Orphan scenes = keys that are in the PDS sync bucket but missing in
        # the bulk file and/or in the source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

        log.info(f"Found {len(missing_scenes)} missing scenes")
        log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
        log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
        log.info(f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        output_filename = (
            (
                f"{title}_{date_string}_gap_report.json"
                if not update_stac
                else URL(f"{date_string}_gap_report_update.json")
            )
            .replace(" ", "_")
            .replace("_&", "")
        )

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps(
            {"orphan": orphaned_scenes, "missing": missing_scenes}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (
        str(landsat_status_report_url / output_filename)
        if len(missing_scenes) > 0 or len(orphaned_scenes) > 0
        else output_filename
    )

    message = dedent(
        f"*{title} GAP REPORT - {environment}*\n "
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_scenes)}\n"
        f"Report: {report_output}\n"
    )

    log.info(message)

    log.info(
        f"File {file_name} processed and sent in {time_process(start=start_timer)}"
    )

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_scenes) > 200):
        if notification_url is not None:
            send_slack_notification(
                notification_url, f"{satellites} Gap Report", message
            )
        raise Exception(f"More than 200 scenes were found \n {message}")
def download_and_cog_chirps(
    year: str,
    month: str,
    s3_dst: str,
    day: str = None,
    overwrite: bool = False,
    slack_url: str = None,
):
    # Cleaning and sanity checks
    s3_dst = s3_dst.rstrip("/")

    # Set up file strings
    if day is not None:
        # Set up a daily process
        in_file = f"chirps-v2.0.{year}.{month}.{day}.tif.gz"
        in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.{day}.tif"
            in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/{year}/{month}/chirps-v2.0_{year}.{month}.{day}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        start_datetime = f"{year}-{month}-{day}T00:00:00Z"
        end_datetime = f"{year}-{month}-{day}T23:59:59Z"
        product_name = "rainfall_chirps_daily"
    else:
        # Set up a monthly process
        in_file = f"chirps-v2.0.{year}.{month}.tif.gz"
        in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.tif"
            in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/chirps-v2.0_{year}.{month}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        _, end = calendar.monthrange(int(year), int(month))
        start_datetime = f"{year}-{month}-01T00:00:00Z"
        end_datetime = f"{year}-{month}-{end}T23:59:59Z"
        product_name = "rainfall_chirps_monthly"

        # Set to 15 for the STAC metadata
        day = 15

    try:
        # Check if file already exists
        log.info(f"Working on {in_file}")
        if not overwrite and s3_head_object(out_stac) is not None:
            log.warning(f"File {out_stac} already exists. Skipping.")
            return

        # COG and STAC
        with MemoryFile() as mem_dst:
            # Creating the COG, with a memory cache and no download. Shiny.
            cog_translate(
                in_data,
                mem_dst.name,
                cog_profiles.get("deflate"),
                in_memory=True,
                nodata=-9999,
            )
            # Creating the STAC document with appropriate date range
            _, end = calendar.monthrange(int(year), int(month))
            item = create_stac_item(
                mem_dst,
                id=str(odc_uuid("chirps", "2.0", [in_file])),
                with_proj=True,
                input_datetime=datetime(int(year), int(month), int(day)),
                properties={
                    "odc:processing_datetime": datetime_to_str(datetime.now()),
                    "odc:product": product_name,
                    "start_datetime": start_datetime,
                    "end_datetime": end_datetime,
                },
            )
            item.set_self_href(out_stac)
            # Manually redo the asset
            del item.assets["asset"]
            item.assets["rainfall"] = pystac.Asset(
                href=out_data,
                title="CHIRPS-v2.0",
                media_type=pystac.MediaType.COG,
                roles=["data"],
            )
            # Let's add a link to the source
            item.add_links(
                [
                    pystac.Link(
                        target=in_href,
                        title="Source file",
                        rel=pystac.RelType.DERIVED_FROM,
                        media_type="application/gzip",
                    )
                ]
            )

            # Dump the data to S3
            mem_dst.seek(0)
            log.info(f"Writing DATA to: {out_data}")
            s3_dump(mem_dst, out_data, ACL="bucket-owner-full-control")
            # Write STAC to S3
            log.info(f"Writing STAC to: {out_stac}")
            s3_dump(
                json.dumps(item.to_dict(), indent=2),
                out_stac,
                ContentType="application/json",
                ACL="bucket-owner-full-control",
            )
            # All done!
            log.info(f"Completed work on {in_file}")

    except Exception as e:
        message = f"Failed to handle {in_file} with error {e}"

        if slack_url is not None:
            send_slack_notification(slack_url, "Chirps Rainfall Monthly", message)
        log.exception(message)

        exit(1)
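# Usage sketch (assumptions): backfill one year of monthly CHIRPS rasters. Zero-padded
# month strings are assumed because month is interpolated directly into the file names;
# the year and destination prefix are placeholders.
def backfill_chirps_monthly(year="2020", s3_dst="s3://example-bucket/rainfall_chirps_monthly"):
    for month in [f"{m:02d}" for m in range(1, 13)]:
        download_and_cog_chirps(
            year=year,
            month=month,
            s3_dst=s3_dst,
            overwrite=False,
        )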
def write_stac(
    s3_destination: str, file_path: str, file_key: str, year: str, log: Logger
) -> str:
    region_code = file_key.split("_")[0]
    stac_href = f"s3://{s3_destination}/{file_key}.stac-item.json"
    log.info(f"Creating STAC file in memory, targeting here: {stac_href}")

    if int(year) > 2010:
        hhpath = f"{file_key}_sl_HH_F02DAR.tif"
        hvpath = f"{file_key}_sl_HV_F02DAR.tif"
        lincipath = f"{file_key}_sl_linci_F02DAR.tif"
        maskpath = f"{file_key}_sl_mask_F02DAR.tif"
        datepath = f"{file_key}_sl_date_F02DAR.tif"
        launch_date = "2014-05-24"
        shortname = "alos"
    else:
        hhpath = f"{file_key}_sl_HH.tif"
        hvpath = f"{file_key}_sl_HV.tif"
        lincipath = f"{file_key}_sl_linci.tif"
        maskpath = f"{file_key}_sl_mask.tif"
        datepath = f"{file_key}_sl_date.tif"
        if int(year) > 2000:
            launch_date = "2006-01-24"
            shortname = "alos"
        else:
            launch_date = "1992-02-11"
            shortname = "jers"

    if shortname == "alos":
        product_name = "alos_palsar_mosaic"
        platform = "ALOS/ALOS-2"
        instrument = "PALSAR/PALSAR-2"
        cf = "83.0 dB"
        bandpaths = {
            "hh": hhpath,
            "hv": hvpath,
            "linci": lincipath,
            "mask": maskpath,
            "date": datepath,
        }
    else:
        product_name = "jers_sar_mosaic"
        platform = "JERS-1"
        instrument = "SAR"
        cf = "84.66 dB"
        bandpaths = {
            "hh": hhpath,
            "linci": lincipath,
            "mask": maskpath,
            "date": datepath,
        }

    properties = {
        "odc:product": product_name,
        "odc:region_code": region_code,
        "platform": platform,
        "instruments": [instrument],
        "cf": cf,
        "launchdate": launch_date,
        "start_datetime": f"{year}-01-01T00:00:00Z",
        "end_datetime": f"{year}-12-31T23:59:59Z",
    }

    assets = {}
    for name, path in bandpaths.items():
        href = f"s3://{s3_destination}/{path}"
        assets[name] = pystac.Asset(
            href=href, media_type=pystac.MediaType.COG, roles=["data"]
        )

    item = create_stac_item(
        file_path,
        id=str(odc_uuid(shortname, "1", [], year=year, tile=file_key.split("_")[0])),
        properties=properties,
        assets=assets,
        with_proj=True,
    )
    item.set_self_href(stac_href)

    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        item.self_href,
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {item.self_href}")

    return stac_href
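# Usage sketch (assumptions): write STAC for one ALOS PALSAR mosaic tile whose COGs have
# already been uploaded with upload_to_s3 above. The tile key, year and paths are
# illustrative; setup_logging is assumed to be available, as in the other functions here.
def write_stac_for_tile():
    tile_log = setup_logging()
    return write_stac(
        s3_destination="example-bucket/alos_palsar_mosaic/2020",
        file_path="/tmp/work/N10E010_20_sl_HH_F02DAR.tif",  # local COG used for geometry
        file_key="N10E010_20",
        year="2020",
        log=tile_log,
    )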