def upload_metadata(granule_id):
    """
    Creates and uploads metadata in STAC and EO3 formats.
    :param granule_id: the id of the granule in format 'date/tile_id'
    :return: serialised STAC metadata and its destination S3 path
    """
    local_path = Path(NCI_DIR) / granule_id
    granule_s3_path = get_granule_s3_path(granule_id)

    s3_path = f"s3://{S3_BUCKET}/{granule_s3_path}/"
    s3_eo3_path = f"{s3_path}eo3-ARD-METADATA.yaml"
    s3_stac_path = f"{s3_path}stac-ARD-METADATA.json"

    eo3 = create_eo3(local_path, granule_id)
    stac = to_stac_item(
        eo3,
        stac_item_destination_url=s3_stac_path,
        odc_dataset_metadata_url=s3_eo3_path,
        dataset_location=s3_path,
    )
    stac_dump = json.dumps(stac, default=json_fallback, indent=4)

    s3_dump(
        yaml.safe_dump(serialise.to_doc(eo3), default_flow_style=False),
        s3_eo3_path,
        ACL="bucket-owner-full-control",
        ContentType="text/vnd.yaml",
    )

    return stac_dump, s3_stac_path
def upload_granule(granule_id, sns_topic_arn):
    """
    Syncs a granule to S3, uploads its metadata and notifies SNS.
    :param granule_id: the id of the granule in format 'date/tile_id'
    """
    session = boto3.session.Session()
    bucket_stac_path = f"{get_granule_s3_path(granule_id)}/stac-ARD-METADATA.json"

    if not check_granule_exists(S3_BUCKET, bucket_stac_path, session=session):
        sync_granule(
            granule_id,
            NCI_DIR,
            Path(get_granule_s3_path(granule_id)).parent.parent,
            S3_BUCKET,
            exclude=["NBAR/*", "ARD-METADATA.yaml", "*NBAR_CONTIGUITY.TIF"],
            cross_account=True,
        )
        stac_dump, s3_stac_path = upload_metadata(granule_id)

        message_attributes = get_common_message_attributes(json.loads(stac_dump))
        message_attributes.update(
            {"action": {"DataType": "String", "StringValue": "ADDED"}}
        )

        _LOG.info(f"Sending SNS. Granule id: {granule_id}")
        try:
            publish_sns(sns_topic_arn, stac_dump, message_attributes, session=session)
        except Exception as e:
            _LOG.error(f"SNS send failed: {e}. Granule id: {granule_id}")

        # Upload the STAC document last, so its presence marks a complete upload
        _LOG.info(f"Uploading STAC: {granule_id}")
        s3_dump(
            stac_dump,
            s3_stac_path,
            ACL="bucket-owner-full-control",
            ContentType="application/json",
        )
    else:
        _LOG.info(f"Granule {granule_id} already uploaded, skipping.")
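# Usage sketch (assumptions): iterate over granule ids and upload each one.
# "list_nci_granules" and the topic ARN below are hypothetical stand-ins, not part of
# the functions above; substitute whatever discovery mechanism and ARN you actually use.
def upload_all_granules(sns_topic_arn="arn:aws:sns:ap-southeast-2:123456789012:example-topic"):
    for granule_id in list_nci_granules():  # hypothetical helper returning 'date/tile_id' strings
        try:
            upload_granule(granule_id, sns_topic_arn)
        except Exception:
            # Keep going; one bad granule should not stop the whole run
            _LOG.exception(f"Failed to upload granule {granule_id}")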
def create_and_upload_stac(cog_file: Path, s3_dst: str, year) -> Item:
    out_path = URL(f"{s3_dst}/{year}/")

    log.info("Item base creation")
    item = create_stac_item(
        str(cog_file),
        id=str(odc_uuid("gmw", "2.0", [cog_file.name.replace("tif", "")])),
        with_proj=True,
        input_datetime=datetime(int(year), 12, 31),
        properties={
            "odc:product": "gmw",
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )

    log.info("links creation")
    item.set_self_href(str(out_path / f"gmw_{year}_stac-item.json"))
    item.add_links(
        [
            pystac.Link(
                target=str(SOURCE_URL_PATH / FILE_NAME.format(year=year)),
                title="Source file",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="application/zip",
            )
        ]
    )

    out_data = out_path / cog_file.name
    # Remove asset created by create_stac_item and add our own
    del item.assets["asset"]
    item.assets["mangrove"] = pystac.Asset(
        href=str(out_data),
        title="gmw-v1.0",
        media_type=pystac.MediaType.COG,
        roles=["data"],
    )
    log.info(f"Item created {item.to_dict()}")
    log.info(f"Item validated {item.validate()}")

    log.info(f"Dump the data to S3 {str(cog_file)}")
    s3_dump(
        data=open(str(cog_file), "rb").read(),
        url=str(out_data),
        ACL="bucket-owner-full-control",
        ContentType="image/tiff",
    )
    log.info(f"File written to {out_data}")

    log.info("Write STAC to S3")
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
    )
    log.info(f"STAC written to {item.self_href}")

    return item
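# Usage sketch (assumptions): run create_and_upload_stac over a local directory of yearly
# GMW COGs. The directory layout, file pattern and bucket name are assumptions; the
# function above only needs a COG path, a destination prefix and a year.
def upload_gmw_directory(local_dir: str, s3_dst: str = "s3://example-bucket/gmw"):
    for cog_file in sorted(Path(local_dir).glob("gmw_*.tif")):
        # Assumes file names like "gmw_2016.tif", so the year is the last underscore field
        year = cog_file.stem.split("_")[-1]
        item = create_and_upload_stac(cog_file, s3_dst, year)
        log.info(f"Uploaded {cog_file.name} as STAC item {item.id}")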
def process_dataset(s3_obj):
    s3_eo3_path = s3_obj.url
    s3_stac_path = s3_eo3_path.replace("eo3", "stac")
    s3_stac_path = s3_stac_path.replace("yaml", "json")
    s3_path = s3_eo3_path.replace("eo3-ARD-METADATA.yaml", "")
    granule = os.path.join(*s3_eo3_path.split("/")[5:-1])
    nci_path = os.path.join(NCI_DIR, *s3_eo3_path.split("/")[5:-1], "ARD-METADATA.yaml")

    if "S2A_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2A"
    elif "S2B_OPER_MSI_ARD" in granule:
        platform = "SENTINEL_2B"
    else:
        raise ValueError(
            f"Expected granule id to contain either 'S2A_OPER_MSI_ARD' or 'S2B_OPER_MSI_ARD', found '{granule}'"
        )

    with open(nci_path) as fin:
        eo_metadata = yaml.safe_load(fin)
    eo3_metadata = yaml.safe_load(s3_obj.data)

    eo3_metadata["properties"]["odc:region_code"] = eo_metadata["provider"]["reference_code"]
    eo3_metadata["properties"]["gqa:cep90"] = eo_metadata["gqa"]["residual"]["cep90"]
    eo3_metadata["properties"]["gqa:error_message"] = eo_metadata["gqa"]["error_message"]
    eo3_metadata["properties"]["gqa:final_gcp_count"] = eo_metadata["gqa"]["final_gcp_count"]
    eo3_metadata["properties"]["gqa:ref_source"] = eo_metadata["gqa"]["ref_source"]
    eo3_metadata["properties"]["sentinel:datatake_start_datetime"] = granule.split("_")[-4]
    eo3_metadata["properties"]["eo:platform"] = platform
    eo3_metadata["properties"]["eo:instrument"] = "MSI"

    for key in [
        "abs_iterative_mean",
        "abs",
        "iterative_mean",
        "iterative_stddev",
        "mean",
        "stddev",
    ]:
        eo3_metadata["properties"][f"gqa:{key}_xy"] = eo_metadata["gqa"]["residual"][key]["xy"]

    eo3 = serialise.from_doc(eo3_metadata)
    stac = to_stac_item(
        eo3,
        stac_item_destination_url=s3_stac_path,
        odc_dataset_metadata_url=s3_eo3_path,
        dataset_location=s3_path,
    )
    stac_dump = json.dumps(stac, default=json_fallback, indent=4)
    eo3_dump = yaml.safe_dump(eo3_metadata, default_flow_style=False)

    s3_dump(
        eo3_dump,
        s3_eo3_path,
        ACL="bucket-owner-full-control",
        ContentType="text/vnd.yaml",
    )
    s3_dump(
        stac_dump,
        s3_stac_path,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
    )
def upload_dataset_doc(src_yaml, s3_url):
    """
    Replace metadata with additional info and upload it to S3.
    :param src_yaml: metadata file on the NCI
    :param s3_url: S3 path to upload the metadata to
    """
    with open(src_yaml) as fin:
        nci_dataset = yaml.safe_load(fin)

    metadata_to_upload = munge_metadata(nci_dataset)

    s3_dump(yaml.safe_dump(metadata_to_upload, default_flow_style=False), s3_url, S3)
def replace_in_s3_obj(s3_url):
    try:
        original = s3_fetch(s3_url, s3)
    except ValueError as e:
        tqdm.write(str(e))
        return

    contents = original.replace(b"LANDSAT_8", b"LANDSAT_7")
    contents = contents.replace(b"OLI", b"ETM")

    if original != contents:
        s3_dump(contents, s3_url, s3)
        tqdm.write(".")
    else:
        tqdm.write(" - Skipped.")
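# Usage sketch (assumptions): apply replace_in_s3_obj to a batch of dataset documents.
# The list of URLs would in practice come from an S3 listing; a thread pool is used here
# because each call is a small fetch/replace/put round-trip, and tqdm is assumed to be
# imported as "from tqdm import tqdm" (consistent with the tqdm.write calls above).
from concurrent.futures import ThreadPoolExecutor

def replace_in_many(s3_urls):
    with ThreadPoolExecutor(max_workers=16) as pool:
        # Force the lazy map so all replacements run, with a progress bar
        list(tqdm(pool.map(replace_in_s3_obj, s3_urls), total=len(s3_urls)))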
def publish_to_s3(data: list, output_filename: str, content_type: str = "text/plain"):
    """
    Write report to S3.
    """
    s3 = s3_client(region_name=DEAFRICA_AWS_REGION)
    s3_dump(
        data=data,
        url=str(DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename),
        s3=s3,
        ContentType=content_type,
    )
    print(
        f"Report can be accessed from {DEAFRICA_ORPHAN_REPORT_S3_PATH / output_filename}"
    )
def upload_to_s3(s3_destination, files, log):
    log.info(f"Uploading to {s3_destination}")

    # Upload data
    for out_file in files:
        out_name = os.path.basename(out_file)
        dest = f"s3://{s3_destination}/{out_name}"
        log.info(f"Uploading file to {dest}")
        if "yaml" in out_name:
            content_type = "text/yaml"
        else:
            content_type = "image/tiff"
        s3_dump(
            data=open(out_file, "rb").read(),
            url=dest,
            ACL="bucket-owner-full-control",
            ContentType=content_type,
        )
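# Usage sketch (assumptions): push one tile's COG and metadata document with upload_to_s3.
# The destination prefix and local file names are illustrative only; setup_logging is
# assumed to be available here, as it is in the other functions in this section.
def upload_example_tile():
    tile_log = setup_logging()
    upload_to_s3(
        s3_destination="example-bucket/alos_palsar_mosaic/2020",
        files=["/tmp/work/N10E010_20_sl_HH.tif", "/tmp/work/N10E010_20.odc-metadata.yaml"],
        log=tile_log,
    )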
def create_mosaic(
    dc: Datacube,
    product: str,
    out_product: str,
    time: Tuple[str, str],
    time_str: str,
    bands: Tuple[str],
    s3_output_root: str,
    split_bands: bool = False,
    resolution: int = 120,
    overwrite: bool = False,
):
    log = setup_logging()
    log.info(f"Creating mosaic for {product} over {time}")

    client = start_local_dask()

    assets = {}
    data = dc.load(
        product=product,
        time=time,
        resolution=(-resolution, resolution),
        dask_chunks={"x": 2048, "y": 2048},
        measurements=bands,
    )

    # This is a bad idea, we run out of memory
    # data.persist()

    if not split_bands:
        log.info("Creating a single tif file")
        out_file = _get_path(s3_output_root, out_product, time_str, "tif")
        exists = s3_head_object(out_file) is not None
        skip_writing = not (not exists or overwrite)
        try:
            asset, _ = _save_opinionated_cog(
                data,
                out_file,
                skip_writing=skip_writing,
            )
        except ValueError:
            log.exception(
                "Failed to create COG, please check that you only have one timestep in the period."
            )
            exit(1)
        assets[bands[0]] = asset
        if skip_writing:
            log.info(f"File exists, and overwrite is False. Not writing {out_file}")
        else:
            log.info(f"Finished writing: {asset.href}")
    else:
        log.info("Creating multiple tif files")
        for band in bands:
            out_file = _get_path(
                s3_output_root, out_product, time_str, "tif", band=band
            )
            exists = s3_head_object(out_file) is not None
            skip_writing = not (not exists or overwrite)
            try:
                asset, band = _save_opinionated_cog(
                    data=data,
                    out_file=out_file,
                    band=band,
                    skip_writing=skip_writing,
                )
            except ValueError:
                log.exception(
                    "Failed to create COG, please check that you only have one timestep in the period."
                )
                exit(1)
            assets[band] = asset
            if skip_writing:
                log.info(f"File exists, and overwrite is False. Not writing {out_file}")
            else:
                log.info(f"Finished writing: {asset.href}")
            # Aggressively heavy handed, but we get memory leaks otherwise
            client.restart()

    out_stac_file = _get_path(s3_output_root, out_product, time_str, "stac-item.json")
    item = create_stac_item(
        assets[bands[0]].href,
        id=f"{product}_{time_str}",
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": out_product,
            "start_datetime": f"{time[0]}T00:00:00Z",
            "end_datetime": f"{time[1]}T23:59:59Z",
        },
    )
    item.set_self_href(out_stac_file)

    log.info(f"Writing STAC: {out_stac_file}")
    s3 = s3_client(aws_unsigned=False)
    s3_dump(
        data=json.dumps(item.to_dict(), indent=2),
        url=item.self_href,
        ACL="bucket-owner-full-control",
        ContentType="application/json",
        s3=s3,
    )
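# Usage sketch (assumptions): build an annual low-resolution mosaic. The product name,
# band list and output root are placeholders; only the call shape matches create_mosaic above.
def run_example_mosaic():
    dc = Datacube(app="mosaic-example")
    create_mosaic(
        dc=dc,
        product="gm_s2_annual",             # assumed input product name
        out_product="gm_s2_annual_lowres",  # assumed output product name
        time=("2020-01-01", "2020-12-31"),
        time_str="2020",
        bands=("red", "green", "blue"),
        s3_output_root="s3://example-bucket/mosaics/",
        split_bands=True,
        resolution=120,
        overwrite=False,
    )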
def download_gls(year: str, s3_dst: str, workdir: Path, overwrite: bool = False):
    log = setup_logging()
    assets = {}
    out_stac = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    # Download the files
    for name, file in FILES.items():
        # Create a temporary directory to work with
        with TemporaryDirectory(prefix=workdir) as tmpdir:
            log.info(f"Working on {file}")
            url = URL(
                BASE_URL.format(
                    record_id=YEARS[year][1], year_key=YEARS[year][0], file=file
                )
            )

            dest_url = URL(s3_dst) / year / f"{PRODUCT_NAME}_{year}_{name}.tif"

            if s3_head_object(str(dest_url)) is None or overwrite:
                log.info(f"Downloading {url}")
                try:
                    local_file = Path(tmpdir) / str(url.name)
                    # Download the file
                    download_file(url, local_file)
                    log.info(f"Downloaded file to {local_file}")
                    local_file_small = translate_file_deafrica_extent(local_file)
                    log.info(f"Clipped Africa out and saved to {local_file_small}")
                    resampling = "nearest" if name in DO_NEAREST else "bilinear"

                    # Create a COG in memory and upload to S3
                    with MemoryFile() as mem_dst:
                        # Creating the COG, with a memory cache and no download. Shiny.
                        cog_translate(
                            local_file_small,
                            mem_dst.name,
                            cog_profiles.get("deflate"),
                            in_memory=True,
                            nodata=255,
                            overview_resampling=resampling,
                        )
                        mem_dst.seek(0)
                        s3_dump(mem_dst, str(dest_url), ACL="bucket-owner-full-control")
                        log.info(f"File written to {dest_url}")
                except Exception:
                    log.exception(f"Failed to process {url}")
                    exit(1)
            else:
                log.info(f"{dest_url} exists, skipping")

            assets[name] = pystac.Asset(
                href=str(dest_url), roles=["data"], media_type=pystac.MediaType.COG
            )

    # Write STAC document from the last-written file
    source_doc = f"https://zenodo.org/record/{YEARS[year][1]}"
    item = create_stac_item(
        str(dest_url),
        id=str(odc_uuid("Copernicus Global Land Cover", "3.0.1", [source_doc])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences.
    A report containing missing keys will be written to
    s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is
    :param update_stac: (bool) Define if the report will contain all scenes
        from the source for an update
    :param notification_url: (str) Optional Slack URL in case you want to send
        a Slack notification
    """
    log = setup_logging()

    log.info("Task started")

    # Defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from the inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys)
        orphaned_keys = set()
    else:
        destination_keys = set(
            ns.Key
            for ns in list_inventory(
                manifest=f"{SENTINEL_2_INVENTORY_PATH}",
                prefix=BASE_FOLDER_NAME,
                contains=".json",
                n_threads=200,
            )
        )

        # Keys that are missing: in the source but not in the destination bucket
        missing_scenes = set(
            f"s3://sentinel-cogs/{key}"
            for key in source_keys
            if key not in destination_keys
        )

        # Keys that are orphaned: in the destination bucket but not found in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (
            f"{date_string}_gap_report.json"
            if not update_stac
            else URL(f"{date_string}_gap_report_update.json")
        )

        log.info(f"File will be saved in {s2_status_report_path}/{output_filename}")

        missing_orphan_scenes_json = json.dumps(
            {"orphan": list(orphaned_keys), "missing": list(missing_scenes)}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = (
        f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com/"
        f"status-report/{output_filename}"
    )
    message = dedent(
        f"*SENTINEL 2 GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_keys)}\n"
        f"Report: {report_http_link}\n"
    )

    log.info(message)

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
def download_cci_lc(year: str, s3_dst: str, workdir: str, overwrite: bool = False):
    log = setup_logging()
    assets = {}

    cci_lc_version = get_version_from_year(year)
    name = f"{PRODUCT_NAME}_{year}_{cci_lc_version}"

    out_cog = URL(s3_dst) / year / f"{name}.tif"
    out_stac = URL(s3_dst) / year / f"{name}.stac-item.json"

    if s3_head_object(str(out_stac)) is not None and not overwrite:
        log.info(f"{out_stac} exists, skipping")
        return

    workdir = Path(workdir)
    if not workdir.exists():
        workdir.mkdir(parents=True, exist_ok=True)

    # Create a temporary directory to work with
    tmpdir = mkdtemp(prefix=str(f"{workdir}/"))
    log.info(f"Working on {year} in the path {tmpdir}")

    if s3_head_object(str(out_cog)) is None or overwrite:
        log.info(f"Downloading {year}")
        try:
            local_file = Path(tmpdir) / f"{name}.zip"
            if not local_file.exists():
                # Download the file
                c = cdsapi.Client()
                # We could also retrieve the object metadata from the CDS.
                # e.g. f = c.retrieve("series", {params}) | f.location = URL to download
                c.retrieve(
                    "satellite-land-cover",
                    {
                        "format": "zip",
                        "variable": "all",
                        "version": cci_lc_version,
                        "year": str(year),
                    },
                    local_file,
                )
                log.info(f"Downloaded file to {local_file}")
            else:
                log.info(f"File {local_file} exists, continuing without downloading")

            # Unzip the file
            log.info(f"Unzipping {local_file}")
            unzipped = None
            with zipfile.ZipFile(local_file, "r") as zip_ref:
                unzipped = local_file.parent / zip_ref.namelist()[0]
                zip_ref.extractall(tmpdir)

            # Process data
            ds = xr.open_dataset(unzipped)
            # Subset to Africa
            ulx, uly, lrx, lry = AFRICA_BBOX
            # Note: lats are upside down!
            ds_small = ds.sel(lat=slice(uly, lry), lon=slice(ulx, lrx))
            ds_small = assign_crs(ds_small, crs="epsg:4326")

            # Create cog (in memory - :mem: returns bytes object)
            mem_dst = write_cog(
                ds_small.lccs_class,
                ":mem:",
                nodata=0,
                overview_resampling="nearest",
            )

            # Write to s3
            s3_dump(mem_dst, str(out_cog), ACL="bucket-owner-full-control")
            log.info(f"File written to {out_cog}")
        except Exception:
            log.exception(f"Failed to process {name}")
            exit(1)
    else:
        log.info(f"{out_cog} exists, skipping")

    assets["classification"] = pystac.Asset(
        href=str(out_cog), roles=["data"], media_type=pystac.MediaType.COG
    )

    # Write STAC document
    source_doc = (
        "https://cds.climate.copernicus.eu/cdsapp#!/dataset/satellite-land-cover"
    )
    item = create_stac_item(
        str(out_cog),
        id=str(odc_uuid("Copernicus Land Cover", cci_lc_version, [source_doc, name])),
        assets=assets,
        with_proj=True,
        properties={
            "odc:product": PRODUCT_NAME,
            "start_datetime": f"{year}-01-01T00:00:00Z",
            "end_datetime": f"{year}-12-31T23:59:59Z",
        },
    )
    item.add_links(
        [
            pystac.Link(
                target=source_doc,
                title="Source",
                rel=pystac.RelType.DERIVED_FROM,
                media_type="text/html",
            )
        ]
    )
    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        str(out_stac),
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {out_stac}")
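# Usage sketch (assumptions): process several ESA CCI land cover years in sequence.
# The destination prefix, working directory and year list are placeholders.
def backfill_cci_lc(s3_dst="s3://example-bucket/cci_landcover", workdir="/tmp/cci-lc"):
    for year in ["2017", "2018", "2019"]:
        download_cci_lc(year=year, s3_dst=s3_dst, workdir=workdir, overwrite=False)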
def generate_buckets_diff(
    bucket_name: str,
    satellites: str,
    file_name: str,
    update_stac: bool = False,
    notification_url: str = None,
):
    """
    Compare USGS bulk files and the Africa inventory bucket, detecting differences.
    A report containing missing keys will be written to AFRICA_S3_BUCKET_PATH.
    """
    log = setup_logging()

    start_timer = time.time()

    log.info("Task started")

    landsat_status_report_path = URL(f"s3://{bucket_name}/status-report/")
    landsat_status_report_url = URL(
        f"https://{bucket_name}.s3.af-south-1.amazonaws.com/status-report/"
    )
    environment = "DEV" if "dev" in bucket_name else "PDS"

    title = " & ".join(satellites).replace("ls", "Landsat ")

    log.info(f"Environment {environment}")
    log.info(f"Bucket Name {bucket_name}")
    log.info(f"Satellites {satellites}")
    log.info(f"File Name {file_name}")
    log.info(f"Update all ({update_stac})")
    log.info(f"Notification URL ({notification_url})")

    # Create connection to the inventory S3 bucket
    log.info(f"Retrieving keys from inventory bucket {LANDSAT_INVENTORY_PATH}")
    dest_paths = get_and_filter_keys(satellites=satellites)

    log.info(f"INVENTORY bucket number of objects {len(dest_paths)}")
    log.info(f"INVENTORY 10 first {list(dest_paths)[0:10]}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Download bulk file
    log.info("Download Bulk file")
    file_path = download_file_to_tmp(url=str(BASE_BULK_CSV_URL), file_name=file_name)

    # Retrieve keys from the bulk file
    log.info("Filtering keys from bulk file")
    source_paths = get_and_filter_keys_from_files(file_path)

    log.info(f"BULK FILE number of objects {len(source_paths)}")
    log.info(f"BULK 10 First {list(source_paths)[0:10]}")

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = source_paths
        orphaned_scenes = []
    else:
        # Missing scenes = keys that are in the bulk file but missing in the
        # PDS sync bucket and/or in the source bucket
        log.info("Filtering missing scenes")
        missing_scenes = [
            str(USGS_S3_BUCKET_PATH / path)
            for path in source_paths.difference(dest_paths)
        ]

        # Orphan scenes = keys that are in the PDS sync bucket but missing in
        # the bulk file and/or in the source bucket
        log.info("Filtering orphan scenes")
        orphaned_scenes = [
            str(URL(f"s3://{bucket_name}") / path)
            for path in dest_paths.difference(source_paths)
        ]

        log.info(f"Found {len(missing_scenes)} missing scenes")
        log.info(f"missing_scenes 10 first keys {list(missing_scenes)[0:10]}")
        log.info(f"Found {len(orphaned_scenes)} orphaned scenes")
        log.info(f"orphaned_scenes 10 first keys {list(orphaned_scenes)[0:10]}")

    landsat_s3 = s3_client(region_name="af-south-1")

    if len(missing_scenes) > 0 or len(orphaned_scenes) > 0:
        output_filename = (
            (
                f"{title}_{date_string}_gap_report.json"
                if not update_stac
                else URL(f"{date_string}_gap_report_update.json")
            )
            .replace(" ", "_")
            .replace("_&", "")
        )

        log.info(
            f"Report file will be saved in {landsat_status_report_path / output_filename}"
        )
        missing_orphan_scenes_json = json.dumps(
            {"orphan": orphaned_scenes, "missing": missing_scenes}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(landsat_status_report_path / output_filename),
            s3=landsat_s3,
            ContentType="application/json",
        )

    report_output = (
        str(landsat_status_report_url / output_filename)
        if len(missing_scenes) > 0 or len(orphaned_scenes) > 0
        else output_filename
    )

    message = dedent(
        f"*{title} GAP REPORT - {environment}*\n "
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_scenes)}\n"
        f"Report: {report_output}\n"
    )

    log.info(message)

    log.info(
        f"File {file_name} processed and sent in {time_process(start=start_timer)}"
    )

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_scenes) > 200):
        if notification_url is not None:
            send_slack_notification(
                notification_url, f"{satellites} Gap Report", message
            )
        raise Exception(f"More than 200 scenes were found \n {message}")
def download_and_cog_chirps(
    year: str,
    month: str,
    s3_dst: str,
    day: str = None,
    overwrite: bool = False,
    slack_url: str = None,
):
    # Cleaning and sanity checks
    s3_dst = s3_dst.rstrip("/")

    # Set up file strings
    if day is not None:
        # Set up a daily process
        in_file = f"chirps-v2.0.{year}.{month}.{day}.tif.gz"
        in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.{day}.tif"
            in_href = DAILY_URL_TEMPLATE.format(year=year, in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/{year}/{month}/chirps-v2.0_{year}.{month}.{day}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        start_datetime = f"{year}-{month}-{day}T00:00:00Z"
        end_datetime = f"{year}-{month}-{day}T23:59:59Z"
        product_name = "rainfall_chirps_daily"
    else:
        # Set up a monthly process
        in_file = f"chirps-v2.0.{year}.{month}.tif.gz"
        in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
        in_data = f"/vsigzip//vsicurl/{in_href}"
        if not check_for_url_existence(in_href):
            log.warning("Couldn't find the gzipped file, trying the .tif")
            in_file = f"chirps-v2.0.{year}.{month}.tif"
            in_href = MONTHLY_URL_TEMPLATE.format(in_file=in_file)
            in_data = f"/vsicurl/{in_href}"

            if not check_for_url_existence(in_href):
                log.error("Couldn't find the .tif file either, aborting")
                sys.exit(1)

        file_base = f"{s3_dst}/chirps-v2.0_{year}.{month}"
        out_data = f"{file_base}.tif"
        out_stac = f"{file_base}.stac-item.json"

        _, end = calendar.monthrange(int(year), int(month))
        start_datetime = f"{year}-{month}-01T00:00:00Z"
        end_datetime = f"{year}-{month}-{end}T23:59:59Z"
        product_name = "rainfall_chirps_monthly"

        # Set to 15 for the STAC metadata
        day = 15

    try:
        # Check if file already exists
        log.info(f"Working on {in_file}")
        if not overwrite and s3_head_object(out_stac) is not None:
            log.warning(f"File {out_stac} already exists. Skipping.")
            return

        # COG and STAC
        with MemoryFile() as mem_dst:
            # Creating the COG, with a memory cache and no download. Shiny.
            cog_translate(
                in_data,
                mem_dst.name,
                cog_profiles.get("deflate"),
                in_memory=True,
                nodata=-9999,
            )
            # Creating the STAC document with appropriate date range
            _, end = calendar.monthrange(int(year), int(month))
            item = create_stac_item(
                mem_dst,
                id=str(odc_uuid("chirps", "2.0", [in_file])),
                with_proj=True,
                input_datetime=datetime(int(year), int(month), int(day)),
                properties={
                    "odc:processing_datetime": datetime_to_str(datetime.now()),
                    "odc:product": product_name,
                    "start_datetime": start_datetime,
                    "end_datetime": end_datetime,
                },
            )
            item.set_self_href(out_stac)
            # Manually redo the asset
            del item.assets["asset"]
            item.assets["rainfall"] = pystac.Asset(
                href=out_data,
                title="CHIRPS-v2.0",
                media_type=pystac.MediaType.COG,
                roles=["data"],
            )
            # Let's add a link to the source
            item.add_links(
                [
                    pystac.Link(
                        target=in_href,
                        title="Source file",
                        rel=pystac.RelType.DERIVED_FROM,
                        media_type="application/gzip",
                    )
                ]
            )

            # Dump the data to S3
            mem_dst.seek(0)
            log.info(f"Writing DATA to: {out_data}")
            s3_dump(mem_dst, out_data, ACL="bucket-owner-full-control")
            # Write STAC to S3
            log.info(f"Writing STAC to: {out_stac}")
            s3_dump(
                json.dumps(item.to_dict(), indent=2),
                out_stac,
                ContentType="application/json",
                ACL="bucket-owner-full-control",
            )
            # All done!
            log.info(f"Completed work on {in_file}")

    except Exception as e:
        message = f"Failed to handle {in_file} with error {e}"

        if slack_url is not None:
            send_slack_notification(slack_url, "Chirps Rainfall Monthly", message)
        log.exception(message)

        exit(1)
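# Usage sketch (assumptions): backfill one year of monthly CHIRPS rasters. Zero-padded
# month strings are assumed because month is interpolated directly into the file names;
# the year and destination prefix are placeholders.
def backfill_chirps_monthly(year="2020", s3_dst="s3://example-bucket/rainfall_chirps_monthly"):
    for month in [f"{m:02d}" for m in range(1, 13)]:
        download_and_cog_chirps(
            year=year,
            month=month,
            s3_dst=s3_dst,
            overwrite=False,
        )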
def write_stac(
    s3_destination: str, file_path: str, file_key: str, year: str, log: Logger
) -> str:
    region_code = file_key.split("_")[0]
    stac_href = f"s3://{s3_destination}/{file_key}.stac-item.json"
    log.info(f"Creating STAC file in memory, targeting here: {stac_href}")

    if int(year) > 2010:
        hhpath = f"{file_key}_sl_HH_F02DAR.tif"
        hvpath = f"{file_key}_sl_HV_F02DAR.tif"
        lincipath = f"{file_key}_sl_linci_F02DAR.tif"
        maskpath = f"{file_key}_sl_mask_F02DAR.tif"
        datepath = f"{file_key}_sl_date_F02DAR.tif"
        launch_date = "2014-05-24"
        shortname = "alos"
    else:
        hhpath = f"{file_key}_sl_HH.tif"
        hvpath = f"{file_key}_sl_HV.tif"
        lincipath = f"{file_key}_sl_linci.tif"
        maskpath = f"{file_key}_sl_mask.tif"
        datepath = f"{file_key}_sl_date.tif"
        if int(year) > 2000:
            launch_date = "2006-01-24"
            shortname = "alos"
        else:
            launch_date = "1992-02-11"
            shortname = "jers"

    if shortname == "alos":
        product_name = "alos_palsar_mosaic"
        platform = "ALOS/ALOS-2"
        instrument = "PALSAR/PALSAR-2"
        cf = "83.0 dB"
        bandpaths = {
            "hh": hhpath,
            "hv": hvpath,
            "linci": lincipath,
            "mask": maskpath,
            "date": datepath,
        }
    else:
        product_name = "jers_sar_mosaic"
        platform = "JERS-1"
        instrument = "SAR"
        cf = "84.66 dB"
        bandpaths = {
            "hh": hhpath,
            "linci": lincipath,
            "mask": maskpath,
            "date": datepath,
        }

    properties = {
        "odc:product": product_name,
        "odc:region_code": region_code,
        "platform": platform,
        "instruments": [instrument],
        "cf": cf,
        "launchdate": launch_date,
        "start_datetime": f"{year}-01-01T00:00:00Z",
        "end_datetime": f"{year}-12-31T23:59:59Z",
    }

    assets = {}
    for name, path in bandpaths.items():
        href = f"s3://{s3_destination}/{path}"
        assets[name] = pystac.Asset(
            href=href, media_type=pystac.MediaType.COG, roles=["data"]
        )

    item = create_stac_item(
        file_path,
        id=str(odc_uuid(shortname, "1", [], year=year, tile=file_key.split("_")[0])),
        properties=properties,
        assets=assets,
        with_proj=True,
    )
    item.set_self_href(stac_href)

    s3_dump(
        json.dumps(item.to_dict(), indent=2),
        item.self_href,
        ContentType="application/json",
        ACL="bucket-owner-full-control",
    )
    log.info(f"STAC written to {item.self_href}")

    return stac_href
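# Usage sketch (assumptions): write STAC for one ALOS PALSAR mosaic tile whose COGs have
# already been uploaded with upload_to_s3 above. The tile key, year and paths are
# illustrative; setup_logging is assumed to be available, as in the other functions here.
def write_stac_for_tile():
    tile_log = setup_logging()
    return write_stac(
        s3_destination="example-bucket/alos_palsar_mosaic/2020",
        file_path="/tmp/work/N10E010_20_sl_HH_F02DAR.tif",  # local COG used for geometry
        file_key="N10E010_20",
        year="2020",
        log=tile_log,
    )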