def cli(config, inventory_manifest, queue_url, bucket, from_date, s3_keys=None):
    """Send messages (yaml s3 keys) to stac_queue"""
    with open(config, "r") as cfg_file:
        cfg = YAML.load(cfg_file)

    if not s3_keys:
        s3_client = make_s3_client()
        inventory_items = list_inventory(inventory_manifest, s3=s3_client)
        if from_date:
            inventory_items = (
                item
                for item in inventory_items
                if dateutil.parser.parse(item.LastModifiedDate) > from_date
            )
        s3_keys = yamls_in_inventory_list(inventory_items, cfg)
    else:
        # Filter out non-yaml keys
        s3_keys = [item for item in s3_keys if item.endswith(".yaml")]

    LOG.info("Sending %s update messages", len(s3_keys))
    messages_to_sqs(s3_keys, bucket, queue_url)
    LOG.info("Done")
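# A minimal, self-contained sketch of the from_date filter above. `Record` is
# a hypothetical stand-in for the entries list_inventory yields (real entries
# also carry Bucket, Size, etc.); the comparison works because both sides
# parse to timezone-aware datetimes.
import dateutil.parser
from collections import namedtuple

Record = namedtuple("Record", ["Key", "LastModifiedDate"])

items = [
    Record("old.yaml", "2020-01-01T00:00:00.000Z"),
    Record("new.yaml", "2020-06-01T00:00:00.000Z"),
]
from_date = dateutil.parser.parse("2020-03-01T00:00:00.000Z")
recent = [i for i in items if dateutil.parser.parse(i.LastModifiedDate) > from_date]
assert [i.Key for i in recent] == ["new.yaml"]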
def cli(inventory, prefix, regex, glob, aws_profile):
    """List S3 inventory entries.

    prefix can be combined with regex or glob pattern, but supplying
    both regex and glob doesn't make sense.

    \b
    Example:
       s3-inventory s3://my-inventory-bucket/path-to-inventory/ '*yaml'
    """

    def entry_to_url(entry):
        return 's3://{e.Bucket}/{e.Key}'.format(e=entry)

    flush_freq = 100
    s3 = make_s3_client(profile=aws_profile)

    if glob == '':
        glob = None

    if glob is not None and regex is not None:
        click.echo("Can not mix regex and shell patterns")
        sys.exit(1)

    if inventory is None:
        # TODO: read from config file
        inventory = 's3://dea-public-data-inventory/dea-public-data/dea-public-data-csv-inventory/'

    predicate = build_predicate(glob=glob, regex=regex, prefix=prefix)
    to_str = entry_to_url

    for i, entry in enumerate(list_inventory(inventory, s3=s3)):
        if predicate(entry):
            print(to_str(entry), flush=(i % flush_freq) == 0)
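# build_predicate comes from odc-tools. A hedged sketch of the matching
# semantics it appears to implement (a prefix check plus an optional glob OR
# regex); this is an illustration, not the library's actual implementation.
import fnmatch
import re

def build_predicate_sketch(glob=None, regex=None, prefix=None):
    def predicate(entry):
        key = entry.Key
        if prefix and not key.startswith(prefix):
            return False
        if glob is not None:
            return fnmatch.fnmatch(key, glob)
        if regex is not None:
            return re.search(regex, key) is not None
        return True

    return predicate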
def update_parent_catalogs(
    bucket,
    cfg,
    from_date,
    inventory_manifest,
    contents_file,
    s3_keys=None,
    dry_run=False,
):
    if contents_file is not None:
        with open(contents_file) as fin:
            s3_keys = [line.strip() for line in fin.readlines()]
    elif not s3_keys:
        s3_client = make_s3_client()
        inventory_items = list_inventory(inventory_manifest, s3=s3_client)
        if from_date:
            inventory_items = (
                item
                for item in inventory_items
                if dateutil.parser.parse(item.LastModifiedDate) > from_date
            )
        s3_keys = yamls_in_inventory_list(inventory_items, cfg)

    cu = StacCollections(cfg, dry_run)
    cu.add_items(s3_keys)
    cu.persist_all_catalogs(bucket, dry_run=dry_run)
def get_and_filter_cogs_keys():
    """
    Retrieve the key list from an inventory bucket and filter it.

    :return: set of S3 keys for African tiles in the current COG layout
    """
    s3 = s3_client(region_name=SOURCE_REGION)
    source_keys = list_inventory(
        manifest=f"{SOURCE_INVENTORY_PATH}",
        s3=s3,
        prefix=BASE_FOLDER_NAME,
        contains=".json",
        n_threads=200,
    )

    africa_tile_ids = set(
        pd.read_csv(
            "https://raw.githubusercontent.com/digitalearthafrica/deafrica-extent/master/deafrica-mgrs-tiles.csv.gz",
            header=None,
        ).values.ravel()
    )

    return set(
        key.Key
        for key in source_keys
        if (
            key.Key.split("/")[-2].split("_")[1] in africa_tile_ids
            # We need to ensure we're ignoring the old-format data
            and re.match(r"sentinel-s2-l2a-cogs/\d{4}/", key.Key) is None
        )
    )
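# How the tile-id filter above reads a key, shown on a hypothetical
# sentinel-cogs key in the current layout. The parent directory's second
# underscore-delimited token is the MGRS tile id, and the regex rejects the
# old layout that put a four-digit year right after the prefix.
import re

key = "sentinel-s2-l2a-cogs/36/J/TT/2020/6/S2A_36JTT_20200601_0_L2A/S2A_36JTT_20200601_0_L2A.json"
parent_dir = key.split("/")[-2]      # "S2A_36JTT_20200601_0_L2A"
tile_id = parent_dir.split("_")[1]   # "36JTT", checked against africa_tile_ids
assert re.match(r"sentinel-s2-l2a-cogs/\d{4}/", key) is None  # not old-format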
def cli(
    inventory,
    prefix,
    regex,
    glob,
    aws_profile,
    no_sign_request=None,
    request_payer=False,
):
    """List S3 inventory entries.

    prefix can be combined with regex or glob pattern, but supplying
    both regex and glob doesn't make sense.

    \b
    Example:
       s3-inventory s3://my-inventory-bucket/path-to-inventory/ '*yaml'
    """

    def entry_to_url(entry):
        return "s3://{e.Bucket}/{e.Key}".format(e=entry)

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    flush_freq = 100
    s3 = s3_client(profile=aws_profile, aws_unsigned=no_sign_request)

    if glob == "":
        glob = None

    if glob is not None and regex is not None:
        click.echo("Can not mix regex and shell patterns")
        sys.exit(1)

    if inventory is None:
        # TODO: read from config file
        inventory = "s3://dea-public-data-inventory/dea-public-data/dea-public-data-csv-inventory/"

    predicate = build_predicate(glob=glob, regex=regex, prefix=prefix)
    to_str = entry_to_url

    for i, entry in enumerate(list_inventory(inventory, s3=s3, **opts)):
        if predicate(entry):
            print(to_str(entry), flush=(i % flush_freq) == 0)
def delete_stac_catalog_parents(aws_product_prefix, bucket, inventory_bucket):
    s3_client = boto3.client("s3")
    delete_files = dict(Objects=[])
    for item in list_inventory(
        f"s3://{inventory_bucket}/{bucket}/{bucket}-csv-inventory/",
        s3=make_s3_client(),
    ):
        s3_key_file = PurePosixPath(item.Key)
        # Add to the delete list
        if s3_key_file.name == "catalog.json" and aws_product_prefix in item.Key:
            print(item.Key)
            delete_files["Objects"].append(dict(Key=item.Key))
            # Flush the delete list once the AWS per-request limit is reached
            if len(delete_files["Objects"]) >= AWS_DELETE_LIMIT:
                s3_client.delete_objects(Bucket=bucket, Delete=delete_files)
                delete_files = dict(Objects=[])

    # Flush whatever remains in the final partial batch
    if delete_files["Objects"]:
        s3_client.delete_objects(Bucket=bucket, Delete=delete_files)
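# The batch-and-flush pattern above in isolation: delete_objects accepts at
# most 1000 keys per call, which is presumably what AWS_DELETE_LIMIT encodes.
# The trailing flush must fire on any non-empty remainder, not only on a full
# batch. `batched` here is a hypothetical helper, not part of the scripts.
def batched(keys, limit=1000):
    batch = []
    for key in keys:
        batch.append(dict(Key=key))
        if len(batch) >= limit:
            yield batch
            batch = []
    if batch:  # final partial batch
        yield batch

# Usage sketch:
#   for objs in batched(keys_to_delete):
#       s3_client.delete_objects(Bucket=bucket, Delete=dict(Objects=objs))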
INVENTORY_BUCKET = "deafrica-sentinel-2-inventory"
PREFIX = "deafrica-sentinel-2/deafrica-sentinel-2-inventory/"
DO_FIX = False

if DO_FIX:
    client = s3_client(region_name="af-south-1")
else:
    client = s3_client(aws_unsigned=True, region_name="af-south-1")

manifest = find_latest_manifest(
    f"s3://{INVENTORY_BUCKET}/{PREFIX}",
    client,
)
inventory = list_inventory(manifest, s3=client)

report_every = 10000
count = 0
json_docs = 0
to_fix = 0

for obj in inventory:
    count += 1
    if count % report_every == 0:
        print(f"Processing {count}")
    if obj.Key.endswith(".json"):
        json_docs += 1
        o_dict = s3_head_object(f"s3://{obj.Bucket}/{obj.Key}", s3=client)
        if o_dict["ContentType"] != "application/json":
            # Tally objects whose ContentType needs correcting
            to_fix += 1
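# The snippet above is truncated before the fix step. A hedged sketch of what
# the DO_FIX branch would likely do: rewrite the object's ContentType in place
# with a self-copy, boto3's standard metadata-rewrite idiom. This is an
# assumption, not the original script's code.
if DO_FIX and o_dict["ContentType"] != "application/json":
    boto3.client("s3", region_name="af-south-1").copy_object(
        Bucket=obj.Bucket,
        Key=obj.Key,
        CopySource={"Bucket": obj.Bucket, "Key": obj.Key},
        ContentType="application/json",
        MetadataDirective="REPLACE",
    )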
def generate_buckets_diff(
    bucket_name: str,
    update_stac: bool = False,
    notification_url: str = None,
) -> None:
    """
    Compare Sentinel-2 buckets in US and Africa and detect differences.
    A report containing missing keys will be written to
    s3://deafrica-sentinel-2/status-report

    :param bucket_name: (str) Bucket where the gap report is stored
    :param update_stac: (bool) Whether the report should contain all scenes
        from the source, for a forced update
    :param notification_url: (str) Optional Slack URL, in case you want to
        send a Slack notification
    """
    log = setup_logging()
    log.info("Task started")

    # Defines where the report will be saved
    s2_status_report_path = URL(f"s3://{bucket_name}/status-report/")

    environment = "DEV" if "dev" in bucket_name else "PDS"
    log.info(f"Environment {environment}")

    date_string = datetime.now().strftime("%Y-%m-%d")

    # Retrieve keys from the inventory bucket
    source_keys = get_and_filter_cogs_keys()

    output_filename = "No missing scenes were found"

    if update_stac:
        log.info("FORCED UPDATE ACTIVE!")
        missing_scenes = set(f"s3://sentinel-cogs/{key}" for key in source_keys)
        orphaned_keys = set()
    else:
        destination_keys = set(
            ns.Key
            for ns in list_inventory(
                manifest=f"{SENTINEL_2_INVENTORY_PATH}",
                prefix=BASE_FOLDER_NAME,
                contains=".json",
                n_threads=200,
            )
        )

        # Keys that are missing: in the source but not in the destination bucket
        missing_scenes = set(
            f"s3://sentinel-cogs/{key}"
            for key in source_keys
            if key not in destination_keys
        )

        # Keys that are orphaned: in the destination bucket but not in the source
        orphaned_keys = destination_keys.difference(source_keys)

    s2_s3 = s3_client(region_name=SENTINEL_2_REGION)

    if len(missing_scenes) > 0 or len(orphaned_keys) > 0:
        output_filename = (
            f"{date_string}_gap_report.json"
            if not update_stac
            else f"{date_string}_gap_report_update.json"
        )

        log.info(f"File will be saved in {s2_status_report_path}/{output_filename}")
        missing_orphan_scenes_json = json.dumps(
            {"orphan": list(orphaned_keys), "missing": list(missing_scenes)}
        )

        s3_dump(
            data=missing_orphan_scenes_json,
            url=str(URL(s2_status_report_path) / output_filename),
            s3=s2_s3,
            ContentType="application/json",
        )

    report_http_link = (
        f"https://{bucket_name}.s3.{SENTINEL_2_REGION}.amazonaws.com"
        f"/status-report/{output_filename}"
    )
    message = dedent(
        f"*SENTINEL 2 GAP REPORT - {environment}*\n"
        f"Missing Scenes: {len(missing_scenes)}\n"
        f"Orphan Scenes: {len(orphaned_keys)}\n"
        f"Report: {report_http_link}\n"
    )
    log.info(message)

    if not update_stac and (len(missing_scenes) > 200 or len(orphaned_keys) > 200):
        if notification_url is not None:
            send_slack_notification(notification_url, "S2 Gap Report", message)
        raise Exception(f"More than 200 scenes were found \n {message}")
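# The gap logic above in miniature, with toy keys: missing is the set
# difference source minus destination (prefixed with the source bucket URL),
# orphaned is destination minus source.
source_keys = {"tiles/a.json", "tiles/b.json"}
destination_keys = {"tiles/b.json", "tiles/c.json"}
missing = {f"s3://sentinel-cogs/{k}" for k in source_keys if k not in destination_keys}
orphaned = destination_keys.difference(source_keys)
assert missing == {"s3://sentinel-cogs/tiles/a.json"}
assert orphaned == {"tiles/c.json"}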