def cli(skip_lineage, fail_on_missing_lineage, verify_lineage, uri, product):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher()
    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, False)

    # Extract URLs from the output of the iterator before passing to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAML documents
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        product,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
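
# The dump_to_odc helper called above is not shown here. The sketch below is a
# hedged reconstruction of what such a helper might look like, not the actual
# implementation: it assumes each item yielded by the fetcher exposes the raw
# document bytes as `.data` and its source URL as `.url`, and that documents
# are resolved into datasets with Doc2Dataset from datacube.index.hl.
import sys

import yaml
from datacube.index.hl import Doc2Dataset


def dump_to_odc(
    document_stream,
    dc,
    product,
    skip_lineage=False,
    fail_on_missing_lineage=False,
    verify_lineage=True,
):
    """Parse fetched YAML documents and index them into the datacube."""
    doc2ds = Doc2Dataset(
        dc.index,
        products=[product] if product else None,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )

    added, failed = 0, 0
    for result in document_stream:
        try:
            # Resolve the document into a Dataset and add it to the index
            dataset, err = doc2ds(yaml.safe_load(result.data), result.url)
            if dataset is not None:
                dc.index.datasets.add(dataset)
                added += 1
            else:
                print(f"Failed to resolve {result.url}: {err}", file=sys.stderr)
                failed += 1
        except Exception as e:  # keep going so one bad document does not stop the run
            print(f"Failed to index {result.url}: {e}", file=sys.stderr)
            failed += 1

    return added, failed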
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts)

    # Extract URLs from the output of the iterator before passing to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAML documents
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
def fix_metadata(date, workers):
    uri = f"s3://dea-public-data/baseline/s2b_ard_granule/{date}/**/*.yaml"

    # Find and fetch all metadata documents matching the given date
    fetcher = S3Fetcher(aws_unsigned=True)
    s3_obj_stream = s3_find_glob(uri, skip_check=True, s3=fetcher)
    s3_url_stream = (o.url for o in s3_obj_stream)
    data_stream = list(fetcher(s3_url_stream))

    # Process the fetched documents concurrently, re-raising the first failure
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_dataset, s3_obj) for s3_obj in data_stream]

        for future in as_completed(futures):
            if future.exception() is not None:
                raise future.exception()
def cli(uri, skip_check, no_sign_request=None, request_payer=False):
    """List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    flush_freq = 100

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    s3 = S3Fetcher(aws_unsigned=no_sign_request)

    try:
        stream = s3_find_glob(uri, skip_check=skip_check, s3=s3, **opts)
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except ValueError as ve:
        click.echo(str(ve), err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(str(e), err=True)
        sys.exit(1)
def cli(uri, skip_check):
    """ List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    flush_freq = 100

    try:
        stream = s3_find_glob(uri, skip_check)
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except ValueError as ve:
        click.echo(str(ve), err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(str(e), err=True)
        sys.exit(1)
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    absolute,
    update,
    update_if_exists,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        if absolute:
            transform = stac_transform_absolute
        else:
            transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Check datacube connection and products
    dc = Datacube()
    odc_products = dc.list_products().name.values

    odc_products = set(odc_products)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: Requested Product/s {', '.join(missing_products)} "
            f"{'is' if len(missing_products) == 1 else 'are'} "
            "not present in the ODC Database",
            file=sys.stderr,
        )
        sys.exit(1)

    # Get a generator from supplied S3 URI for candidate documents
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    document_stream = stream_urls(
        s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts)
    )

    added, failed = dump_to_odc(
        fetcher(document_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        update_if_exists=update_if_exists,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} datasets and failed {failed} datasets.")

    if failed > 0:
        sys.exit(failed)
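
# stream_urls is not defined in this file. A minimal sketch, assuming it plays
# the same role as the inline `(o.url for o in s3_obj_stream)` generator
# expressions used in the earlier variants of this function:
def stream_urls(urls):
    """Yield the URL of each object found by s3_find_glob."""
    for url in urls:
        yield url.url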
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts)

    # Extract URLs from the output of the iterator before passing to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAML documents
    dc = Datacube()
    odc_products = dc.list_products().name.values

    odc_products = set(odc_products)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: Requested Product/s {', '.join(missing_products)} "
            f"{'is' if len(missing_products) == 1 else 'are'} "
            "not present in the ODC Database",
            file=sys.stderr,
        )
        sys.exit(1)

    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")

    if failed > 0:
        sys.exit(failed)