def cli(skip_lineage, fail_on_missing_lineage, verify_lineage, uri, product):
    """Iterate through files in an S3 bucket and add them to datacube"""
    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher()
    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, skip_check=False)
    # Extract URLs from the output of the iterator before passing to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)
    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAML documents
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        product,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
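# `dump_to_odc` is used above but not defined in this snippet. A minimal sketch of what
# it might do, assuming datacube's `Doc2Dataset` resolver, that `products` is a list of
# product names, and that each item yielded by the fetcher carries `.url` and `.data`
# (raw YAML bytes), as seen in the s3-to-tar code below. Illustrative only;
# `dump_to_odc_sketch` is a hypothetical name, not the actual implementation.
import sys

import yaml
from datacube.index.hl import Doc2Dataset


def dump_to_odc_sketch(document_stream, dc, products,
                       skip_lineage=False, fail_on_missing_lineage=False, verify_lineage=True):
    doc2ds = Doc2Dataset(
        dc.index,
        products=products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )

    added, failed = 0, 0
    for doc in document_stream:
        try:
            ds, err = doc2ds(yaml.safe_load(doc.data), doc.url)
            if ds is not None:
                dc.index.datasets.add(ds)
                added += 1
            else:
                print(f"Failed to resolve {doc.url}: {err}", file=sys.stderr)
                failed += 1
        except Exception as e:  # keep going so one bad document does not stop the stream
            print(f"Failed to index {doc.url}: {e}", file=sys.stderr)
            failed += 1
    return added, failed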
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """Iterate through files in an S3 bucket and add them to datacube"""
    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts)
    # Extract URLs from the output of the iterator before passing to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)
    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAML documents
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
def fix_metadata(date, workers):
    uri = f"s3://dea-public-data/baseline/s2b_ard_granule/{date}/**/*.yaml"

    fetcher = S3Fetcher(aws_unsigned=True)
    s3_obj_stream = s3_find_glob(uri, skip_check=True, s3=fetcher)
    s3_url_stream = (o.url for o in s3_obj_stream)
    data_stream = list(fetcher(s3_url_stream))

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_dataset, s3_obj) for s3_obj in data_stream]

        for future in as_completed(futures):
            if future.exception() is not None:
                raise future.exception()
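# `process_dataset` is not shown in this snippet. A placeholder sketch, assuming each
# fetched object exposes `.url` and `.data` (raw YAML bytes) as returned by S3Fetcher;
# the actual metadata fix applied to each document is not described here, so this
# stand-in only parses and validates it. `process_dataset_sketch` is a hypothetical name.
import yaml


def process_dataset_sketch(s3_obj):
    doc = yaml.safe_load(s3_obj.data)
    if doc is None:
        raise ValueError(f"Empty document: {s3_obj.url}")
    # A real implementation would patch the loaded metadata here and write the
    # corrected document back to S3.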
def cli(uri, skip_check, no_sign_request=None, request_payer=False):
    """List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    flush_freq = 100

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    s3 = S3Fetcher(aws_unsigned=no_sign_request)

    try:
        stream = s3_find_glob(uri, skip_check=skip_check, s3=s3, **opts)
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except ValueError as ve:
        click.echo(str(ve), err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(str(e), err=True)
        sys.exit(1)
def cli(uri, skip_check):
    """List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """

    def do_file_query(qq, pred):
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        return (SimpleNamespace(url=url) for url in s3.dir_dir(qq.base, qq.depth))

    flush_freq = 100

    try:
        qq = parse_query(uri)
    except ValueError as e:
        click.echo(str(e), err=True)
        sys.exit(1)

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
    else:
        # fixed depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    try:
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except Exception as e:
        print(f"ERROR: {str(e)}")
        sys.exit(1)
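# `norm_predicate` above builds the file filter for the fixed-depth query. A rough
# stand-in under the assumption that it only needs to match a listed object's file name
# against the glob; the real helper may differ (for example by matching the full URL).
# `glob_predicate_sketch` is a hypothetical name.
from fnmatch import fnmatch


def glob_predicate_sketch(glob):
    def pred(f):
        # `f` is a listing entry with a `.url` attribute, as yielded by list_dir
        return fnmatch(f.url.rsplit("/", 1)[-1], glob)

    return pred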
def cli(n, verbose, gzip, xz, outfile, no_sign_request=None, request_payer=False):
    """Fetch a bunch of s3 files into a tar archive.

    \b
    For every non-empty line in stdin
       - Treat line as a URI and fetch document from it
       - Write content of the file to a tar archive using `bucket-name/path/to/file` as file name
    """
    opts = {}
    if request_payer:
        opts['RequestPayer'] = 'requester'

    logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        level=logging.ERROR)

    nconnections = 24 if n is None else n
    exit_early = False

    def dump_to_tar(data_stream, tar):
        nonlocal exit_early
        fps = RateEstimator()

        for d in data_stream:
            fps()
            fname = d.url[5:]

            if d.data is not None:
                if verbose:
                    if fps.every(10):
                        print('.', file=stderr, end='', flush=True)

                    if fps.every(100):
                        print(' {}'.format(str(fps)), file=stderr)

                add_txt_file(tar, fname, d.data, last_modified=d.last_modified)
            else:
                print("Failed %s (%s)" % (d.url, str(d.error)), file=stderr)

            if exit_early:
                break

        if verbose:
            print(' {}'.format(str(fps)), file=stderr)

    fetcher = S3Fetcher(nconcurrent=nconnections, aws_unsigned=no_sign_request)
    is_pipe = outfile == '-'
    tar_opts = dict(mode='w' + tar_mode(gzip=gzip, xz=xz, is_pipe=is_pipe))
    if is_pipe:
        if stdout.isatty():
            click.echo("Will not write to a terminal", err=True)
            sys.exit(1)
        # TODO: on windows switch stdout to binary mode
        tar_opts['fileobj'] = stdout.buffer
    else:
        tar_opts['name'] = outfile

    urls = read_stdin_lines(skip_empty=True)

    def on_ctrlc(sig, frame):
        nonlocal exit_early
        print('Shutting down...', file=sys.stderr)
        exit_early = True

    signal.signal(signal.SIGINT, on_ctrlc)

    with tarfile.open(**tar_opts) as tar:
        dump_to_tar(fetcher(urls, **opts), tar)

    fetcher.close()
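# A small usage sketch for the archive produced above: stream the saved documents back
# out with the standard library. The archive path is hypothetical.
import tarfile


def read_tar_documents(path="docs.tar.gz"):
    with tarfile.open(path, mode="r:*") as tar:
        for member in tar:
            if member.isfile():
                # member.name is `bucket-name/path/to/file`, as written by dump_to_tar
                yield member.name, tar.extractfile(member).read()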
def s3_fetch_dss(base, product, glob="*.json", s3=None):
    if s3 is None:
        s3 = S3Fetcher(aws_unsigned=True)

    blobs = s3(o.url for o in s3.find(base, glob=glob))
    dss = (blob2ds(b, product) for b in blobs)

    return dss
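# `blob2ds` is referenced but not defined here. A rough sketch of the conversion it
# implies: parse the fetched JSON blob and resolve it into a Dataset via datacube's
# Doc2Dataset, assuming `product` is a product name and each blob carries `.url` and
# `.data`. Illustrative only; the real helper may construct the Dataset differently.
import json

from datacube import Datacube
from datacube.index.hl import Doc2Dataset


def blob2ds_sketch(blob, product, dc=None):
    dc = dc or Datacube()
    doc2ds = Doc2Dataset(dc.index, products=[product])
    ds, err = doc2ds(json.loads(blob.data), blob.url)
    if ds is None:
        raise ValueError(f"Could not resolve {blob.url}: {err}")
    return ds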
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    absolute,
    update,
    update_if_exists,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """Iterate through files in an S3 bucket and add them to datacube"""
    transform = None
    if stac:
        if absolute:
            transform = stac_transform_absolute
        else:
            transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Check datacube connection and products
    dc = Datacube()
    odc_products = set(dc.list_products().name.values)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: Requested product(s) {', '.join(missing_products)} "
            f"{'is' if len(missing_products) == 1 else 'are'} "
            "not present in the ODC Database",
            file=sys.stderr,
        )
        sys.exit(1)

    # Get a generator from the supplied S3 URI for candidate documents
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    document_stream = stream_urls(
        s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts)
    )

    added, failed = dump_to_odc(
        fetcher(document_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        update_if_exists=update_if_exists,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} datasets and failed {failed} datasets.")
    if failed > 0:
        sys.exit(failed)
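# `stream_urls` above adapts the s3_find_glob object stream into plain URLs for the
# fetcher, in the same way as the `(o.url for o in s3_obj_stream)` generator used in the
# other variants. An equivalent sketch, assuming each item exposes `.url`:
def stream_urls_sketch(items):
    for item in items:
        yield item.url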
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """Iterate through files in an S3 bucket and add them to datacube"""
    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts)
    # Extract URLs from the output of the iterator before passing to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)
    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAML documents
    dc = Datacube()
    odc_products = set(dc.list_products().name.values)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: Requested product(s) {', '.join(missing_products)} "
            f"{'is' if len(missing_products) == 1 else 'are'} "
            "not present in the ODC Database",
            file=sys.stderr,
        )
        sys.exit(1)

    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
    if failed > 0:
        sys.exit(failed)