Example 1
def cli(skip_lineage, fail_on_missing_lineage, verify_lineage, uri, product):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher()

    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, False)

    # Extract URLs from the iterator output before passing them to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        product,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
Example 2
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)

    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri,
                                 skip_check=skip_check,
                                 s3=fetcher,
                                 **opts)

    # Extract URLs from the iterator output before passing them to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
Example 3
def fix_metadata(date, workers):
    uri = f"s3://dea-public-data/baseline/s2b_ard_granule/{date}/**/*.yaml"

    # List matching YAML documents and download them all up front
    fetcher = S3Fetcher(aws_unsigned=True)
    s3_obj_stream = s3_find_glob(uri, skip_check=True, s3=fetcher)
    s3_url_stream = (o.url for o in s3_obj_stream)
    data_stream = list(fetcher(s3_url_stream))

    # Fan the downloaded documents out to a pool of worker threads
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_dataset, s3_obj) for s3_obj in data_stream]

        for future in as_completed(futures):
            if future.exception() is not None:
                raise future.exception()
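The loop above only surfaces failures; a small variant of the same pattern that also gathers successful results is sketched below (collect_results is a hypothetical helper; the futures come from executor.submit exactly as in the example).

from concurrent.futures import as_completed

def collect_results(futures):
    # future.result() re-raises any exception raised inside the worker,
    # so failures still propagate while successful results are collected
    return [future.result() for future in as_completed(futures)]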
Example 4
def cli(uri, skip_check, no_sign_request=None, request_payer=False):
    """List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    flush_freq = 100

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    s3 = S3Fetcher(aws_unsigned=no_sign_request)

    try:
        stream = s3_find_glob(uri, skip_check=skip_check, s3=s3, **opts)
        for i, o in enumerate(stream):
            # Flush stdout periodically so results show up promptly when piped
            print(o.url, flush=(i % flush_freq == 0))
    except Exception as e:
        # Covers malformed URIs (ValueError from query parsing) as well as S3 errors
        click.echo(str(e), err=True)
        sys.exit(1)
Example 5
def cli(uri, skip_check):
    """ List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    def do_file_query(qq, pred):
        # Fixed-depth query: list directories at the requested depth, then
        # keep only the files that satisfy the predicate (e.g. a glob match)
        for d in s3.dir_dir(qq.base, qq.depth):
            _, _files = s3.list_dir(d).result()
            for f in _files:
                if pred(f):
                    yield f

    def do_file_query2(qq):
        # Fixed-depth query for an exact file name; with skip_check the URLs
        # are constructed directly, without a HEAD request per candidate
        fname = qq.file

        stream = s3.dir_dir(qq.base, qq.depth)

        if skip_check:
            yield from (SimpleNamespace(url=d + fname) for d in stream)
            return

        stream = (s3.head_object(d + fname) for d in stream)

        for (f, _), _ in future_results(stream, 32):
            if f is not None:
                yield f

    def do_dir_query(qq):
        # Fixed-depth query that returns directories rather than files
        return (SimpleNamespace(url=url)
                for url in s3.dir_dir(qq.base, qq.depth))

    flush_freq = 100

    try:
        qq = parse_query(uri)
    except ValueError as e:
        click.echo(str(e), err=True)
        sys.exit(1)

    s3 = S3Fetcher()

    glob_or_file = qq.glob or qq.file

    if qq.depth is None and glob_or_file is None:
        stream = s3.find(qq.base)
    elif qq.depth is None or qq.depth < 0:
        if qq.glob:
            stream = s3.find(qq.base, glob=qq.glob)
        elif qq.file:
            postfix = '/' + qq.file
            stream = s3.find(qq.base, pred=lambda o: o.url.endswith(postfix))
    else:
        # fixed depth query
        if qq.glob is not None:
            pred = norm_predicate(glob=qq.glob)
            stream = do_file_query(qq, pred)
        elif qq.file is not None:
            stream = do_file_query2(qq)
        else:
            stream = do_dir_query(qq)

    try:
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except Exception as e:
        print(f"ERROR: {str(e)}")
        sys.exit(1)
Example 6
def cli(n, verbose, gzip, xz, outfile, no_sign_request=None, request_payer=False):
    """ Fetch a bunch of s3 files into a tar archive.

    \b
    For every non-empty line in stdin
       - Treat line as a URI and fetch document from it
       - Write content of the file to a tar archive using `bucket-name/path/to/file` as file name
    """

    opts = {}
    if request_payer:
        opts['RequestPayer'] = 'requester'

    logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', level=logging.ERROR)

    nconnections = 24 if n is None else n
    exit_early = False

    def dump_to_tar(data_stream, tar):
        nonlocal exit_early
        fps = RateEstimator()

        for d in data_stream:
            fps()
            # Strip the "s3://" scheme so the entry name is bucket-name/path/to/file
            fname = d.url[5:]

            if d.data is not None:
                if verbose:
                    if fps.every(10):
                        print('.', file=stderr, end='', flush=True)

                    if fps.every(100):
                        print(' {}'.format(str(fps)), file=stderr)

                add_txt_file(tar, fname, d.data, last_modified=d.last_modified)
            else:
                print("Failed %s (%s)" % (d.url, str(d.error)),
                      file=stderr)

            if exit_early:
                break

        if verbose:
            print(' {}'.format(str(fps)), file=stderr)

    fetcher = S3Fetcher(nconcurrent=nconnections, aws_unsigned=no_sign_request)
    is_pipe = outfile == '-'
    tar_opts = dict(mode='w'+tar_mode(gzip=gzip, xz=xz, is_pipe=is_pipe))
    if is_pipe:
        if stdout.isatty():
            click.echo("Will not write to a terminal", err=True)
            sys.exit(1)
        # TODO: on windows switch stdout to binary mode
        tar_opts['fileobj'] = stdout.buffer
    else:
        tar_opts['name'] = outfile

    urls = read_stdin_lines(skip_empty=True)

    def on_ctrlc(sig, frame):
        nonlocal exit_early
        print('Shutting down...', file=sys.stderr)
        exit_early = True

    signal.signal(signal.SIGINT, on_ctrlc)

    with tarfile.open(**tar_opts) as tar:
        dump_to_tar(fetcher(urls, **opts), tar)

    fetcher.close()
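For completeness, a short sketch of reading back an archive produced this way (docs.tar.gz is a hypothetical file name; entries are named bucket-name/path/to/file as described in the docstring above):

import tarfile

with tarfile.open("docs.tar.gz", mode="r:gz") as tar:
    for member in tar:
        fobj = tar.extractfile(member)
        if fobj is not None:
            print(member.name, len(fobj.read()))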
Example 7
def s3_fetch_dss(base, product, glob="*.json", s3=None):
    if s3 is None:
        s3 = S3Fetcher(aws_unsigned=True)
    # Lazily fetch matching documents and convert each blob into a dataset
    blobs = s3(o.url for o in s3.find(base, glob=glob))
    dss = (blob2ds(b, product) for b in blobs)
    return dss
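A hedged usage sketch for the helper above (the bucket prefix, glob, and product name are hypothetical; assumes s3_fetch_dss is in scope and that blob2ds resolves each fetched document against the given datacube product, as its arguments suggest):

from datacube import Datacube

dc = Datacube()
product = dc.index.products.get_by_name("my_product")  # hypothetical product name
for ds in s3_fetch_dss("s3://my-bucket/collection/", product, glob="*.stac-item.json"):
    print(ds.id)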
Example 8
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    absolute,
    update,
    update_if_exists,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        if absolute:
            transform = stac_transform_absolute
        else:
            transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Check datacube connection and products
    dc = Datacube()
    odc_products = dc.list_products().name.values

    odc_products = set(odc_products)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: Requested Product/s {', '.join(missing_products)} {'is' if len(missing_products) == 1 else 'are'} "
            "not present in the ODC Database",
            file=sys.stderr,
        )
        sys.exit(1)

    # Get a generator from the supplied S3 URI for candidate documents
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    document_stream = stream_urls(
        s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts))

    added, failed = dump_to_odc(
        fetcher(document_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        update_if_exists=update_if_exists,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} datasets and failed {failed} datasets.")

    if failed > 0:
        sys.exit(failed)
Example 9
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)

    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri,
                                 skip_check=skip_check,
                                 s3=fetcher,
                                 **opts)

    # Extract URLs from the iterator output before passing them to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    odc_products = dc.list_products().name.values

    odc_products = set(odc_products)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: Requested Product/s {', '.join(missing_products)} {'is' if len(missing_products) == 1 else 'are'} "
            "not present in the ODC Database",
            file=sys.stderr,
        )
        sys.exit(1)

    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")

    if failed > 0:
        sys.exit(failed)
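The cli functions in these examples appear to be the bodies of click commands (note the \b markers and click.echo calls). For context, a hedged sketch of how such a command is typically wired up; the option names below are illustrative and not necessarily the exact flags of the real s3-to-dc tool:

import click

@click.command("s3-to-dc")
@click.option("--skip-lineage", is_flag=True, default=False)
@click.option("--no-sign-request", is_flag=True, default=False)
@click.argument("uri")
@click.argument("product")
def cli(skip_lineage, no_sign_request, uri, product):
    # A real implementation would follow the examples above;
    # this stub only echoes what it would do
    print(f"Would index '{product}' documents found under {uri}")

if __name__ == "__main__":
    cli()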