Example 1
def cli(skip_lineage, fail_on_missing_lineage, verify_lineage, uri, product):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher()

    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri, skip_check=False)

    # Extract URLs from the iterator output before passing them to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        product,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
Example 2
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)

    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri,
                                 skip_check=skip_check,
                                 s3=fetcher,
                                 **opts)

    # Extract URLs from the iterator output before passing them to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Consume the generator and fetch the YAMLs
    dc = Datacube()
    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")
Example 3
def fix_metadata(date, workers):
    uri = f"s3://dea-public-data/baseline/s2b_ard_granule/{date}/**/*.yaml"

    fetcher = S3Fetcher(aws_unsigned=True)
    s3_obj_stream = s3_find_glob(uri, skip_check=True, s3=fetcher)
    s3_url_stream = (o.url for o in s3_obj_stream)
    data_stream = list(fetcher(s3_url_stream))
    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(process_dataset, s3_obj) for s3_obj in data_stream]
        
        for future in as_completed(futures):
            exc = future.exception()
            if exc is not None:
                raise exc
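Example 3 references a process_dataset helper that is not shown. A hypothetical stub illustrating the expected shape (the attribute carrying the fetched document bytes is an assumption here):

import yaml

def process_dataset(s3_obj):
    # Hypothetical sketch: the fetcher yields objects carrying the fetched
    # document (a .data attribute holding bytes is assumed); the real helper
    # would parse the YAML payload and apply the metadata fix.
    doc = yaml.safe_load(s3_obj.data)
    # ... repair fields in `doc` and persist the result ...
    return doc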
Example 4
def cli(uri, skip_check, no_sign_request=None, request_payer=False):
    """List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    flush_freq = 100

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    s3 = S3Fetcher(aws_unsigned=no_sign_request)

    try:
        stream = s3_find_glob(uri, skip_check=skip_check, s3=s3, **opts)
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except ValueError as ve:
        click.echo(str(ve), err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(str(e), err=True)
        sys.exit(1)
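A hypothetical invocation exercising the unsigned-request and Requester Pays options this variant adds (flag spellings are assumed from the parameter names; the docstring above shows the basic glob patterns):

# List matching keys in a Requester Pays bucket using unsigned requests
s3-find --no-sign-request --request-payer 's3://mybucket/some/path/**/*.yaml'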
Example 5
def cli(uri, skip_check):
    """ List files on S3 bucket.

    Example:

       \b
       List files in directory that match `*yaml`
        > s3-find 's3://mybucket/some/path/*yaml'

       \b
       List files in directory and all sub-directories that match `*yaml`
        > s3-find 's3://mybucket/some/path/**/*yaml'

       \b
       List files that match `*yaml` 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/*yaml'

       \b
       List directories 2 levels deep from known path
        > s3-find 's3://mybucket/some/path/*/*/'

       \b
       List all files named `metadata.yaml` 2 directories deep
        > s3-find 's3://mybucket/some/path/*/*/metadata.yaml'
    """
    flush_freq = 100

    try:
        stream = s3_find_glob(uri, skip_check)
        for i, o in enumerate(stream):
            print(o.url, flush=(i % flush_freq == 0))
    except ValueError as ve:
        click.echo(str(ve), err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(str(e), err=True)
        sys.exit(1)
Example 6
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    absolute,
    update,
    update_if_exists,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        if absolute:
            transform = stac_transform_absolute
        else:
            transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Check the datacube connection and confirm the requested products exist
    dc = Datacube()
    odc_products = set(dc.list_products().name.values)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: requested product(s) {', '.join(missing_products)} "
            f"{'is' if len(missing_products) == 1 else 'are'} not present in the ODC database",
            file=sys.stderr,
        )
        sys.exit(1)

    # Get a generator from the supplied S3 URI for candidate documents
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)
    document_stream = stream_urls(
        s3_find_glob(uri, skip_check=skip_check, s3=fetcher, **opts))

    added, failed = dump_to_odc(
        fetcher(document_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        update_if_exists=update_if_exists,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} datasets and failed {failed} datasets.")

    if failed > 0:
        sys.exit(failed)
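Example 6 swaps the inline generator expression used elsewhere for a stream_urls helper. A plausible definition, consistent with how it is consumed here (hypothetical; the real helper ships with the module):

def stream_urls(found):
    # Yield just the URL from each object returned by s3_find_glob,
    # equivalent to the `(o.url for o in s3_obj_stream)` generator in the
    # other examples.
    for o in found:
        yield o.url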
Example 7
def cli(
    skip_lineage,
    fail_on_missing_lineage,
    verify_lineage,
    stac,
    update,
    allow_unsafe,
    skip_check,
    no_sign_request,
    request_payer,
    uri,
    product,
):
    """ Iterate through files in an S3 bucket and add them to datacube"""

    transform = None
    if stac:
        transform = stac_transform

    candidate_products = product.split()

    opts = {}
    if request_payer:
        opts["RequestPayer"] = "requester"

    # Get a generator from the supplied S3 URI for metadata definitions
    fetcher = S3Fetcher(aws_unsigned=no_sign_request)

    # TODO: Share Fetcher
    s3_obj_stream = s3_find_glob(uri,
                                 skip_check=skip_check,
                                 s3=fetcher,
                                 **opts)

    # Extract URLs from the iterator output before passing them to the Fetcher
    s3_url_stream = (o.url for o in s3_obj_stream)

    # TODO: Capture S3 URLs in batches and perform bulk_location_has

    # Check the datacube connection and confirm the requested products exist
    dc = Datacube()
    odc_products = set(dc.list_products().name.values)
    if not set(candidate_products).issubset(odc_products):
        missing_products = list(set(candidate_products) - odc_products)
        print(
            f"Error: requested product(s) {', '.join(missing_products)} "
            f"{'is' if len(missing_products) == 1 else 'are'} not present in the ODC database",
            file=sys.stderr,
        )
        sys.exit(1)

    added, failed = dump_to_odc(
        fetcher(s3_url_stream),
        dc,
        candidate_products,
        skip_lineage=skip_lineage,
        fail_on_missing_lineage=fail_on_missing_lineage,
        verify_lineage=verify_lineage,
        transform=transform,
        update=update,
        allow_unsafe=allow_unsafe,
    )

    print(f"Added {added} Datasets, Failed {failed} Datasets")

    if failed > 0:
        sys.exit(failed)
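Every indexing variant here delegates to dump_to_odc, which is not shown. Its signature can be inferred from the call sites above; a hedged sketch of the overall shape (counting only, not the real indexing logic):

def dump_to_odc(
    document_stream,
    dc,
    products,
    skip_lineage=False,
    fail_on_missing_lineage=False,
    verify_lineage=False,
    transform=None,
    update=False,
    update_if_exists=False,
    allow_unsafe=False,
):
    # Hypothetical sketch inferred from the calls above: index (or update)
    # each fetched document against `dc`, tallying successes and failures.
    added, failed = 0, 0
    for doc in document_stream:
        try:
            # ... resolve the dataset and add/update it in the index here ...
            added += 1
        except Exception:
            failed += 1
    return added, failed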