Example #1
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

       \b
       Download files in a directory that match `*yaml` and store them as a tar:
        > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
          -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
          -s '.*NBART.*' -s '.*/QA/.*' -w 8 --outfile 2018-11-29.tar.gz

    """
    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))
    urls = thredds_find_glob(thredds_catalogue, skips, [select], workers)

    print("Found {0} metadata urls".format(str(len(urls))))

    yamls = download_yamls(urls, workers)

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
Example #2
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

       \b
       Download files in a directory that match `*yaml` and store them as a tar:
        > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
          -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
          -s '.*NBART.*' -s '.*/QA/.*' -w 8 --outfile 2018-11-29.tar.gz

    """

    # extend the crawler's default skip patterns with any user-supplied ones
    user_skips = Crawl.SKIPS + list(skips)

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))
    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets

    print("Found {0} metadata files".format(str(len(results))))

    # construct (guess) the fileserver url based on
    # https://www.unidata.ucar.edu/software/thredds/v4.6/tds/reference/Services.html#HTTP

    parsed_uri = urlparse(thredds_catalogue)

    split_path = parsed_uri.path.split('/')
    fileserver_path = parsed_uri.scheme + '://' + parsed_uri.netloc + '/'.join(
        split_path[:(split_path.index('thredds') + 1)] + ['fileServer', ''])

    parsed_uri = urlparse(fileserver_path)

    # use a threadpool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(partial(download, parsed_uri=parsed_uri), results)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
Example #3
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

       \b
       Download files in a directory that match `*yaml` and store them as a tar:
        > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
          -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
          -s '.*NBART.*' -s '.*/QA/.*' -w 8 --outfile 2018-11-29.tar.gz

    """

    # extend the crawler's default skip patterns with any user-supplied ones
    user_skips = Crawl.SKIPS + list(skips)

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))
    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets

    print("Found {0} metadata files".format(str(len(results))))

    # keep only the HTTPServer (direct download) endpoint for each dataset
    urls = [
        service['url'] for dataset in results for service in dataset.services
        if service['service'].lower() == 'httpserver'
    ]

    # use a threadpool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(download, urls)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
Example #4
def cli(account_url: str,
        container_name: str,
        credential: str,
        prefix: str,
        suffix: str,
        workers: int,
        outfile: str):

    print(f"Opening AZ Container {container_name} on {account_url}")
    print(f"Searching on prefix '{prefix}' for files matching suffix '{suffix}'")
    yaml_urls = find_blobs(account_url, container_name, credential, prefix, suffix)

    print(f"Found {len(yaml_urls)} datasets")
    yamls = download_yamls(account_url, container_name, credential, yaml_urls, workers)

    url_prefix = (account_url + "/" + container_name + "/")[len("https://") :]

    # jam it all in a tar
    tar_opts = dict(name=outfile, mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=url_prefix + yaml[1])

    print("Done!")
Example #5
def cli(n, verbose, gzip, xz, outfile, no_sign_request=None, request_payer=False):
    """ Fetch a bunch of s3 files into a tar archive.

    \b
    For every non-empty line in stdin:
       - Treat the line as a URI and fetch the document it points to
       - Write the content of the file to the tar archive using `bucket-name/path/to/file` as the file name
    """

    opts = {}
    if request_payer:
        opts['RequestPayer'] = 'requester'

    logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', level=logging.ERROR)

    nconnections = 24 if n is None else n
    exit_early = False

    def dump_to_tar(data_stream, tar):
        nonlocal exit_early
        fps = RateEstimator()

        for d in data_stream:
            fps()
            fname = d.url[5:]  # strip the leading 's3://' from the URI

            if d.data is not None:
                if verbose:
                    if fps.every(10):
                        print('.', file=stderr, end='', flush=True)

                    if fps.every(100):
                        print(' {}'.format(str(fps)), file=stderr)

                add_txt_file(tar, fname, d.data, last_modified=d.last_modified)
            else:
                print("Failed %s (%s)" % (d.url, str(d.error)),
                      file=stderr)

            if exit_early:
                break

        if verbose:
            print(' {}'.format(str(fps)), file=stderr)

    fetcher = S3Fetcher(nconcurrent=nconnections, aws_unsigned=no_sign_request)
    is_pipe = outfile == '-'
    tar_opts = dict(mode='w' + tar_mode(gzip=gzip, xz=xz, is_pipe=is_pipe))
    if is_pipe:
        if stdout.isatty():
            click.echo("Will not write to a terminal", err=True)
            sys.exit(1)
        # TODO: on windows switch stdout to binary mode
        tar_opts['fileobj'] = stdout.buffer
    else:
        tar_opts['name'] = outfile

    urls = read_stdin_lines(skip_empty=True)

    def on_ctrlc(sig, frame):
        nonlocal exit_early
        print('Shutting down...', file=sys.stderr)
        exit_early = True

    signal.signal(signal.SIGINT, on_ctrlc)

    with tarfile.open(**tar_opts) as tar:
        dump_to_tar(fetcher(urls, **opts), tar)

    fetcher.close()
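
`RateEstimator` is used here (and in the next example) for progress reporting, but its source is not included. A plausible sketch of the interface the call sites rely on (`__call__` to count an item, `every(n)` for periodic output, and a readable `str()`) is:

import time

class RateEstimator:
    def __init__(self):
        self.n = 0
        self.t0 = time.time()

    def __call__(self):
        # count one processed item
        self.n += 1

    def every(self, k):
        # True on every k-th processed item
        return self.n > 0 and self.n % k == 0

    def __str__(self):
        elapsed = max(time.time() - self.t0, 1e-6)
        return '{:d} files in {:.1f}s ({:.1f}/s)'.format(self.n, elapsed, self.n / elapsed)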
Example #6
def cli(input_fname,
        env,
        product_names,
        exclude_product_names,
        auto_add_lineage,
        verify_lineage,
        ignore_lineage,
        update,
        eo3,
        gzip,
        xz,
        protocol):

    # Ensure the prefix ends in '://' ('file' URIs additionally need a third slash)
    prefix = protocol.rstrip('://') + '://'
    if prefix.startswith('file'):
        prefix = prefix + '/'

    if ignore_lineage:
        auto_add_lineage = False

    if eo3:
        verify_lineage = False
        auto_add_lineage = False

    ds_resolve_args = dict(products=product_names,
                           exclude_products=exclude_product_names,
                           fail_on_missing_lineage=not auto_add_lineage,
                           verify_lineage=verify_lineage,
                           skip_lineage=ignore_lineage)

    doc_transform = prep_eo3 if eo3 else None
    allowed_changes = {(): allow_any}

    def mk_uri(name):
        return prefix + name

    def report_error(msg):
        print(msg, file=sys.stderr)

    def process_file(filename, index, fps, mode=None, n_failed=0, doc_transform=None):
        for ds, err in from_tar_file(filename, index, mk_uri, doc_transform=doc_transform, mode=mode, **ds_resolve_args):
            if ds is not None:
                try:
                    if update:
                        index.datasets.update(ds, allowed_changes)
                    else:
                        index.datasets.add(ds, with_lineage=auto_add_lineage)

                except Exception as e:
                    n_failed += 1
                    report_error(str(e))
            else:
                n_failed += 1
                report_error(err)

            fps()

            if fps.every(10):
                print('.', end='', flush=True)

            if fps.every(100):
                print(' {} F:{:d}'.format(str(fps), n_failed))

        return n_failed

    dc = datacube.Datacube(env=env)

    if len(input_fname) == 0:
        input_fname = ('-',)

    n_failed = 0
    fps = RateEstimator()
    mode = None

    for filename in input_fname:
        if filename == '-':
            if sys.stdin.isatty():
                report_error("Requesting to read from stdin but not redirecting input?")
                sys.exit(1)
            filename = sys.stdin.buffer
            mode = tar_mode(is_pipe=True, gzip=gzip, xz=xz)

        n_failed = process_file(filename, dc.index, fps, mode=mode, n_failed=n_failed, doc_transform=doc_transform)

    if n_failed > 0:
        report_error("**WARNING** there were failures: {}".format(n_failed))