def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

    \b
    Download files in directory that match `*yaml` and store them as a tar
     > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
       -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
       -s '.*NBART.*' -s '.*/QA/.*'
       -w 8 --outfile 2018-11-29.tar.gz
    """
    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))

    urls = thredds_find_glob(thredds_catalogue, skips, [select], workers)
    print("Found {0} metadata urls".format(str(len(urls))))

    yamls = download_yamls(urls, workers)

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
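# A minimal sketch (not part of the tool) of the same two helpers used directly from
# Python, with the catalogue URL and regexes taken from the docstring example above; the
# (content, filename) shape of each downloaded item is inferred from how `yamls` is
# written into the tar.
def example_thredds_fetch():
    urls = thredds_find_glob("http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/",
                             ['.*NBAR.*', '.*SUPPLEMENTARY.*', '.*NBART.*', '.*/QA/.*'],
                             ['.*ARD-METADATA.yaml'],
                             8)
    yamls = download_yamls(urls, 8)
    for content, fname in yamls:
        print(fname, len(content))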
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

    \b
    Download files in directory that match `*yaml` and store them as a tar
     > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
       -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
       -s '.*NBART.*' -s '.*/QA/.*'
       -w 8 --outfile 2018-11-29.tar.gz
    """
    user_skips = Crawl.SKIPS + list(skips)

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))

    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets

    print("Found {0} metadata files".format(str(len(results))))

    # construct (guess) the fileServer url based on
    # https://www.unidata.ucar.edu/software/thredds/v4.6/tds/reference/Services.html#HTTP
    parsed_uri = urlparse(thredds_catalogue)
    split_path = parsed_uri.path.split('/')
    fileserver_path = parsed_uri.scheme + '://' + parsed_uri.netloc + '/'.join(
        split_path[:(split_path.index('thredds') + 1)] + ['fileServer', ''])
    parsed_uri = urlparse(fileserver_path)

    # use a thread pool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(partial(download, parsed_uri=parsed_uri), results)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
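# A minimal sketch (not part of the tool) tracing the fileServer URL guess above with the
# catalogue URL from the docstring example; it repeats the same urlparse/split logic and
# shows the prefix the `download` workers are handed.
def example_fileserver_prefix():
    catalogue = "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
    parsed = urlparse(catalogue)
    parts = parsed.path.split('/')
    fileserver = parsed.scheme + '://' + parsed.netloc + '/'.join(
        parts[:parts.index('thredds') + 1] + ['fileServer', ''])
    assert fileserver == "http://dapds00.nci.org.au/thredds/fileServer/"
    return fileserver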
def cli(thredds_catalogue, skips, select, workers, outfile):
    """ Download Metadata from THREDDS server to tarball

    Example:

    \b
    Download files in directory that match `*yaml` and store them as a tar
     > thredds-to-tar -c "http://dapds00.nci.org.au/thredds/catalog/if87/2018-11-29/"
       -t ".*ARD-METADATA.yaml" -s '.*NBAR.*' -s '.*SUPPLEMENTARY.*'
       -s '.*NBART.*' -s '.*/QA/.*'
       -w 8 --outfile 2018-11-29.tar.gz
    """
    user_skips = Crawl.SKIPS + list(skips)

    print("Searching {thredds_catalogue} for matching files".format(
        thredds_catalogue=thredds_catalogue))

    results = Crawl(thredds_catalogue + '/catalog.xml',
                    select=[select],
                    skip=user_skips,
                    workers=workers).datasets

    print("Found {0} metadata files".format(str(len(results))))

    urls = [service['url']
            for dataset in results
            for service in dataset.services
            if service['service'].lower() == 'httpserver']

    # use a thread pool to download from thredds
    pool = ThreadPool(workers)
    yamls = pool.map(download, urls)
    pool.close()
    pool.join()

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=yaml[1])

    print("Done!")
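# The list comprehension above assumes each crawled dataset exposes `services` as a list
# of dicts carrying at least 'service' and 'url' keys, roughly like the illustrative (not
# real) record below; only the HTTPServer entries survive, so `urls` ends up as a flat
# list of direct-download links.
#
#   dataset.services == [
#       {'name': '...', 'service': 'HTTPServer',
#        'url': 'http://dapds00.nci.org.au/thredds/fileServer/.../ARD-METADATA.yaml'},
#       {'name': '...', 'service': 'OPENDAP', 'url': '...'},
#   ]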
def cli(account_url: str,
        container_name: str,
        credential: str,
        prefix: str,
        suffix: str,
        workers: int,
        outfile: str):
    print(f"Opening AZ Container {container_name} on {account_url}")
    print(f"Searching on prefix '{prefix}' for files matching suffix '{suffix}'")
    yaml_urls = find_blobs(account_url, container_name, credential, prefix, suffix)

    print(f"Found {len(yaml_urls)} datasets")
    yamls = download_yamls(account_url, container_name, credential, yaml_urls, workers)

    # strip the scheme so tar members are named <account-host>/<container>/<blob-path>
    url_prefix = (account_url + "/" + container_name + "/")[len("https://"):]

    # jam it all in a tar
    tar_opts = dict(name=outfile,
                    mode='w' + tar_mode(gzip=True, xz=True, is_pipe=False))
    with tarfile.open(**tar_opts) as tar:
        for yaml in yamls:
            add_txt_file(tar=tar, content=yaml[0], fname=url_prefix + yaml[1])

    print("Done!")
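# A minimal sketch (not part of the tool) of the tar member naming above, using an assumed
# example account and container: the https:// scheme is stripped so entries are stored as
# <account-host>/<container>/<blob-path>.
def example_azure_member_name():
    account_url = "https://example.blob.core.windows.net"   # assumed value
    container_name = "datasets"                             # assumed value
    url_prefix = (account_url + "/" + container_name + "/")[len("https://"):]
    assert url_prefix == "example.blob.core.windows.net/datasets/"
    return url_prefix + "path/to/odc-metadata.yaml"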
def cli(n, verbose, gzip, xz, outfile, no_sign_request=None, request_payer=False):
    """ Fetch a bunch of s3 files into a tar archive.

    \b
    For every non-empty line in stdin
       - Treat line as a URI and fetch document from it
       - Write content of the file to a tar archive using `bucket-name/path/to/file` as file name
    """
    opts = {}
    if request_payer:
        opts['RequestPayer'] = 'requester'

    logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                        level=logging.ERROR)

    nconnections = 24 if n is None else n
    exit_early = False

    def dump_to_tar(data_stream, tar):
        nonlocal exit_early
        fps = RateEstimator()

        for d in data_stream:
            fps()
            fname = d.url[5:]   # strip the 's3://' scheme prefix

            if d.data is not None:
                if verbose:
                    if fps.every(10):
                        print('.', file=stderr, end='', flush=True)

                    if fps.every(100):
                        print(' {}'.format(str(fps)), file=stderr)

                add_txt_file(tar, fname, d.data, last_modified=d.last_modified)
            else:
                print("Failed %s (%s)" % (d.url, str(d.error)), file=stderr)

            if exit_early:
                break

        if verbose:
            print(' {}'.format(str(fps)), file=stderr)

    fetcher = S3Fetcher(nconcurrent=nconnections, aws_unsigned=no_sign_request)
    is_pipe = outfile == '-'
    tar_opts = dict(mode='w' + tar_mode(gzip=gzip, xz=xz, is_pipe=is_pipe))

    if is_pipe:
        if stdout.isatty():
            click.echo("Will not write to a terminal", err=True)
            sys.exit(1)
        # TODO: on windows switch stdout to binary mode
        tar_opts['fileobj'] = stdout.buffer
    else:
        tar_opts['name'] = outfile

    urls = read_stdin_lines(skip_empty=True)

    def on_ctrlc(sig, frame):
        nonlocal exit_early
        print('Shutting down...', file=sys.stderr)
        exit_early = True

    signal.signal(signal.SIGINT, on_ctrlc)

    with tarfile.open(**tar_opts) as tar:
        dump_to_tar(fetcher(urls, **opts), tar)

    fetcher.close()
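# A minimal sketch (not part of the tool) of the member naming above: each fetched URL has
# its 's3://' scheme dropped (d.url[5:]), matching the docstring's
# `bucket-name/path/to/file` layout; the URL below is an assumed example.
def example_s3_member_name():
    url = "s3://my-bucket/path/to/ARD-METADATA.yaml"
    assert url[5:] == "my-bucket/path/to/ARD-METADATA.yaml"
    return url[5:]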
def cli(input_fname,
        env,
        product_names,
        exclude_product_names,
        auto_add_lineage,
        verify_lineage,
        ignore_lineage,
        update,
        eo3,
        gzip,
        xz,
        protocol):
    # Normalise the protocol into a URI prefix: make sure '://' is present, and add an
    # extra '/' for file URIs so member paths stay absolute
    prefix = protocol.rstrip('://') + '://'
    if prefix.startswith('file'):
        prefix = prefix + '/'

    if ignore_lineage:
        auto_add_lineage = False

    if eo3:
        verify_lineage = False
        auto_add_lineage = False

    ds_resolve_args = dict(products=product_names,
                           exclude_products=exclude_product_names,
                           fail_on_missing_lineage=not auto_add_lineage,
                           verify_lineage=verify_lineage,
                           skip_lineage=ignore_lineage)

    doc_transform = prep_eo3 if eo3 else None
    allowed_changes = {(): allow_any}

    def mk_uri(name):
        return prefix + name

    def report_error(msg):
        print(msg, file=sys.stderr)

    def process_file(filename, index, fps, mode=None, n_failed=0, doc_transform=None):
        for ds, err in from_tar_file(filename,
                                     index,
                                     mk_uri,
                                     doc_transform=doc_transform,
                                     mode=mode,
                                     **ds_resolve_args):
            if ds is not None:
                try:
                    if update:
                        index.datasets.update(ds, allowed_changes)
                    else:
                        index.datasets.add(ds, with_lineage=auto_add_lineage)
                except Exception as e:
                    n_failed += 1
                    report_error(str(e))
            else:
                n_failed += 1
                report_error(err)

            fps()

            if fps.every(10):
                print('.', end='', flush=True)

            if fps.every(100):
                print(' {} F:{:d}'.format(str(fps), n_failed))

        return n_failed

    dc = datacube.Datacube(env=env)

    if len(input_fname) == 0:
        input_fname = ('-',)

    n_failed = 0
    fps = RateEstimator()
    mode = None

    for filename in input_fname:
        if filename == '-':
            if sys.stdin.isatty():
                report_error("Requesting to read from stdin but not redirecting input?")
                sys.exit(1)
            filename = sys.stdin.buffer
            mode = tar_mode(is_pipe=True, gzip=gzip, xz=xz)

        n_failed = process_file(filename, dc.index, fps,
                                mode=mode,
                                n_failed=n_failed,
                                doc_transform=doc_transform)

    if n_failed > 0:
        report_error("**WARNING** there were failures: {}".format(n_failed))
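# A minimal sketch (not part of the tool) of how tar member names become dataset URIs
# above: the protocol is normalised into a prefix ('file' gains an extra '/' so paths stay
# absolute) and prepended to each member name by mk_uri; the protocol and member name here
# are assumed example values.
def example_dataset_uri():
    protocol = "file"
    prefix = protocol.rstrip('://') + '://'
    if prefix.startswith('file'):
        prefix = prefix + '/'
    member = "my-bucket/path/to/ARD-METADATA.yaml"
    assert prefix + member == "file:///my-bucket/path/to/ARD-METADATA.yaml"
    return prefix + member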