def cli(product, input_prefix, location, verbose):
    """Dump stats data into a cache file.

    Note: The input bucket must be public, otherwise the data cannot be listed.
    """
    product = product_from_yaml(product)
    dss = s3_fetch_dss(input_prefix, product, glob="*.json")

    if verbose:
        print(f"Writing {location}/{product.name}.db")

    cache = create_cache(f"{location}/{product.name}.db")
    cache.bulk_save(dss)
    if verbose:
        print(f"Found {cache.count:,d} datasets")
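# Illustrative sketch (not part of the tool above): reading the cache file
# back in a later step. It assumes the read-only opener `open_ro` from
# odc.dscache; `cache.get_all()` and `cache.count` are the same accessors used
# by the mosaic command below. The path is a hypothetical example.
def _example_read_cache(cache_path="example-product.db"):
    from odc.dscache import open_ro  # assumed import location

    cache = open_ro(cache_path)
    print(f"Cache holds {cache.count:,d} datasets")
    for ds in cache.get_all():
        print(ds.id)  # each entry is a reconstructed datacube Dataset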
def cli(product, input_prefix, location, verbose):
    """Generate mosaic overviews of the stats data.

    An intermediate cache file is generated and stored in the output location
    during this process.

    Note: The input bucket must be public, otherwise the data cannot be listed.
    """
    product = product_from_yaml(product)
    if verbose:
        print(f"Preparing mosaics for {product.name} product")

    dss = s3_fetch_dss(input_prefix, product, glob="*.json")

    if verbose:
        print(f"Writing {location}/{product.name}.db")

    cache = create_cache(f"{location}/{product.name}.db")
    cache.bulk_save(dss)
    if verbose:
        print(f"Found {cache.count:,d} datasets")

    dc = Datacube()
    dss = list(cache.get_all())
    xx = dc.load(
        datasets=dss,
        dask_chunks={"x": 3200, "y": 3200},
        resolution=(-120, 120),
        measurements=["red", "green", "blue"],
    )
    save(xx, location, product.name, verbose)
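# Hypothetical sketch of the `save` helper called above; the real
# implementation lives elsewhere in this repo. This version assumes
# datacube.utils.cog.write_cog and writes the RGB mosaic as a single
# Cloud Optimized GeoTIFF under `location`. The output file name is made up.
def _example_save(xx, location, name, verbose=False):
    from datacube.utils.cog import write_cog

    fname = f"{location}/{name}-mosaic.tif"  # hypothetical naming scheme
    if verbose:
        print(f"Writing {fname}")

    # Stack red/green/blue into one (band, y, x) DataArray, dropping any
    # singleton time dimension, then write with overviews for fast display.
    im = xx.to_array(dim="band").squeeze()
    write_cog(im, fname, overwrite=True, overview_resampling="average")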
def cli(env, grid, year, output, products, complevel):
    """Extract product(s) to an on disk cache.

    Optionally tile datasets into a grid while extracting (see --grid option)
    """
    if len(products) == 0:
        click.echo("Have to supply at least one product")
        raise click.Abort()

    dc = datacube.Datacube(env=env)
    all_prods = {p.name: p for p in dc.index.products.get_all()}

    if len(products) == 1 and products[0].lower() in (":all:", "*"):
        click.echo("Will read all products")
        products = list(all_prods)

    for p in products:
        if p not in all_prods:
            click.echo("No such product found: %s" % p)
            raise click.Abort()

    query = {}
    if year is not None:
        query.update(time=f"{year}")

    click.echo("Getting dataset counts")
    counts = {p: dataset_count(dc.index, product=p, **query) for p in products}

    n_total = 0
    for p, c in counts.items():
        click.echo("..{}: {:8,d}".format(p, c))
        n_total += c

    if n_total == 0:
        click.echo("No datasets found")
        raise click.Abort()

    click.echo("Training compression dictionary")
    zdict = dictionary_from_product_list(dc, products, samples_per_product=50, query=query)
    click.echo("..done")

    # TODO: check for overwrite
    cache = dscache.create_cache(output, zdict=zdict, complevel=complevel, truncate=True)

    raw2ds = mk_raw2ds(all_prods)

    def db_task(products, conn, q):
        for p in products:
            if len(query) == 0:
                dss = map(raw2ds, raw_dataset_stream(p, conn))
            else:
                dss = ordered_dss(dc, product=p, **query)

            for ds in dss:
                q.put(ds)
        q.put(EOS)

    conn = db_connect(cfg=env)
    q = queue.Queue(maxsize=10_000)
    db_thread = Thread(target=db_task, args=(products, conn, q))
    db_thread.start()

    dss = qmap(lambda ds: ds, q, eos_marker=EOS)
    dss = cache.tee(dss)

    cells = {}
    if grid is not None:
        gs = parse_gridspec(grid)
        # TODO: for named gridspecs should we use the name as group_prefix?
        group_prefix = f"epsg{gs.crs.epsg:d}"
        cache.add_grid(gs, group_prefix)
        dss = bin_dataset_stream(gs, dss, cells)

    label = "Processing ({:8,d})".format(n_total)
    with click.progressbar(dss, label=label, length=n_total) as dss:
        for _ in dss:
            pass

    if grid is not None:
        click.echo("Total bins: {:d}".format(len(cells)))

        with click.progressbar(cells.values(), length=len(cells), label="Saving") as groups:
            for group in groups:
                cache.add_grid_tile(group_prefix, group.idx, group.dss)

    db_thread.join()
    cache.close()
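# Illustrative only: a minimal generator in the spirit of the `qmap` helper
# used above. It drains the queue filled by `db_task`, applying `proc` to each
# dataset until the EOS sentinel arrives, which is what lets the main thread
# stream datasets into the cache while the DB thread keeps producing.
def _example_qmap(proc, q, eos_marker):
    while True:
        item = q.get()
        if item is eos_marker:
            return
        yield proc(item)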
def cli(env, output, products, complevel):
    """Extract product(s) to an on disk cache."""
    if len(products) == 0:
        click.echo('Have to supply at least one product')
        raise click.Abort()

    dc = datacube.Datacube(env=env)
    all_prods = {p.name: p for p in dc.index.products.get_all()}

    if len(products) == 1 and products[0].lower() in (':all:', '*'):
        click.echo('Will read all products')
        products = list(all_prods)

    for p in products:
        if p not in all_prods:
            click.echo('No such product found: %s' % p)
            raise click.Abort()

    click.echo('Getting dataset counts')
    counts = {
        p.name: count
        for p, count in dc.index.datasets.count_by_product(product=[p for p in products])
    }

    n_total = 0
    for p, c in counts.items():
        click.echo('..{}: {:8,d}'.format(p, c))
        n_total += c

    if n_total == 0:
        click.echo("No datasets found")
        raise click.Abort()

    click.echo('Training compression dictionary')
    zdict = dictionary_from_product_list(dc, products, samples_per_product=50)
    click.echo('..done')

    # TODO: check for overwrite
    cache = dscache.create_cache(output, zdict=zdict, complevel=complevel, truncate=True)

    raw2ds = mk_raw2ds(all_prods)

    def db_task(products, conn, q):
        for p in products:
            for ds in map(raw2ds, raw_dataset_stream(p, conn)):
                q.put(ds)
        q.put(EOS)

    conn = db_connect(cfg=env)
    q = queue.Queue(maxsize=10_000)
    db_thread = Thread(target=db_task, args=(products, conn, q))
    db_thread.start()

    dss = qmap(lambda ds: ds, q, eos_marker=EOS)
    dss = cache.tee(dss)

    label = 'Processing ({:8,d})'.format(n_total)
    with click.progressbar(dss, label=label, length=n_total) as dss:
        for _ in dss:
            pass

    db_thread.join()
    cache.close()
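# Illustrative only: `dictionary_from_product_list` above trains a shared
# compression dictionary so the many small, similar dataset documents in the
# cache compress well. A rough stand-alone equivalent using the `zstandard`
# package (sample documents, dictionary size and helper name are made up):
def _example_train_zdict(sample_docs, dict_size=8 * 1024, complevel=6):
    import zstandard

    samples = [doc.encode("utf-8") for doc in sample_docs]
    zdict = zstandard.train_dictionary(dict_size, samples)

    # A compressor primed with the shared dictionary, similar in spirit to
    # how the cache compresses each dataset document it stores.
    comp = zstandard.ZstdCompressor(level=complevel, dict_data=zdict)
    return zdict.as_bytes(), comp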