def find_dupes_to_archive(dc, time, keep_threshold=0.05, freq="m"):
    """Find duplicated s2_l2a datasets and decide which ones to archive.

    Datasets are grouped by ``(center_time, region_code)``. Within each group of
    duplicates, footprint overlap decides the outcome: groups whose footprints
    differ by more than ``keep_threshold`` are kept for review, otherwise all but
    the last dataset (ordered by label) are marked for archiving.

    Returns a tuple of (ids to archive, groups kept for review, total dataset count).
    """
    ds_s2_order = lambda ds: (
        ds.center_time,
        ds.metadata.region_code,
        ds.metadata_doc["label"],
    )

    query = dict(product="s2_l2a", time=time)

    n_total = dataset_count(dc.index, **query)
    dss = ordered_dss(dc, key=ds_s2_order, freq=freq, **query)
    dss = tqdm(dss, total=n_total)

    # Only keep groups with more than one dataset, i.e. actual duplicates
    groups = (
        group
        for group in toolz.partitionby(
            lambda ds: (ds.center_time, ds.metadata.region_code), dss
        )
        if len(group) > 1
    )

    keep_groups = []
    to_archive = []

    for dss_group in groups:
        a_or, a_and = overlap_info(dss_group)
        # aa is in range [0, 1] with
        #   0 -- 100% overlap across dupes
        #   1 -- 0% overlap across dupes (disjoint footprints)
        aa = (a_or - a_and) / a_or
        if aa > keep_threshold:
            keep_groups.append(dss_group)
        else:
            to_archive.extend(ds.id for ds in dss_group[:-1])

    return to_archive, keep_groups, n_total
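
# Usage sketch (illustrative, not part of the original source). The environment
# name, time range and the wrapper function below are assumptions, included only
# to show how the return values fit together. `dc.index.datasets.archive()` is
# the standard datacube index call for archiving datasets by id; it is left
# commented out so nothing is archived by accident.
def _example_archive_s2_dupes():
    import datacube

    dc = datacube.Datacube(env="default")  # assumed environment name
    to_archive, keep_groups, n_total = find_dupes_to_archive(dc, time="2021-01")

    print(f"Scanned {n_total:,d} datasets")
    print(f"{len(to_archive):,d} to archive, {len(keep_groups):,d} groups kept for review")

    # dc.index.datasets.archive(to_archive)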
def _get_dss(
    self,
    dc: Datacube,
    product: str,
    msg: Callable[[str], Any],
    temporal_range: Optional[DateTimeRange] = None,
    tiles: Optional[TilesRange2d] = None,
):
    """
    Returns a tuple containing:
      - a generator of datasets
      - the number of datasets in the generator
      - a config dictionary containing the product, temporal range, tiles,
        and the datacube query used

    Returns ``False`` instead if no datasets match the query.
    """
    cfg: Dict[str, Any] = dict(
        grid=self._grid,
        freq=self._frequency,
    )

    query = dict(product=product)

    if tiles is not None:
        (x0, x1), (y0, y1) = tiles
        msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
        cfg["tiles"] = tiles
        query["geopolygon"] = gs_bounds(self._gridspec, tiles)

    if temporal_range is not None:
        query.update(
            temporal_range.dc_query(pad=0.6)
        )  # pad a bit more than half a day on each side
        cfg["temporal_range"] = temporal_range.short

    cfg["query"] = sanitize_query(query)

    if DatasetCache.exists(self._output) and self._overwrite is False:
        raise ValueError(f"File database already exists: {self._output}")

    msg("Connecting to the database, counting datasets")
    n_dss = dataset_count(dc.index, **query)
    if n_dss == 0:
        msg("Found no datasets to process")
        return False

    msg(f"Processing {n_dss:,d} datasets")

    if "time" in query:
        dss = chopped_dss(dc, freq="w", **query)
    else:
        if len(query) == 1:
            dss = all_datasets(dc, **query)
        else:
            # note: this blocks for large result sets
            dss = dc.find_datasets_lazy(**query)

    return dss, n_dss, cfg
def _update_info_count(self):
    s = self._state
    spatial_query = s.bounds
    s.count = dataset_count(
        self._dc.index, product=s.product, time=s.time, **spatial_query
    )
    self._gui.info.value = "{:,d} datasets in view".format(s.count)
def save(
    self,
    dc: Datacube,
    product: str,
    temporal_range: Union[str, DateTimeRange, None] = None,
    tiles: Optional[TilesRange2d] = None,
    msg: Optional[Callable[[str], Any]] = None,
    debug: bool = False,
) -> bool:
    """Find datasets, bin them into tasks and save everything to the output
    file database. Returns True on success, False if no datasets were found.
    """
    dt_range = SimpleNamespace(start=None, end=None)

    def _update_start_end(x, out):
        if out.start is None:
            out.start = x
            out.end = x
        else:
            out.start = min(out.start, x)
            out.end = max(out.end, x)

    def persist(ds: Dataset) -> CompressedDataset:
        _ds = compress_ds(ds)
        _update_start_end(_ds.time, dt_range)
        return _ds

    def msg_default(msg):
        pass

    if msg is None:
        msg = msg_default

    if isinstance(temporal_range, str):
        temporal_range = DateTimeRange(temporal_range)

    cfg: Dict[str, Any] = dict(
        grid=self._grid,
        freq=self._frequency,
    )

    query = dict(product=product)

    if tiles is not None:
        (x0, x1), (y0, y1) = tiles
        msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
        cfg["tiles"] = tiles
        query["geopolygon"] = gs_bounds(self._gridspec, tiles)

    # TODO: properly handle UTC offset when limiting query to a given temporal_range.
    #       Basically need to pad the query by 12 hours, then trim datasets post-query.
    if temporal_range is not None:
        query.update(
            temporal_range.dc_query(pad=0.6)
        )  # pad a bit more than half a day on each side
        cfg["temporal_range"] = temporal_range.short

    cfg["query"] = sanitize_query(query)

    if DatasetCache.exists(self._output) and self._overwrite is False:
        raise ValueError(f"File database already exists: {self._output}")

    msg("Connecting to the database, counting datasets")
    n_dss = dataset_count(dc.index, **query)
    if n_dss == 0:
        msg("Found no datasets to process")
        return False

    msg(f"Processing {n_dss:,d} datasets")

    msg("Training compression dictionary")
    zdict = dictionary_from_product_list(dc, [product], samples_per_product=100)
    msg(".. done")

    cache = DatasetCache.create(
        self._output,
        zdict=zdict,
        complevel=self._complevel,
        truncate=self._overwrite,
    )
    cache.add_grid(self._gridspec, self._grid)
    cache.append_info_dict("stats/", dict(config=cfg))

    cells: Dict[Tuple[int, int], Any] = {}
    if "time" in query:
        dss = chopped_dss(dc, freq="w", **query)
    else:
        if len(query) == 1:
            dss = all_datasets(dc, **query)
        else:
            # note: this blocks for large result sets
            dss = dc.find_datasets_lazy(**query)

    # Stream datasets: write them to the cache and bin them into grid cells as they go
    dss = cache.tee(dss)
    dss = bin_dataset_stream(self._gridspec, dss, cells, persist=persist)
    dss = tqdm(dss, total=n_dss)

    rr = ds_stream_test_func(dss)
    msg(rr.text)

    if tiles is not None:
        # prune out tiles that were not requested
        cells = {
            tidx: dss for tidx, dss in cells.items() if is_tile_in(tidx, tiles)
        }

    n_tiles = len(cells)
    msg(f"Total of {n_tiles:,d} spatial tiles")

    # Group binned cells into tasks according to the configured temporal frequency
    if self._frequency == "all":
        tasks = bin_full_history(cells, start=dt_range.start, end=dt_range.end)
    elif self._frequency == "seasonal":
        tasks = bin_seasonal(cells, months=3, anchor=12)
    elif temporal_range is not None:
        tasks = bin_generic(cells, [temporal_range])
    else:
        tasks = bin_annual(cells)

    tasks_uuid = {k: [ds.id for ds in dss] for k, dss in tasks.items()}

    msg(f"Saving tasks to disk ({len(tasks)})")
    cache.add_grid_tiles(self._grid, tasks_uuid)
    msg(".. done")

    csv_path = self.out_path(".csv")
    msg(f"Writing summary to {csv_path}")
    with open(csv_path, "wt") as f:
        f.write('"T","X","Y","datasets","days"\n')

        for p, x, y in sorted(tasks):
            dss = tasks[(p, x, y)]
            n_dss = len(dss)
            n_days = len(set(ds.time.date() for ds in dss))
            line = f'"{p}", {x:+05d}, {y:+05d}, {n_dss:4d}, {n_days:4d}\n'
            f.write(line)

    msg("Dumping GeoJSON(s)")
    grid_info = compute_grid_info(
        cells, resolution=max(self._gridspec.tile_size) / 4
    )
    tasks_geo = gjson_from_tasks(tasks, grid_info)
    for temporal_range, gjson in tasks_geo.items():
        fname = self.out_path(f"-{temporal_range}.geojson")
        msg(f"..writing to {fname}")
        with open(fname, "wt") as f:
            json.dump(gjson, f)

    if debug:
        pkl_path = self.out_path("-cells.pkl")
        msg(f"Saving debug info to: {pkl_path}")
        with open(pkl_path, "wb") as fb:
            pickle.dump(cells, fb)

        pkl_path = self.out_path("-tasks.pkl")
        msg(f"Saving debug info to: {pkl_path}")
        with open(pkl_path, "wb") as fb:
            pickle.dump(tasks, fb)

    return True
def cli(env, grid, year, output, products, complevel):
    """Extract product(s) to an on disk cache.

    Optionally tile datasets into a grid while extracting (see --grid option).
    """
    if len(products) == 0:
        click.echo("Have to supply at least one product")
        raise click.Abort()

    dc = datacube.Datacube(env=env)
    all_prods = {p.name: p for p in dc.index.products.get_all()}

    if len(products) == 1 and products[0].lower() in (":all:", "*"):
        click.echo("Will read all products")
        products = list(all_prods)

    for p in products:
        if p not in all_prods:
            click.echo("No such product found: %s" % p)
            raise click.Abort()

    query = {}
    if year is not None:
        query.update(time=f"{year}")

    click.echo("Getting dataset counts")
    counts = {p: dataset_count(dc.index, product=p, **query) for p in products}

    n_total = 0
    for p, c in counts.items():
        click.echo("..{}: {:8,d}".format(p, c))
        n_total += c

    if n_total == 0:
        click.echo("No datasets found")
        raise click.Abort()

    click.echo("Training compression dictionary")
    zdict = dictionary_from_product_list(dc, products, samples_per_product=50, query=query)
    click.echo("..done")

    # TODO: check for overwrite
    cache = dscache.create_cache(output, zdict=zdict, complevel=complevel, truncate=True)

    raw2ds = mk_raw2ds(all_prods)

    def db_task(products, conn, q):
        for p in products:
            if len(query) == 0:
                dss = map(raw2ds, raw_dataset_stream(p, conn))
            else:
                dss = ordered_dss(dc, product=p, **query)

            for ds in dss:
                q.put(ds)
        q.put(EOS)

    conn = db_connect(cfg=env)
    q = queue.Queue(maxsize=10_000)
    db_thread = Thread(target=db_task, args=(products, conn, q))
    db_thread.start()

    dss = qmap(lambda ds: ds, q, eos_marker=EOS)
    dss = cache.tee(dss)

    cells = {}
    if grid is not None:
        gs = parse_gridspec(grid)
        # TODO: for named gridspecs should we use the name as group_prefix?
        group_prefix = f"epsg{gs.crs.epsg:d}"
        cache.add_grid(gs, group_prefix)
        dss = bin_dataset_stream(gs, dss, cells)

    label = "Processing ({:8,d})".format(n_total)
    with click.progressbar(dss, label=label, length=n_total) as dss:
        for _ in dss:
            pass

    if grid is not None:
        click.echo("Total bins: {:d}".format(len(cells)))

        with click.progressbar(cells.values(), length=len(cells), label="Saving") as groups:
            for group in groups:
                cache.add_grid_tile(group_prefix, group.idx, group.dss)

    db_thread.join()
    cache.close()
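
# Usage sketch (illustrative, not part of the original source), driving the
# command through click's test runner. The click decorators are not shown
# above, so the option spellings and values below are assumptions; only the
# parameter names (env, grid, year, output, products, complevel) come from the
# function signature.
def _example_run_cli():
    from click.testing import CliRunner

    runner = CliRunner()
    result = runner.invoke(
        cli,
        ["--env", "default", "--year", "2020", "--grid", "albers_au_25", "output.db", "s2_l2a"],
    )
    print(result.output)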