Example 1
def find_dupes_to_archive(dc, time, keep_threshold=0.05, freq="m"):
    ds_s2_order = lambda ds: (
        ds.center_time,
        ds.metadata.region_code,
        ds.metadata_doc["label"],
    )

    query = dict(product="s2_l2a", time=time)
    n_total = dataset_count(dc.index, **query)
    dss = ordered_dss(dc, key=ds_s2_order, freq=freq, **query)

    dss = tqdm(dss, total=n_total)
    groups = (group for group in toolz.partitionby(
        lambda ds: (ds.center_time, ds.metadata.region_code), dss)
              if len(group) > 1)

    keep_groups = []
    to_archive = []

    for dss_group in groups:
        a_or, a_and = overlap_info(dss_group)
        # aa is in range [0, 1] with
        #  0 -- 100% overlap across dupes
        #  1 -- 0% overlap across dupes (disjoint footprints)
        aa = (a_or - a_and) / a_or
        if aa > keep_threshold:
            keep_groups.append(dss_group)
        else:
            to_archive.extend(ds.id for ds in dss_group[:-1])

    return to_archive, keep_groups, n_total
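
A minimal usage sketch for the function above. It assumes a configured Datacube index and that the helpers it relies on (ordered_dss, dataset_count, overlap_info, toolz, tqdm) are importable from the surrounding project; the time value is hypothetical.

# Minimal usage sketch (hypothetical time range; assumes a configured Datacube index).
import datacube

dc = datacube.Datacube()

# Look for duplicate s2_l2a datasets over January 2020; groups whose footprints
# differ by more than 5% are kept for review instead of being archived.
to_archive, keep_groups, n_total = find_dupes_to_archive(dc, time="2020-01",
                                                         keep_threshold=0.05)
print(f"{len(to_archive):,d} of {n_total:,d} datasets flagged for archiving")

# Archiving itself would then go through the index API, e.g.:
# dc.index.datasets.archive(to_archive)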
Example 2
    def _get_dss(
        self,
        dc: Datacube,
        product: str,
        msg: Callable[[str], Any],
        temporal_range: Optional[DateTimeRange] = None,
        tiles: Optional[TilesRange2d] = None,
    ):
        """
        This returns a tuple containing:
        - a generator of datasets
        - the number of datasets in the generator
        - a config dictionary containing the product, temporal range, tiles, and the datacube query used
        """

        cfg: Dict[str, Any] = dict(
            grid=self._grid,
            freq=self._frequency,
        )

        query = dict(product=product)

        if tiles is not None:
            (x0, x1), (y0, y1) = tiles
            msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
            cfg["tiles"] = tiles
            query["geopolygon"] = gs_bounds(self._gridspec, tiles)

        if temporal_range is not None:
            query.update(
                temporal_range.dc_query(pad=0.6)
            )  # pad a bit more than half a day on each side
            cfg["temporal_range"] = temporal_range.short

        cfg["query"] = sanitize_query(query)

        if DatasetCache.exists(self._output) and self._overwrite is False:
            raise ValueError(f"File database already exists: {self._output}")

        msg("Connecting to the database, counting datasets")
        n_dss = dataset_count(dc.index, **query)
        if n_dss == 0:
            msg("Found no datasets to process")
            return False

        msg(f"Processing {n_dss:,d} datasets")

        if "time" in query:
            dss = chopped_dss(dc, freq="w", **query)
        else:
            if len(query) == 1:
                dss = all_datasets(dc, **query)
            else:
                # note: this blocks for large result sets
                dss = dc.find_datasets_lazy(**query)

        return dss, n_dss, cfg
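
A sketch of how a caller might consume the triple returned above; the instance name task_saver is purely illustrative, since the enclosing class is not shown in this excerpt.

# Illustrative only: task_saver stands in for an instance of the enclosing class.
result = task_saver._get_dss(dc, "s2_l2a", msg=print)
if result is False:
    print("Nothing to do")
else:
    dss, n_dss, cfg = result        # lazy dataset stream, dataset count, query config
    for ds in dss:
        ...                         # process each datacube Dataset lazily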
Example 3
    def _update_info_count(self):
        s = self._state
        spatial_query = s.bounds

        s.count = dataset_count(self._dc.index,
                                product=s.product,
                                time=s.time,
                                **spatial_query)
        self._gui.info.value = "{:,d} datasets in view".format(s.count)
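
The spatial query held in s.bounds is expanded straight into dataset_count keyword arguments, so it is expected to be a dict of regular Datacube search terms. A hypothetical state object, for illustration only:

# Hypothetical state object; lon/lat ranges are standard Datacube spatial search terms.
from types import SimpleNamespace

s = SimpleNamespace(
    product="s2_l2a",
    time=("2020-01-01", "2020-06-30"),
    bounds=dict(lon=(145.0, 146.0), lat=(-35.0, -34.0)),
    count=0,
)
# equivalent call:
# dataset_count(dc.index, product=s.product, time=s.time, **s.bounds)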
Example 4
    def save(
        self,
        dc: Datacube,
        product: str,
        temporal_range: Union[str, DateTimeRange, None] = None,
        tiles: Optional[TilesRange2d] = None,
        msg: Optional[Callable[[str], Any]] = None,
        debug: bool = False,
    ) -> bool:

        dt_range = SimpleNamespace(start=None, end=None)

        def _update_start_end(x, out):
            if out.start is None:
                out.start = x
                out.end = x
            else:
                out.start = min(out.start, x)
                out.end = max(out.end, x)

        def persist(ds: Dataset) -> CompressedDataset:
            _ds = compress_ds(ds)
            _update_start_end(_ds.time, dt_range)
            return _ds

        def msg_default(msg):
            pass

        if msg is None:
            msg = msg_default

        if isinstance(temporal_range, str):
            temporal_range = DateTimeRange(temporal_range)

        cfg: Dict[str, Any] = dict(
            grid=self._grid,
            freq=self._frequency,
        )

        query = dict(product=product)

        if tiles is not None:
            (x0, x1), (y0, y1) = tiles
            msg(f"Limit search to tiles: x:[{x0}, {x1}) y:[{y0}, {y1})")
            cfg["tiles"] = tiles
            query["geopolygon"] = gs_bounds(self._gridspec, tiles)

        # TODO: properly handle UTC offset when limiting query to a given time temporal_range
        #       Basically need to pad query by 12hours, then trim datasets post-query
        if temporal_range is not None:
            query.update(temporal_range.dc_query(
                pad=0.6))  # pad a bit more than half a day on each side
            cfg["temporal_range"] = temporal_range.short

        cfg["query"] = sanitize_query(query)

        if DatasetCache.exists(self._output) and self._overwrite is False:
            raise ValueError(f"File database already exists: {self._output}")

        msg("Connecting to the database, counting datasets")
        n_dss = dataset_count(dc.index, **query)
        if n_dss == 0:
            msg("Found no datasets to process")
            return False

        msg(f"Processing {n_dss:,d} datasets")

        msg("Training compression dictionary")
        zdict = dictionary_from_product_list(dc, [product],
                                             samples_per_product=100)
        msg(".. done")

        cache = DatasetCache.create(
            self._output,
            zdict=zdict,
            complevel=self._complevel,
            truncate=self._overwrite,
        )
        cache.add_grid(self._gridspec, self._grid)
        cache.append_info_dict("stats/", dict(config=cfg))

        cells: Dict[Tuple[int, int], Any] = {}
        if "time" in query:
            dss = chopped_dss(dc, freq="w", **query)
        else:
            if len(query) == 1:
                dss = all_datasets(dc, **query)
            else:
                # note: this blocks for large result sets
                dss = dc.find_datasets_lazy(**query)

        dss = cache.tee(dss)
        dss = bin_dataset_stream(self._gridspec, dss, cells, persist=persist)
        dss = tqdm(dss, total=n_dss)

        rr = ds_stream_test_func(dss)
        msg(rr.text)

        if tiles is not None:
            # prune out tiles that were not requested
            cells = {
                tidx: dss
                for tidx, dss in cells.items() if is_tile_in(tidx, tiles)
            }

        n_tiles = len(cells)
        msg(f"Total of {n_tiles:,d} spatial tiles")

        if self._frequency == "all":
            tasks = bin_full_history(cells,
                                     start=dt_range.start,
                                     end=dt_range.end)
        elif self._frequency == "seasonal":
            tasks = bin_seasonal(cells, months=3, anchor=12)
        elif temporal_range is not None:
            tasks = bin_generic(cells, [temporal_range])
        else:
            tasks = bin_annual(cells)

        tasks_uuid = {k: [ds.id for ds in dss] for k, dss in tasks.items()}

        msg(f"Saving tasks to disk ({len(tasks)})")
        cache.add_grid_tiles(self._grid, tasks_uuid)
        msg(".. done")

        csv_path = self.out_path(".csv")
        msg(f"Writing summary to {csv_path}")
        with open(csv_path, "wt") as f:
            f.write('"T","X","Y","datasets","days"\n')

            for p, x, y in sorted(tasks):
                dss = tasks[(p, x, y)]
                n_dss = len(dss)
                n_days = len(set(ds.time.date() for ds in dss))
                line = f'"{p}", {x:+05d}, {y:+05d}, {n_dss:4d}, {n_days:4d}\n'
                f.write(line)

        msg("Dumping GeoJSON(s)")
        grid_info = compute_grid_info(
            cells, resolution=max(self._gridspec.tile_size) / 4)
        tasks_geo = gjson_from_tasks(tasks, grid_info)
        for temporal_range, gjson in tasks_geo.items():
            fname = self.out_path(f"-{temporal_range}.geojson")
            msg(f"..writing to {fname}")
            with open(fname, "wt") as f:
                json.dump(gjson, f)

        if debug:
            pkl_path = self.out_path("-cells.pkl")
            msg(f"Saving debug info to: {pkl_path}")
            with open(pkl_path, "wb") as fb:
                pickle.dump(cells, fb)

            pkl_path = self.out_path("-tasks.pkl")
            msg(f"Saving debug info to: {pkl_path}")
            with open(pkl_path, "wb") as fb:
                pickle.dump(tasks, fb)

        return True
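
A usage sketch for save(). The enclosing class and its constructor are not shown in this excerpt, so SaveTasks and its constructor arguments below are purely illustrative stand-ins, and the temporal_range syntax is an assumption about what DateTimeRange accepts.

# Illustrative only: SaveTasks stands in for the enclosing class, and the
# constructor arguments are guesses at what self._output, self._grid and
# self._frequency map to.
import datacube

dc = datacube.Datacube()
task_saver = SaveTasks(output="s2_l2a_2020.db", grid="africa_10", frequency="annual")

ok = task_saver.save(
    dc,
    "s2_l2a",
    temporal_range="2020-01--P12M",  # assumed start--period string parsed by DateTimeRange
    tiles=((10, 20), (30, 40)),      # half-open x and y tile ranges
    msg=print,
)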
Example 5
def cli(env, grid, year, output, products, complevel):
    """Extract product(s) to an on disk cache.

    Optionally tile datasets into a grid while extracting (see --grid option)
    """

    if len(products) == 0:
        click.echo("Have to supply at least one product")
        raise click.Abort()

    dc = datacube.Datacube(env=env)
    all_prods = {p.name: p for p in dc.index.products.get_all()}

    if len(products) == 1 and products[0].lower() in (":all:", "*"):
        click.echo("Will read all products")
        products = list(all_prods)

    for p in products:
        if p not in all_prods:
            click.echo("No such product found: %s" % p)
            raise click.Abort()

    query = {}
    if year is not None:
        query.update(time=f"{year}")

    click.echo("Getting dataset counts")
    counts = {p: dataset_count(dc.index, product=p, **query) for p in products}

    n_total = 0
    for p, c in counts.items():
        click.echo("..{}: {:8,d}".format(p, c))
        n_total += c

    if n_total == 0:
        click.echo("No datasets found")
        raise click.Abort()

    click.echo("Training compression dictionary")
    zdict = dictionary_from_product_list(dc,
                                         products,
                                         samples_per_product=50,
                                         query=query)
    click.echo("..done")

    # TODO: check for overwrite
    cache = dscache.create_cache(output,
                                 zdict=zdict,
                                 complevel=complevel,
                                 truncate=True)

    raw2ds = mk_raw2ds(all_prods)

    def db_task(products, conn, q):
        for p in products:
            if len(query) == 0:
                dss = map(raw2ds, raw_dataset_stream(p, conn))
            else:
                dss = ordered_dss(dc, product=p, **query)

            for ds in dss:
                q.put(ds)
        q.put(EOS)

    conn = db_connect(cfg=env)
    q = queue.Queue(maxsize=10_000)
    db_thread = Thread(target=db_task, args=(products, conn, q))
    db_thread.start()

    dss = qmap(lambda ds: ds, q, eos_marker=EOS)
    dss = cache.tee(dss)

    cells = {}
    if grid is not None:
        gs = parse_gridspec(grid)
        # TODO for named gridspecs should we use the name as group_prefix?
        group_prefix = f"epsg{gs.crs.epsg:d}"
        cache.add_grid(gs, group_prefix)
        dss = bin_dataset_stream(gs, dss, cells)

    label = "Processing ({:8,d})".format(n_total)
    with click.progressbar(dss, label=label, length=n_total) as dss:
        for _ in dss:
            pass

    if grid is not None:
        click.echo("Total bins: {:d}".format(len(cells)))

        with click.progressbar(cells.values(),
                               length=len(cells),
                               label="Saving") as groups:
            for group in groups:
                cache.add_grid_tile(group_prefix, group.idx, group.dss)

    db_thread.join()
    cache.close()
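
The CLI above streams datasets from a dedicated DB reader thread through a bounded queue, using a sentinel (EOS) to mark the end of the stream; qmap then turns the queue back into an iterator on the consumer side. A stripped-down sketch of that pattern using only the standard library (an illustration of the pattern, not the odc qmap implementation):

# Producer thread + bounded queue + sentinel, stdlib only.
import queue
from threading import Thread

EOS = object()                      # end-of-stream sentinel

def producer(items, q):
    for item in items:
        q.put(item)                 # blocks when the queue is full (back-pressure)
    q.put(EOS)

def consume(q):
    while True:
        item = q.get()
        if item is EOS:
            return
        yield item

q = queue.Queue(maxsize=10_000)
t = Thread(target=producer, args=(range(5), q))
t.start()

for x in consume(q):
    print(x)

t.join()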