Example #1
    def group(self, datasets: VirtualDatasetBag,
              **group_settings: Dict[str, Any]) -> VirtualDatasetBox:
        self._assert(
            'juxtapose' in datasets.bag
            and len(datasets.bag['juxtapose']) == len(self._children),
            "invalid dataset bag")

        groups = [
            product.group(
                VirtualDatasetBag(dataset_bag, datasets.geopolygon,
                                  datasets.product_definitions),
                **group_settings) for product, dataset_bag in zip(
                    self._children, datasets.bag['juxtapose'])
        ]

        aligned_boxes = xarray.align(*[grouped.box for grouped in groups])

        def tuplify(indexes, _):
            return {
                'juxtapose':
                [box.sel(**indexes).item() for box in aligned_boxes]
            }

        return VirtualDatasetBox(
            xr_apply(aligned_boxes[0], tuplify),
            select_unique([grouped.geobox for grouped in groups]),
            select_unique([grouped.load_natively for grouped in groups]),
            merge_dicts([grouped.product_definitions for grouped in groups]),
            geopolygon=select_unique(
                [grouped.geopolygon for grouped in groups]))
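The juxtapose step above pairs up the cells of several time-aligned DataArrays. Below is a minimal, self-contained sketch of that pattern using plain strings in place of grouped datasets; it assumes `xr_apply` is importable from `datacube.utils` and behaves as `test_xr_apply` (Example #9) shows, i.e. the callback receives the coordinate labels and the cell value. The names `left` and `right` are illustrative only.

import numpy as np
import xarray as xr
from datacube.utils import xr_apply

times = np.array(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]')
left = xr.DataArray(np.array(['a1', 'a2'], dtype=object), dims=['time'], coords={'time': times})
right = xr.DataArray(np.array(['b1', 'b2'], dtype=object), dims=['time'], coords={'time': times})

aligned = xr.align(left, right)

def tuplify(labels, _):
    # pick the matching cell out of every aligned array for this time step
    return {'juxtapose': [arr.sel(**labels).item() for arr in aligned]}

combined = xr_apply(aligned[0], tuplify)
print(combined.values.tolist())
# [{'juxtapose': ['a1', 'b1']}, {'juxtapose': ['a2', 'b2']}]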
Example #2
def ingest_work(config, source_type, output_type, index, sources, geobox):
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.product_data(sources, geobox, measurements, fuse_func=fuse_func)
    nudata = data.rename(namemap)
    file_path = get_filename(config, index, sources)

    def _make_dataset(labels, sources):
        sources_union = union_points(*[source.extent.to_crs(geobox.crs).points for source in sources])
        valid_data = intersect_points(geobox.extent.points, sources_union)
        dataset = make_dataset(dataset_type=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config, config['filename']),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset
    datasets = xr_apply(sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, global_attributes, variable_params, file_path)

    return datasets
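Each of these ingest variants builds one metadata object per time step with `xr_apply(sources, _make_dataset, dtype='O')`, where the callback reads the timestamp from the labels dict. A stripped-down sketch of that pattern, with a made-up `FakeRecord` class standing in for `make_dataset`'s output and plain strings standing in for the source tuples:

import numpy as np
import xarray as xr
from datacube.utils import xr_apply

class FakeRecord:
    def __init__(self, center_time, sources):
        self.center_time, self.sources = center_time, sources

times = np.array(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]')
data = np.empty(2, dtype=object)
data[:] = ['src-1', 'src-2']          # stand-ins for tuples of source datasets
sources = xr.DataArray(data, dims=['time'], coords={'time': times})

def _make_dataset(labels, srcs):
    # labels holds the coordinate values for this cell, e.g. labels['time']
    return FakeRecord(center_time=labels['time'], sources=srcs)

datasets = xr_apply(sources, _make_dataset, dtype='O')   # time -> FakeRecord
print(datasets[0].item().center_time)                    # 2020-01-01T00:00:00.000000000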
Example #3
    def _dask_load(sources,
                   geobox,
                   measurements,
                   dask_chunks,
                   skip_broken_datasets=False):
        needed_irr_chunks, grid_chunks = _calculate_chunk_sizes(
            sources, geobox, dask_chunks)
        gbt = GeoboxTiles(geobox, grid_chunks)
        dsk = {}

        def chunk_datasets(dss, gbt):
            out = {}
            for ds in dss:
                dsk[_tokenize_dataset(ds)] = ds
                for idx in gbt.tiles(ds.extent):
                    out.setdefault(idx, []).append(ds)
            return out

        chunked_srcs = xr_apply(sources,
                                lambda _, dss: chunk_datasets(dss, gbt),
                                dtype=object)

        def data_func(measurement):
            return _make_dask_array(chunked_srcs,
                                    dsk,
                                    gbt,
                                    measurement,
                                    chunks=needed_irr_chunks + grid_chunks,
                                    skip_broken_datasets=skip_broken_datasets)

        return Datacube.create_storage(
            OrderedDict((dim, sources.coords[dim]) for dim in sources.dims),
            geobox, measurements, data_func)
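`_dask_load` above uses `xr_apply` with `dtype=object` to turn each time step's list of datasets into a dict keyed by tile index, while recording every dataset in a shared lookup that later feeds the dask graph. A hedged, self-contained sketch of that bucketing step; `FakeDataset` and `tiles_for` are invented stand-ins for real datasets and for `GeoboxTiles.tiles`:

import numpy as np
import xarray as xr
from datacube.utils import xr_apply

class FakeDataset:
    def __init__(self, ident, tile):
        self.id, self.tile = ident, tile

def tiles_for(ds):
    # a real implementation would intersect ds.extent with a GeoboxTiles grid
    return [ds.tile]

lookup = {}   # plays the role of `dsk` above

def chunk_datasets(_, dss):
    out = {}
    for ds in dss:
        lookup[ds.id] = ds
        for idx in tiles_for(ds):
            out.setdefault(idx, []).append(ds)
    return out

times = np.array(['2020-01-01'], dtype='datetime64[ns]')
data = np.empty(1, dtype=object)
data[0] = [FakeDataset('ds-1', (0, 0)), FakeDataset('ds-2', (0, 1))]
sources = xr.DataArray(data, dims=['time'], coords={'time': times})

chunked_srcs = xr_apply(sources, chunk_datasets, dtype=object)
print(sorted(chunked_srcs[0].item().keys()))   # [(0, 0), (0, 1)]
print(sorted(lookup.keys()))                   # ['ds-1', 'ds-2']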
Example #4
def ingest_work(config, source_type, output_type, tile, tile_index):
    # pylint: disable=too-many-locals
    _LOG.info('Starting task %s', tile_index)
    driver = storage_writer_by_name(config['storage']['driver'])

    if driver is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Something went wrong: can no longer find the driver pointed to by the storage.driver option')

    namemap = get_namemap(config)
    # TODO: get_measurements possibly changes dtype, not sure load_data would like that
    measurements = get_measurements(source_type, config)
    resampling = get_resampling(config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]

    datasets = tile.sources.sum().item()
    for dataset in datasets:
        if not dataset.uris:
            _LOG.error('Locationless dataset found in the database: %r', dataset)

    data = Datacube.load_data(tile.sources, tile.geobox, measurements,
                              resampling=resampling,
                              fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)
    file_uri = driver.mk_uri(file_path, config['storage'])

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_uri,
                            app_info=get_app_metadata(config['filename']),
                            valid_data=polygon_from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    driver_data = driver.write_dataset_to_storage(nudata, file_uri,
                                                  global_attributes=global_attributes,
                                                  variable_params=variable_params,
                                                  storage_config=config['storage'])

    if (driver_data is not None) and len(driver_data) > 0:
        datasets.attrs['driver_data'] = driver_data

    _LOG.info('Finished task %s', tile_index)

    return datasets
Example #5
        def _juxtapose_group_():
            self._assert(
                'juxtapose' in datasets.pile
                and len(datasets.pile['juxtapose']) == len(self._children),
                "invalid dataset pile")

            groups = [
                product.group(
                    VirtualDatasetBag(dataset_pile, datasets.grid_spec,
                                      datasets.geopolygon,
                                      datasets.product_definitions),
                    **search_terms) for product, dataset_pile in zip(
                        self._children, datasets.pile['juxtapose'])
            ]

            aligned_piles = xarray.align(*[grouped.pile for grouped in groups])

            def tuplify(indexes, _):
                return {
                    'juxtapose':
                    [pile.sel(**indexes).item() for pile in aligned_piles]
                }

            return VirtualDatasetBox(
                xr_apply(aligned_piles[0], tuplify),
                select_unique([grouped.geobox for grouped in groups]),
                merge_dicts(
                    [grouped.product_definitions for grouped in groups]))
Example #6
def do_fixer_task(config, task):
    global_attributes = config['global_attributes']

    # Don't keep the original history if we are trying to fix it
    global_attributes['history'] = build_history_string(config,
                                                        task,
                                                        keep_original=False)

    variable_params = config['variable_params']

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (e.g. 5), but not spatial chunks
    # This means the file only gets opened once per band, and all data is available when compressing on write
    # 5 * 4000 * 4000 * 2 bytes == 152 MB, so memory usage is not an issue
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources,
                                  _unwrap_dataset_list,
                                  dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        if data.geobox is None:
            raise DatacubeException(
                'Dataset geobox property is None, cannot write to NetCDF file.'
            )

        if data.geobox.crs is None:
            raise DatacubeException(
                'Dataset geobox.crs property is None, cannot write to NetCDF file.'
            )

        nco = create_netcdf_storage_unit(temp_filename, data.geobox.crs,
                                         data.coords, data.data_vars,
                                         variable_params, global_attributes)
        write_data_variables(data.data_vars, nco)
        nco.close()

        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri,
                                         tile.geobox)
            new_data = datacube.api.GridWorkflow.load(
                new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)

    except Exception as e:
        if temp_filename.exists():
            temp_filename.unlink()
        raise e

    return unwrapped_datasets, output_uri
Example #7
def _do_fc_task(config, task):
    """
    Load data, run FC algorithm, attach metadata, and write output.
    :param dict config: Config object
    :param dict task: Dictionary of tasks
    :return: Dataset objects representing the generated data that can be added to the index
    :rtype: list(datacube.model.Dataset)
    """
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    output_product = config['fc_product']

    file_path = Path(task['filename_dataset'])

    uri, band_uris = calc_uris(file_path, variable_params)
    output_measurements = config['fc_product'].measurements.values()

    nbart = io.native_load(task['dataset'], measurements=config['load_bands'])
    if config['band_mapping'] is not None:
        nbart = nbart.rename(config['band_mapping'])

    fc_dataset = run_fc(nbart, output_measurements,
                        config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbart.geobox.extent,
                               center_time=labels['time'],
                               uri=uri,
                               band_uris=band_uris,
                               app_info=_get_app_metadata(config),
                               valid_data=polygon_from_sources_extents(
                                   sources, nbart.geobox))
        return dataset

    source = Datacube.group_datasets([task['dataset']], 'time')

    datasets = xr_apply(source, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    base, ext = os.path.splitext(file_path)
    if ext == '.tif':
        dataset_to_geotif_yaml(
            dataset=fc_dataset,
            odc_dataset=datasets.item(),
            filename=file_path,
            variable_params=variable_params,
        )
    else:
        write_dataset_to_netcdf(
            dataset=fc_dataset,
            filename=file_path,
            global_attributes=global_attributes,
            variable_params=variable_params,
        )

    return datasets
Example #8
def make_updated_tile(old_datasets, new_uri, geobox):
    def update_dataset_location(labels, dataset: Dataset) -> List[Dataset]:
        new_dataset = copy.copy(dataset)
        new_dataset.uris = [new_uri]
        return [new_dataset]

    updated_datasets = xr_apply(old_datasets, update_dataset_location, dtype='O')
    return datacube.api.Tile(sources=updated_datasets, geobox=geobox)
Example #9
def test_xr_apply():
    src = xr.DataArray(np.asarray([1, 2, 3], dtype='uint8'), dims=['time'])
    dst = xr_apply(src, lambda _, v: v, dtype='float32')

    assert dst.dtype.name == 'float32'
    assert dst.shape == src.shape
    assert dst.values.tolist() == [1, 2, 3]

    dst = xr_apply(src, lambda _, v: v)
    assert dst.dtype.name == 'uint8'
    assert dst.shape == src.shape
    assert dst.values.tolist() == [1, 2, 3]

    dst = xr_apply(src, lambda idx, _, v: idx[0] + v, with_numeric_index=True)
    assert dst.dtype.name == 'uint8'
    assert dst.shape == src.shape
    assert dst.values.tolist() == [0 + 1, 1 + 2, 2 + 3]
Example #10
def make_updated_tile(old_datasets, new_uri, geobox):
    def update_dataset_location(idx, labels, dataset: Dataset) -> List[Dataset]:
        idx, = idx
        new_dataset = copy.copy(dataset)
        new_dataset.uris = [mk_part_uri(new_uri, idx)]
        return [new_dataset]

    updated_datasets = xr_apply(old_datasets, update_dataset_location, with_numeric_index=True)
    return datacube.api.Tile(sources=updated_datasets, geobox=geobox)
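This variant relies on `with_numeric_index=True`, which (as `test_xr_apply` in Example #9 shows) makes the callback receive the positional index tuple first, then the labels, then the cell value. A small sketch of relabelling per-slice URIs that way; `mk_part_uri` here is a hypothetical stand-in for the helper used above:

import numpy as np
import xarray as xr
from datacube.utils import xr_apply

def mk_part_uri(uri, part):
    return '{}#part={}'.format(uri, part)

times = np.array(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]')
data = np.empty(2, dtype=object)
data[:] = ['old-uri-a', 'old-uri-b']
old = xr.DataArray(data, dims=['time'], coords={'time': times})

def relabel(idx, labels, dataset):
    idx, = idx                    # idx is a tuple of positional indices
    return mk_part_uri('file:///new.nc', idx)

updated = xr_apply(old, relabel, with_numeric_index=True)
print(updated.values.tolist())
# ['file:///new.nc#part=0', 'file:///new.nc#part=1']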
Example #11
def ingest_work(config, source_type, output_type, tile, tile_index):
    # pylint: disable=too-many-locals
    _LOG.info('Starting task %s', tile_index)
    driver = storage_writer_by_name(config['storage']['driver'])

    if driver is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Something went wrong: can no longer find the driver pointed to by the storage.driver option')

    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements, fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def mk_uri(file_path):
        if driver.uri_scheme == "file":
            return file_path.absolute().as_uri()
        return '{}://{}'.format(driver.uri_scheme, file_path)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=mk_uri(file_path),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    storage_metadata = driver.write_dataset_to_storage(nudata, file_path,
                                                       global_attributes=global_attributes,
                                                       variable_params=variable_params,
                                                       storage_config=config['storage'])

    if (storage_metadata is not None) and len(storage_metadata) > 0:
        datasets.attrs['storage_metadata'] = storage_metadata

    _LOG.info('Finished task %s', tile_index)

    return datasets
Example #12
def do_stack_task(task):
    datasets_to_add = None
    datasets_to_update = None
    datasets_to_archive = None

    global_attributes = task['global_attributes']
    variable_params = task['variable_params']

    output_filename = Path(task['output_filename'])
    tile = task['tile']

    if task.get('make_new_datasets', False):
        datasets_to_add = make_datasets(tile, output_filename, task)
        datasets_to_archive = xr_apply(tile.sources, _single_dataset, dtype='O')

        output_datasets = datasets_to_add
    else:
        datasets_to_update = xr_apply(tile.sources, _single_dataset, dtype='O')

        output_datasets = datasets_to_update

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=dict(time=1))  # TODO: chunk along output NetCDF chunk?
    data['dataset'] = datasets_to_doc(output_datasets)

    nco = create_netcdf_storage_unit(output_filename,
                                     data.crs,
                                     data.coords,
                                     data.data_vars,
                                     variable_params,
                                     global_attributes)

    for name, variable in data.data_vars.items():
        try:
            with dask.config.set(scheduler='synchronous'):  # dask.set_options/dask.async no longer exist in modern dask
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
        nco.sync()

    nco.close()
    return datasets_to_add, datasets_to_update, datasets_to_archive
Example #13
def make_datasets(tile, file_path, config):
    def _make_dataset(labels, sources):
        new_dataset = make_dataset(product=tile.product,
                                   sources=sources,
                                   extent=tile.geobox.extent,
                                   center_time=labels['time'],
                                   uri=file_path.absolute().as_uri(),
                                   app_info=get_app_metadata(config),
                                   valid_data=sources[0].extent)
        return new_dataset

    return xr_apply(tile.sources, _make_dataset, dtype='O')
Example #14
def do_stack_task(config, task):
    global_attributes = config['global_attributes']
    global_attributes['history'] = get_history_attribute(config, task)

    variable_params = config['variable_params']

    variable_params['dataset'] = {
        'chunksizes': (1, ),
        'zlib': True,
        'complevel': 9,
    }

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (e.g. 5), but not spatial chunks
    # This means the file only gets opened once per band, and all data is available when compressing on write
    # 5 * 4000 * 4000 * 2 bytes == 152 MB, so memory usage is not an issue
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources,
                                  _unwrap_dataset_list,
                                  dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        nco = create_netcdf_storage_unit(temp_filename, data.crs, data.coords,
                                         data.data_vars, variable_params,
                                         global_attributes)
        write_data_variables(data.data_vars, nco)
        nco.close()

        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri,
                                         tile.geobox)
            new_data = datacube.api.GridWorkflow.load(
                new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)

    except Exception as e:
        if temp_filename.exists():
            temp_filename.unlink()
        raise e

    return unwrapped_datasets, output_uri
Example #15
def do_ndvi_task(config, task):
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    file_path = Path(task['filename'])
    output_type = config['ndvi_dataset_type']
    measurement = output_type.measurements['ndvi']
    output_dtype = np.dtype(measurement['dtype'])
    nodata_value = np.dtype(output_dtype).type(measurement['nodata'])

    if file_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists',
                      str(file_path))

    measurements = ['red', 'nir']

    nbar_tile = task['nbar']
    nbar = GridWorkflow.load(nbar_tile, measurements)

    ndvi = calculate_ndvi(nbar,
                          nodata=nodata_value,
                          dtype=output_dtype,
                          units=measurement['units'])

    def _make_dataset(labels, sources):
        assert len(sources)
        geobox = nbar.geobox
        source_data = union_points(
            *[dataset.extent.to_crs(geobox.crs).points for dataset in sources])
        valid_data = intersect_points(geobox.extent.points, source_data)
        dataset = make_dataset(product=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset

    datasets = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    ndvi['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(
        dataset=ndvi,
        filename=Path(file_path),
        global_attributes=global_attributes,
        variable_params=variable_params,
    )
    return datasets
Example #16
def ingest_work(driver_manager, config, source_type, output_type, tile,
                tile_index):
    _LOG.info('Starting task %s', tile_index)

    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources,
                                  tile.geobox,
                                  measurements,
                                  fuse_func=fuse_func,
                                  driver_manager=driver_manager)
    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config,
                                                      config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(
                                sources, tile.geobox))

    datasets = xr_apply(
        tile.sources, _make_dataset,
        dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    # Until ingest becomes a class and DriverManager an instance
    # variable, we call the constructor each time. DriverManager being
    # a singleton, there is little overhead, though.
    datasets.attrs['storage_output'] = driver_manager.write_dataset_to_storage(
        nudata, file_path, global_attributes, variable_params)
    _LOG.info('Finished task %s', tile_index)

    # When using multiproc executor, Driver Manager is a clone.
    if driver_manager.is_clone:
        driver_manager.close()

    return datasets
Example #17
def _do_fc_task(config, task):
    """
    Load data, run FC algorithm, attach metadata, and write output.
    :param dict config: Config object
    :param dict task: Dictionary of tasks
    :return: Dataset objects representing the generated data that can be added to the index
    :rtype: list(datacube.model.Dataset)
    """
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    file_path = Path(task['filename'])
    output_product = config['fc_product']

    if file_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists',
                      str(file_path))

    nbart_tile: Tile = task['nbart']
    nbart = GridWorkflow.load(nbart_tile,
                              ['green', 'red', 'nir', 'swir1', 'swir2'])

    output_measurements = config['fc_product'].measurements.values()
    fc_dataset = _make_fc_tile(nbart, output_measurements,
                               config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbart.geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=_get_app_metadata(config),
                               valid_data=polygon_from_sources_extents(
                                   sources, nbart.geobox))
        return dataset

    datasets = xr_apply(nbart_tile.sources, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(
        dataset=fc_dataset,
        filename=file_path,
        global_attributes=global_attributes,
        variable_params=variable_params,
    )
    return datasets
Example #18
def do_ndvi_task(config, task):
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    file_path = Path(task['filename'])
    output_type = config['ndvi_dataset_type']
    measurement = output_type.measurements['ndvi']
    output_dtype = np.dtype(measurement['dtype'])
    nodata_value = np.dtype(output_dtype).type(measurement['nodata'])

    if file_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(file_path))

    measurements = ['red', 'nir']

    nbar_tile = task['nbar']
    nbar = GridWorkflow.load(nbar_tile, measurements)

    ndvi = calculate_ndvi(nbar, nodata=nodata_value, dtype=output_dtype, units=measurement['units'])

    def _make_dataset(labels, sources):
        assert len(sources)
        geobox = nbar.geobox
        source_data = union_points(*[dataset.extent.to_crs(geobox.crs).points for dataset in sources])
        valid_data = intersect_points(geobox.extent.points, source_data)
        dataset = make_dataset(product=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset

    datasets = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    ndvi['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(
        dataset=ndvi,
        filename=Path(file_path),
        global_attributes=global_attributes,
        variable_params=variable_params,
    )
    return datasets
Example #19
    def _dask_load(sources,
                   geobox,
                   measurements,
                   dask_chunks,
                   skip_broken_datasets=False,
                   extra_dims=None):
        chunk_sizes = _calculate_chunk_sizes(sources, geobox, dask_chunks,
                                             extra_dims)
        needed_irr_chunks = chunk_sizes[0]
        if extra_dims:
            extra_dim_chunks = chunk_sizes[1]
        grid_chunks = chunk_sizes[-1]
        gbt = GeoboxTiles(geobox, grid_chunks)
        dsk = {}

        def chunk_datasets(dss, gbt):
            out = {}
            for ds in dss:
                dsk[_tokenize_dataset(ds)] = ds
                for idx in gbt.tiles(ds.extent):
                    out.setdefault(idx, []).append(ds)
            return out

        chunked_srcs = xr_apply(sources,
                                lambda _, dss: chunk_datasets(dss, gbt),
                                dtype=object)

        def data_func(measurement, shape):
            if 'extra_dim' in measurement:
                chunks = needed_irr_chunks + extra_dim_chunks + grid_chunks
            else:
                chunks = needed_irr_chunks + grid_chunks
            return _make_dask_array(chunked_srcs,
                                    dsk,
                                    gbt,
                                    measurement,
                                    chunks=chunks,
                                    skip_broken_datasets=skip_broken_datasets,
                                    extra_dims=extra_dims)

        return Datacube.create_storage(sources.coords, geobox, measurements,
                                       data_func, extra_dims)
Example #20
def ingest_work(config, source_type, output_type, index, sources, geobox):
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.product_data(sources,
                                     geobox,
                                     measurements,
                                     fuse_func=fuse_func)
    nudata = data.rename(namemap)
    file_path = get_filename(config, index, sources)

    def _make_dataset(labels, sources):
        sources_union = union_points(
            *[source.extent.to_crs(geobox.crs).points for source in sources])
        valid_data = intersect_points(geobox.extent.points, sources_union)
        dataset = make_dataset(dataset_type=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(
                                   config, config['filename']),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset

    datasets = xr_apply(
        sources, _make_dataset,
        dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, global_attributes, variable_params,
                            file_path)

    return datasets
Example #21
def ingest_work(config, source_type, output_type, tile, tile_index):
    _LOG.info('Starting task %s', tile_index)
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources,
                                  tile.geobox,
                                  measurements,
                                  fuse_func=fuse_func)
    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config,
                                                      config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(
                                sources, tile.geobox))

    datasets = xr_apply(
        tile.sources, _make_dataset,
        dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, file_path, global_attributes,
                            variable_params)
    _LOG.info('Finished task %s', tile_index)

    return datasets
Example #22
def do_fc_task(config, task):
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    file_path = Path(task['filename'])
    output_product = config['fc_product']

    if file_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(file_path))

    nbar_tile: Tile = task['nbar']
    nbar = GridWorkflow.load(nbar_tile, ['green', 'red', 'nir', 'swir1', 'swir2'])

    output_measurements = config['fc_product'].measurements.values()
    fc_dataset = make_fc_tile(nbar, output_measurements, config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbar.geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config),
                               valid_data=GeoPolygon.from_sources_extents(sources, nbar.geobox))
        return dataset

    datasets = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(
        dataset=fc_dataset,
        filename=file_path,
        global_attributes=global_attributes,
        variable_params=variable_params,
    )
    return datasets
Example #23
    def map(self, func, dtype='O'):
        return DatasetPile(xr_apply(self.pile, func, dtype=dtype), self.geobox)
Example #24
    def map(self, func, dtype='O'):
        return VirtualDatasetBox(xr_apply(self.box, func, dtype=dtype),
                                 self.geobox,
                                 self.load_natively,
                                 self.product_definitions,
                                 geopolygon=self.geopolygon)
Example #25
    def map(self, func, dtype='O'):
        return VirtualDatasetBox(xr_apply(self.pile, func, dtype=dtype),
                                 self.geobox, self.product_definitions)
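Examples #23 to #25 are three generations of the same idea: a small container that owns a DataArray plus some geospatial context and delegates element-wise transformation to `xr_apply`. A minimal, hypothetical container in the same spirit (the class name and attributes are illustrative, not the library's own):

import numpy as np
import xarray as xr
from datacube.utils import xr_apply

class Box:
    def __init__(self, box, geobox=None):
        self.box = box
        self.geobox = geobox

    def map(self, func, dtype='O'):
        # apply func(labels, value) to every cell, keeping the other attributes
        return Box(xr_apply(self.box, func, dtype=dtype), self.geobox)

times = np.array(['2020-01-01'], dtype='datetime64[ns]')
box = Box(xr.DataArray(np.array([3], dtype=object), dims=['time'], coords={'time': times}))
doubled = box.map(lambda _, value: value * 2)
print(doubled.box.values.tolist())   # [6]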
Example #26
    def _find_source_datasets(self,
                              stat: OutputProduct,
                              uri: str = None,
                              band_uris: dict = None) -> xarray.DataArray:
        """
        Find all the source datasets for a task

        Put them in order so that they can be assigned to a stacked output aligned against its time dimension

        :return: (datasets, sources)

        datasets is a DataArray of dataset documents to write, indexed on time
        sources is more structured: an xarray DataArray of lists of dataset sources, indexed on time
        """
        task = self._task
        geobox = self._task.geobox
        app_info = self._app_info

        def add_all(iterable):
            return reduce_(operator.add, iterable)

        def merge_sources(prod):
            # Merge data sources and mask sources
            # Align the data `Tile` with potentially many mask `Tile`s along their time axis
            all_sources = xarray.align(
                prod.data.sources,
                *[mask_tile.sources for mask_tile in prod.masks if mask_tile])

            # TODO: The following can fail if prod.data and prod.masks have different times
            # Which can happen in the case of a missing PQ scene, where there is a scene overlap
            # i.e. two overlapping NBAR scenes and one PQ scene (the later one)
            return add_all(sources_.sum() for sources_ in all_sources)

        sources = add_all(merge_sources(prod) for prod in task.sources)

        def unique(index, dataset_tuple):
            return tuple(set(dataset_tuple))

        sources = xr_apply(sources, unique, dtype='O')

        # Sources has no time at this point, so insert back in the start of our stats epoch
        start_time, _ = task.time_period
        sources = unsqueeze_data_array(sources,
                                       dim='time',
                                       pos=0,
                                       coord=start_time,
                                       attrs=task.time_attributes)

        if not sources:
            raise StatsOutputError(
                'No valid sources found, or supplied sources do not align to the same time.\n'
                'Unable to write dataset metadata.')

        def _make_dataset(labels, sources_):
            return make_dataset(product=stat.product,
                                sources=sources_,
                                extent=geobox.extent,
                                center_time=labels['time'],
                                uri=uri,
                                band_uris=band_uris,
                                app_info=app_info,
                                valid_data=polygon_from_sources_extents(
                                    sources_, geobox))

        datasets = xr_apply(
            sources, _make_dataset,
            dtype='O')  # Store in DataArray to associate Time -> Dataset
        datasets = datasets_to_doc(datasets)
        return datasets
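The `unique` step above collapses duplicate source datasets within each cell before the time coordinate is re-attached. The same move on plain strings, as a self-contained sketch:

import numpy as np
import xarray as xr
from datacube.utils import xr_apply

data = np.empty(1, dtype=object)
data[0] = ['ds-a', 'ds-a', 'ds-b']    # stand-ins for duplicate source datasets
sources = xr.DataArray(data, dims=['time'])

def unique(labels, dataset_tuple):
    return tuple(set(dataset_tuple))

deduped = xr_apply(sources, unique, dtype='O')
print(sorted(deduped[0].item()))       # ['ds-a', 'ds-b']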
Example #27
    def group(self, datasets: VirtualDatasetBag,
              **search_terms: Dict[str, Any]) -> VirtualDatasetBox:
        """
        Datasets grouped by their timestamps.
        :param datasets: the `VirtualDatasetBag` to fetch data from
        :param search_terms: query terms, e.g. to specify a spatial sub-region
        """
        grid_spec = datasets.grid_spec
        geopolygon = datasets.geopolygon

        if 'product' in self:
            # select only those inside the ROI
            # ROI could be smaller than the query for the `query` method
            if query_geopolygon(**search_terms) is not None:
                geopolygon = query_geopolygon(**search_terms)
                selected = list(
                    select_datasets_inside_polygon(datasets.pile, geopolygon))
            else:
                selected = list(datasets.pile)

            # geobox
            merged = merge_search_terms(
                select_keys(self, self._NON_SPATIAL_KEYS),
                select_keys(search_terms, self._NON_SPATIAL_KEYS))

            geobox = output_geobox(datasets=selected,
                                   grid_spec=grid_spec,
                                   geopolygon=geopolygon,
                                   **select_keys(merged, self._GEOBOX_KEYS))

            # group by time
            group_query = query_group_by(
                **select_keys(merged, self._GROUPING_KEYS))

            # information needed for Datacube.load_data
            return VirtualDatasetBox(
                Datacube.group_datasets(selected, group_query), geobox,
                datasets.product_definitions)

        elif 'transform' in self:
            return self._input.group(datasets, **search_terms)

        elif 'collate' in self:
            self._assert(
                'collate' in datasets.pile
                and len(datasets.pile['collate']) == len(self._children),
                "invalid dataset pile")

            def build(source_index, product, dataset_pile):
                grouped = product.group(
                    VirtualDatasetBag(dataset_pile, datasets.grid_spec,
                                      datasets.geopolygon,
                                      datasets.product_definitions),
                    **search_terms)

                def tag(_, value):
                    return {'collate': (source_index, value)}

                return grouped.map(tag)

            groups = [
                build(source_index, product, dataset_pile)
                for source_index, (product, dataset_pile) in enumerate(
                    zip(self._children, datasets.pile['collate']))
            ]

            return VirtualDatasetBox(
                xarray.concat([grouped.pile for grouped in groups],
                              dim='time'),
                select_unique([grouped.geobox for grouped in groups]),
                merge_dicts(
                    [grouped.product_definitions for grouped in groups]))

        elif 'juxtapose' in self:
            self._assert(
                'juxtapose' in datasets.pile
                and len(datasets.pile['juxtapose']) == len(self._children),
                "invalid dataset pile")

            groups = [
                product.group(
                    VirtualDatasetBag(dataset_pile, datasets.grid_spec,
                                      datasets.geopolygon,
                                      datasets.product_definitions),
                    **search_terms) for product, dataset_pile in zip(
                        self._children, datasets.pile['juxtapose'])
            ]

            aligned_piles = xarray.align(*[grouped.pile for grouped in groups])

            def tuplify(indexes, _):
                return {
                    'juxtapose':
                    [pile.sel(**indexes).item() for pile in aligned_piles]
                }

            return VirtualDatasetBox(
                xr_apply(aligned_piles[0], tuplify),
                select_unique([grouped.geobox for grouped in groups]),
                merge_dicts(
                    [grouped.product_definitions for grouped in groups]))

        else:
            raise VirtualProductException("virtual product was not validated")
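For comparison with the `juxtapose` branch, the `collate` branch above tags every cell with the index of the child product it came from and then concatenates the groups along time. A hedged sketch of that tagging, using `xr_apply` directly on toy DataArrays (single-letter strings stand in for the grouped dataset piles):

import numpy as np
import xarray as xr
from datacube.utils import xr_apply

def tagged(pile, source_index):
    # wrap every cell as {'collate': (source_index, original_value)}
    return xr_apply(pile, lambda _, value: {'collate': (source_index, value)}, dtype='O')

piles = [
    xr.DataArray(np.array(['a'], dtype=object), dims=['time'],
                 coords={'time': np.array(['2020-01-01'], dtype='datetime64[ns]')}),
    xr.DataArray(np.array(['b'], dtype=object), dims=['time'],
                 coords={'time': np.array(['2020-01-02'], dtype='datetime64[ns]')}),
]

groups = [tagged(pile, i) for i, pile in enumerate(piles)]
combined = xr.concat(groups, dim='time')
print(combined.values.tolist())
# [{'collate': (0, 'a')}, {'collate': (1, 'b')}]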