def group(self, datasets: VirtualDatasetBag, **group_settings: Dict[str, Any]) -> VirtualDatasetBox:
    self._assert('juxtapose' in datasets.bag and len(datasets.bag['juxtapose']) == len(self._children),
                 "invalid dataset bag")

    groups = [product.group(VirtualDatasetBag(dataset_bag, datasets.geopolygon, datasets.product_definitions),
                            **group_settings)
              for product, dataset_bag in zip(self._children, datasets.bag['juxtapose'])]

    aligned_boxes = xarray.align(*[grouped.box for grouped in groups])

    def tuplify(indexes, _):
        return {'juxtapose': [box.sel(**indexes).item() for box in aligned_boxes]}

    return VirtualDatasetBox(xr_apply(aligned_boxes[0], tuplify),
                             select_unique([grouped.geobox for grouped in groups]),
                             select_unique([grouped.load_natively for grouped in groups]),
                             merge_dicts([grouped.product_definitions for grouped in groups]),
                             geopolygon=select_unique([grouped.geopolygon for grouped in groups]))
def ingest_work(config, source_type, output_type, index, sources, geobox):
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.product_data(sources, geobox, measurements, fuse_func=fuse_func)
    nudata = data.rename(namemap)

    file_path = get_filename(config, index, sources)

    def _make_dataset(labels, sources):
        sources_union = union_points(*[source.extent.to_crs(geobox.crs).points for source in sources])
        valid_data = intersect_points(geobox.extent.points, sources_union)
        dataset = make_dataset(dataset_type=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config, config['filename']),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset

    datasets = xr_apply(sources, _make_dataset, dtype='O')  # Store in Dataarray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, global_attributes, variable_params, file_path)

    return datasets
def _dask_load(sources, geobox, measurements, dask_chunks, skip_broken_datasets=False):
    needed_irr_chunks, grid_chunks = _calculate_chunk_sizes(sources, geobox, dask_chunks)
    gbt = GeoboxTiles(geobox, grid_chunks)
    dsk = {}

    def chunk_datasets(dss, gbt):
        out = {}
        for ds in dss:
            dsk[_tokenize_dataset(ds)] = ds
            for idx in gbt.tiles(ds.extent):
                out.setdefault(idx, []).append(ds)
        return out

    chunked_srcs = xr_apply(sources,
                            lambda _, dss: chunk_datasets(dss, gbt),
                            dtype=object)

    def data_func(measurement):
        return _make_dask_array(chunked_srcs, dsk, gbt,
                                measurement,
                                chunks=needed_irr_chunks + grid_chunks,
                                skip_broken_datasets=skip_broken_datasets)

    return Datacube.create_storage(OrderedDict((dim, sources.coords[dim]) for dim in sources.dims),
                                   geobox,
                                   measurements,
                                   data_func)
def ingest_work(config, source_type, output_type, tile, tile_index):  # pylint: disable=too-many-locals
    _LOG.info('Starting task %s', tile_index)
    driver = storage_writer_by_name(config['storage']['driver'])

    if driver is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Could not find the storage driver specified by the storage.driver option')

    namemap = get_namemap(config)
    # TODO: get_measurements possibly changes dtype, not sure load_data would like that
    measurements = get_measurements(source_type, config)
    resampling = get_resampling(config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
    datasets = tile.sources.sum().item()
    for dataset in datasets:
        if not dataset.uris:
            _LOG.error('Locationless dataset found in the database: %r', dataset)

    data = Datacube.load_data(tile.sources, tile.geobox, measurements,
                              resampling=resampling,
                              fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)
    file_uri = driver.mk_uri(file_path, config['storage'])

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_uri,
                            app_info=get_app_metadata(config['filename']),
                            valid_data=polygon_from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in Dataarray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    driver_data = driver.write_dataset_to_storage(nudata, file_uri,
                                                  global_attributes=global_attributes,
                                                  variable_params=variable_params,
                                                  storage_config=config['storage'])

    if (driver_data is not None) and len(driver_data) > 0:
        datasets.attrs['driver_data'] = driver_data

    _LOG.info('Finished task %s', tile_index)

    return datasets
def _juxtapose_group_(self, datasets, **search_terms):
    self._assert('juxtapose' in datasets.pile and len(datasets.pile['juxtapose']) == len(self._children),
                 "invalid dataset pile")

    groups = [product.group(VirtualDatasetBag(dataset_pile, datasets.grid_spec,
                                              datasets.geopolygon, datasets.product_definitions),
                            **search_terms)
              for product, dataset_pile in zip(self._children, datasets.pile['juxtapose'])]

    aligned_piles = xarray.align(*[grouped.pile for grouped in groups])

    def tuplify(indexes, _):
        return {'juxtapose': [pile.sel(**indexes).item() for pile in aligned_piles]}

    return VirtualDatasetBox(xr_apply(aligned_piles[0], tuplify),
                             select_unique([grouped.geobox for grouped in groups]),
                             merge_dicts([grouped.product_definitions for grouped in groups]))
def do_fixer_task(config, task):
    global_attributes = config['global_attributes']

    # Don't keep the original history if we are trying to fix it
    global_attributes['history'] = build_history_string(config, task, keep_original=False)

    variable_params = config['variable_params']

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (eg 5), but not spatial chunks
    # This means the file only gets opened once per band, and all data is available when compressing on write
    # 5 * 4000 * 4000 * 2bytes == 152MB, so mem usage is not an issue
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        if data.geobox is None:
            raise DatacubeException('Dataset geobox property is None, cannot write to NetCDF file.')

        if data.geobox.crs is None:
            raise DatacubeException('Dataset geobox.crs property is None, cannot write to NetCDF file.')

        nco = create_netcdf_storage_unit(temp_filename, data.geobox.crs, data.coords, data.data_vars,
                                         variable_params, global_attributes)
        write_data_variables(data.data_vars, nco)
        nco.close()

        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri, tile.geobox)
            new_data = datacube.api.GridWorkflow.load(new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)

    except Exception as e:
        if temp_filename.exists():
            temp_filename.unlink()
        raise e

    return unwrapped_datasets, output_uri
def _do_fc_task(config, task):
    """
    Load data, run FC algorithm, attach metadata, and write output.

    :param dict config: Config object
    :param dict task: Dictionary of tasks
    :return: Dataset objects representing the generated data that can be added to the index
    :rtype: list(datacube.model.Dataset)
    """
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    output_product = config['fc_product']

    file_path = Path(task['filename_dataset'])

    uri, band_uris = calc_uris(file_path, variable_params)
    output_measurements = config['fc_product'].measurements.values()

    nbart = io.native_load(task['dataset'], measurements=config['load_bands'])
    if config['band_mapping'] is not None:
        nbart = nbart.rename(config['band_mapping'])

    fc_dataset = run_fc(nbart, output_measurements, config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbart.geobox.extent,
                               center_time=labels['time'],
                               uri=uri,
                               band_uris=band_uris,
                               app_info=_get_app_metadata(config),
                               valid_data=polygon_from_sources_extents(sources, nbart.geobox))
        return dataset

    source = Datacube.group_datasets([task['dataset']], 'time')

    datasets = xr_apply(source, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    base, ext = os.path.splitext(file_path)
    if ext == '.tif':
        dataset_to_geotif_yaml(
            dataset=fc_dataset,
            odc_dataset=datasets.item(),
            filename=file_path,
            variable_params=variable_params,
        )
    else:
        write_dataset_to_netcdf(
            dataset=fc_dataset,
            filename=file_path,
            global_attributes=global_attributes,
            variable_params=variable_params,
        )

    return datasets
def make_updated_tile(old_datasets, new_uri, geobox):
    def update_dataset_location(labels, dataset: Dataset) -> List[Dataset]:
        new_dataset = copy.copy(dataset)
        new_dataset.uris = [new_uri]
        return [new_dataset]

    updated_datasets = xr_apply(old_datasets, update_dataset_location, dtype='O')
    return datacube.api.Tile(sources=updated_datasets, geobox=geobox)
def test_xr_apply():
    src = xr.DataArray(np.asarray([1, 2, 3], dtype='uint8'), dims=['time'])
    dst = xr_apply(src, lambda _, v: v, dtype='float32')

    assert dst.dtype.name == 'float32'
    assert dst.shape == src.shape
    assert dst.values.tolist() == [1, 2, 3]

    dst = xr_apply(src, lambda _, v: v)
    assert dst.dtype.name == 'uint8'
    assert dst.shape == src.shape
    assert dst.values.tolist() == [1, 2, 3]

    dst = xr_apply(src, lambda idx, _, v: idx[0] + v, with_numeric_index=True)
    assert dst.dtype.name == 'uint8'
    assert dst.shape == src.shape
    assert dst.values.tolist() == [0 + 1, 1 + 2, 2 + 3]
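The test above pins down the behaviour relied on throughout these snippets: `xr_apply` walks a `DataArray` cell by cell, calls the supplied function with the coordinate labels (optionally preceded by the numeric index) and the cell value, then packs the results into a new `DataArray` with the same shape and coordinates. As an illustration only, here is a minimal sketch of such a helper assuming nothing beyond numpy and xarray; the name `xr_apply_sketch` is hypothetical and this is not the real datacube implementation.

import numpy as np
import xarray as xr


def xr_apply_sketch(data_array, func, dtype=None, with_numeric_index=False):
    """Hedged sketch of an xr_apply-style helper, based only on the behaviour
    exercised by test_xr_apply above; not the real datacube utility."""
    # Default to the input dtype, matching the second case in the test.
    out_dtype = data_array.dtype if dtype is None else dtype
    out = np.empty(data_array.shape, dtype=out_dtype)
    for idx in np.ndindex(*data_array.shape):
        # Coordinate labels for this cell (empty when a dimension has no coordinate).
        labels = {dim: data_array.coords[dim].values[i]
                  for dim, i in zip(data_array.dims, idx)
                  if dim in data_array.coords}
        value = data_array.values[idx]
        out[idx] = func(idx, labels, value) if with_numeric_index else func(labels, value)
    return xr.DataArray(out, dims=data_array.dims, coords=data_array.coords)


# Mirrors the first assertion block of test_xr_apply.
src = xr.DataArray(np.asarray([1, 2, 3], dtype='uint8'), dims=['time'])
dst = xr_apply_sketch(src, lambda _, v: v, dtype='float32')
assert dst.dtype.name == 'float32' and dst.values.tolist() == [1, 2, 3]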
def make_updated_tile(old_datasets, new_uri, geobox):
    def update_dataset_location(idx, labels, dataset: Dataset) -> List[Dataset]:
        idx, = idx
        new_dataset = copy.copy(dataset)
        new_dataset.uris = [mk_part_uri(new_uri, idx)]
        return [new_dataset]

    updated_datasets = xr_apply(old_datasets, update_dataset_location, with_numeric_index=True)
    return datacube.api.Tile(sources=updated_datasets, geobox=geobox)
def ingest_work(config, source_type, output_type, tile, tile_index):  # pylint: disable=too-many-locals
    _LOG.info('Starting task %s', tile_index)
    driver = storage_writer_by_name(config['storage']['driver'])

    if driver is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Could not find the storage driver specified by the storage.driver option')

    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements, fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def mk_uri(file_path):
        if driver.uri_scheme == "file":
            return file_path.absolute().as_uri()
        return '{}://{}'.format(driver.uri_scheme, file_path)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=mk_uri(file_path),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in Dataarray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    storage_metadata = driver.write_dataset_to_storage(nudata, file_path,
                                                       global_attributes=global_attributes,
                                                       variable_params=variable_params,
                                                       storage_config=config['storage'])

    if (storage_metadata is not None) and len(storage_metadata) > 0:
        datasets.attrs['storage_metadata'] = storage_metadata

    _LOG.info('Finished task %s', tile_index)

    return datasets
def do_stack_task(task):
    datasets_to_add = None
    datasets_to_update = None
    datasets_to_archive = None

    global_attributes = task['global_attributes']
    variable_params = task['variable_params']

    output_filename = Path(task['output_filename'])
    tile = task['tile']

    if task.get('make_new_datasets', False):
        datasets_to_add = make_datasets(tile, output_filename, task)
        datasets_to_archive = xr_apply(tile.sources, _single_dataset, dtype='O')
        output_datasets = datasets_to_add
    else:
        datasets_to_update = xr_apply(tile.sources, _single_dataset, dtype='O')
        output_datasets = datasets_to_update

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=dict(time=1))  # TODO: chunk along output NetCDF chunk?
    data['dataset'] = datasets_to_doc(output_datasets)

    nco = create_netcdf_storage_unit(output_filename,
                                     data.crs,
                                     data.coords,
                                     data.data_vars,
                                     variable_params,
                                     global_attributes)

    for name, variable in data.data_vars.items():
        try:
            with dask.set_options(get=dask.async.get_sync):
                da.store(variable.data, nco[name], lock=True)
        except ValueError:
            nco[name][:] = netcdf_writer.netcdfy_data(variable.values)
        nco.sync()

    nco.close()
    return datasets_to_add, datasets_to_update, datasets_to_archive
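Note that `dask.set_options(get=dask.async.get_sync)` in the snippet above is a pre-1.0 dask API (`dask.async` is not even importable on modern Python, where `async` is a keyword). As a hedged sketch only, the equivalent synchronous store with a current dask release (assumed >= 2.x, with toy stand-ins for the variable data and the NetCDF target) would be:

import dask
import dask.array as da
import numpy as np

# Toy stand-ins for `variable.data` (a dask array) and `nco[name]` (a writable target).
source = da.from_array(np.arange(12).reshape(3, 4), chunks=(1, 4))
target = np.zeros((3, 4))

# Modern replacement for `with dask.set_options(get=dask.async.get_sync): ...`:
# force the single-threaded scheduler while materialising the store.
with dask.config.set(scheduler='synchronous'):
    da.store(source, target, lock=True)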
def make_datasets(tile, file_path, config):
    def _make_dataset(labels, sources):
        new_dataset = make_dataset(product=tile.product,
                                   sources=sources,
                                   extent=tile.geobox.extent,
                                   center_time=labels['time'],
                                   uri=file_path.absolute().as_uri(),
                                   app_info=get_app_metadata(config),
                                   valid_data=sources[0].extent)
        return new_dataset

    return xr_apply(tile.sources, _make_dataset, dtype='O')
def do_stack_task(config, task):
    global_attributes = config['global_attributes']
    global_attributes['history'] = get_history_attribute(config, task)

    variable_params = config['variable_params']
    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    output_filename = Path(task['output_filename'])
    output_uri = output_filename.absolute().as_uri()
    temp_filename = get_temp_file(output_filename)
    tile = task['tile']

    # Only use the time chunk size (eg 5), but not spatial chunks
    # This means the file only gets opened once per band, and all data is available when compressing on write
    # 5 * 4000 * 4000 * 2bytes == 152MB, so mem usage is not an issue
    chunk_profile = {'time': config['storage']['chunking']['time']}

    data = datacube.api.GridWorkflow.load(tile, dask_chunks=chunk_profile)

    unwrapped_datasets = xr_apply(tile.sources, _unwrap_dataset_list, dtype='O')
    data['dataset'] = datasets_to_doc(unwrapped_datasets)

    try:
        nco = create_netcdf_storage_unit(temp_filename, data.crs, data.coords, data.data_vars,
                                         variable_params, global_attributes)
        write_data_variables(data.data_vars, nco)
        nco.close()

        temp_filename.rename(output_filename)

        if config.get('check_data_identical', False):
            new_tile = make_updated_tile(unwrapped_datasets, output_uri, tile.geobox)
            new_data = datacube.api.GridWorkflow.load(new_tile, dask_chunks=chunk_profile)
            check_identical(data, new_data, output_filename)

    except Exception as e:
        if temp_filename.exists():
            temp_filename.unlink()
        raise e

    return unwrapped_datasets, output_uri
def do_ndvi_task(config, task):
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    file_path = Path(task['filename'])
    output_type = config['ndvi_dataset_type']
    measurement = output_type.measurements['ndvi']
    output_dtype = np.dtype(measurement['dtype'])
    nodata_value = np.dtype(output_dtype).type(measurement['nodata'])

    if file_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(file_path))

    measurements = ['red', 'nir']

    nbar_tile = task['nbar']
    nbar = GridWorkflow.load(nbar_tile, measurements)

    ndvi = calculate_ndvi(nbar, nodata=nodata_value, dtype=output_dtype, units=measurement['units'])

    def _make_dataset(labels, sources):
        assert len(sources)
        geobox = nbar.geobox
        source_data = union_points(*[dataset.extent.to_crs(geobox.crs).points for dataset in sources])
        valid_data = intersect_points(geobox.extent.points, source_data)
        dataset = make_dataset(product=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset

    datasets = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    ndvi['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(
        dataset=ndvi,
        filename=Path(file_path),
        global_attributes=global_attributes,
        variable_params=variable_params,
    )

    return datasets
def ingest_work(driver_manager, config, source_type, output_type, tile, tile_index):
    _LOG.info('Starting task %s', tile_index)

    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements, fuse_func=fuse_func,
                                  driver_manager=driver_manager)
    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in Dataarray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    # Until ingest becomes a class and DriverManager an instance
    # variable, we call the constructor each time. DriverManager being
    # a singleton, there is little overhead, though.
    datasets.attrs['storage_output'] = driver_manager.write_dataset_to_storage(nudata, file_path,
                                                                               global_attributes,
                                                                               variable_params)
    _LOG.info('Finished task %s', tile_index)

    # When using multiproc executor, Driver Manager is a clone.
    if driver_manager.is_clone:
        driver_manager.close()

    return datasets
def _do_fc_task(config, task):
    """
    Load data, run FC algorithm, attach metadata, and write output.

    :param dict config: Config object
    :param dict task: Dictionary of tasks
    :return: Dataset objects representing the generated data that can be added to the index
    :rtype: list(datacube.model.Dataset)
    """
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    file_path = Path(task['filename'])
    output_product = config['fc_product']

    if file_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(file_path))

    nbart_tile: Tile = task['nbart']
    nbart = GridWorkflow.load(nbart_tile, ['green', 'red', 'nir', 'swir1', 'swir2'])

    output_measurements = config['fc_product'].measurements.values()
    fc_dataset = _make_fc_tile(nbart, output_measurements, config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbart.geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=_get_app_metadata(config),
                               valid_data=polygon_from_sources_extents(sources, nbart.geobox))
        return dataset

    datasets = xr_apply(nbart_tile.sources, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(
        dataset=fc_dataset,
        filename=file_path,
        global_attributes=global_attributes,
        variable_params=variable_params,
    )

    return datasets
def _dask_load(sources, geobox, measurements, dask_chunks,
               skip_broken_datasets=False, extra_dims=None):
    chunk_sizes = _calculate_chunk_sizes(sources, geobox, dask_chunks, extra_dims)
    needed_irr_chunks = chunk_sizes[0]
    if extra_dims:
        extra_dim_chunks = chunk_sizes[1]
    grid_chunks = chunk_sizes[-1]
    gbt = GeoboxTiles(geobox, grid_chunks)
    dsk = {}

    def chunk_datasets(dss, gbt):
        out = {}
        for ds in dss:
            dsk[_tokenize_dataset(ds)] = ds
            for idx in gbt.tiles(ds.extent):
                out.setdefault(idx, []).append(ds)
        return out

    chunked_srcs = xr_apply(sources,
                            lambda _, dss: chunk_datasets(dss, gbt),
                            dtype=object)

    def data_func(measurement, shape):
        if 'extra_dim' in measurement:
            chunks = needed_irr_chunks + extra_dim_chunks + grid_chunks
        else:
            chunks = needed_irr_chunks + grid_chunks
        return _make_dask_array(chunked_srcs, dsk, gbt,
                                measurement,
                                chunks=chunks,
                                skip_broken_datasets=skip_broken_datasets,
                                extra_dims=extra_dims)

    return Datacube.create_storage(sources.coords, geobox, measurements, data_func, extra_dims)
def ingest_work(config, source_type, output_type, tile, tile_index):
    _LOG.info('Starting task %s', tile_index)

    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements, fuse_func=fuse_func)
    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in Dataarray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, file_path, global_attributes, variable_params)
    _LOG.info('Finished task %s', tile_index)

    return datasets
def do_fc_task(config, task):
    global_attributes = config['global_attributes']
    variable_params = config['variable_params']
    file_path = Path(task['filename'])
    output_product = config['fc_product']

    if file_path.exists():
        raise OSError(errno.EEXIST, 'Output file already exists', str(file_path))

    nbar_tile: Tile = task['nbar']
    nbar = GridWorkflow.load(nbar_tile, ['green', 'red', 'nir', 'swir1', 'swir2'])

    output_measurements = config['fc_product'].measurements.values()
    fc_dataset = make_fc_tile(nbar, output_measurements, config.get('sensor_regression_coefficients'))

    def _make_dataset(labels, sources):
        assert sources
        dataset = make_dataset(product=output_product,
                               sources=sources,
                               extent=nbar.geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config),
                               valid_data=GeoPolygon.from_sources_extents(sources, nbar.geobox))
        return dataset

    datasets = xr_apply(nbar_tile.sources, _make_dataset, dtype='O')
    fc_dataset['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(
        dataset=fc_dataset,
        filename=file_path,
        global_attributes=global_attributes,
        variable_params=variable_params,
    )

    return datasets
def map(self, func, dtype='O'):
    return DatasetPile(xr_apply(self.pile, func, dtype=dtype), self.geobox)
def map(self, func, dtype='O'):
    return VirtualDatasetBox(xr_apply(self.box, func, dtype=dtype),
                             self.geobox,
                             self.load_natively,
                             self.product_definitions,
                             geopolygon=self.geopolygon)
def map(self, func, dtype='O'):
    return VirtualDatasetBox(xr_apply(self.pile, func, dtype=dtype),
                             self.geobox,
                             self.product_definitions)
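These `map` wrappers simply thread the `xr_apply` callback contract through their containers, so a caller can transform every cell while keeping the geobox and product metadata attached. As a usage illustration only (the variable names below are hypothetical, not taken from the library), a caller could replace each cell's tuple of datasets with its count:

# Assuming `box` is a VirtualDatasetBox whose cells hold tuples of datasets,
# this yields a box of per-timestamp dataset counts.
counts = box.map(lambda labels, datasets: len(datasets), dtype=int)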
def _find_source_datasets(self, stat: OutputProduct, uri: str = None,
                          band_uris: dict = None) -> xarray.DataArray:
    """
    Find all the source datasets for a task

    Put them in order so that they can be assigned to a stacked output
    aligned against its time dimension

    :return: (datasets, sources)
        datasets is a bunch of strings to dump, indexed on time
        sources is more structured. An x-array of lists of dataset sources, indexed on time
    """
    task = self._task
    geobox = self._task.geobox
    app_info = self._app_info

    def add_all(iterable):
        return reduce_(operator.add, iterable)

    def merge_sources(prod):
        # Merge data sources and mask sources
        # Align the data `Tile` with potentially many mask `Tile`s along their time axis
        all_sources = xarray.align(prod.data.sources,
                                   *[mask_tile.sources for mask_tile in prod.masks if mask_tile])

        # TODO: The following can fail if prod.data and prod.masks have different times
        # Which can happen in the case of a missing PQ Scene, where there is a scene overlap
        # ie. Two overlapped NBAR scenes, One PQ scene (the later)
        return add_all(sources_.sum() for sources_ in all_sources)

    sources = add_all(merge_sources(prod) for prod in task.sources)

    def unique(index, dataset_tuple):
        return tuple(set(dataset_tuple))

    sources = xr_apply(sources, unique, dtype='O')

    # Sources has no time at this point, so insert back in the start of our stats epoch
    start_time, _ = task.time_period
    sources = unsqueeze_data_array(sources, dim='time', pos=0,
                                   coord=start_time, attrs=task.time_attributes)

    if not sources:
        raise StatsOutputError('No valid sources found, or supplied sources do not align to the same time.\n'
                               'Unable to write dataset metadata.')

    def _make_dataset(labels, sources_):
        return make_dataset(product=stat.product,
                            sources=sources_,
                            extent=geobox.extent,
                            center_time=labels['time'],
                            uri=uri,
                            band_uris=band_uris,
                            app_info=app_info,
                            valid_data=polygon_from_sources_extents(sources_, geobox))

    datasets = xr_apply(sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    datasets = datasets_to_doc(datasets)
    return datasets
def group(self, datasets: VirtualDatasetBag, **search_terms: Dict[str, Any]) -> VirtualDatasetBox:
    """
    Datasets grouped by their timestamps.

    :param datasets: the `VirtualDatasetBag` to fetch data from
    :param query: to specify a spatial sub-region
    """
    grid_spec = datasets.grid_spec
    geopolygon = datasets.geopolygon

    if 'product' in self:
        # select only those inside the ROI
        # ROI could be smaller than the query for the `query` method
        if query_geopolygon(**search_terms) is not None:
            geopolygon = query_geopolygon(**search_terms)
            selected = list(select_datasets_inside_polygon(datasets.pile, geopolygon))
        else:
            selected = list(datasets.pile)

        # geobox
        merged = merge_search_terms(select_keys(self, self._NON_SPATIAL_KEYS),
                                    select_keys(search_terms, self._NON_SPATIAL_KEYS))

        geobox = output_geobox(datasets=selected,
                               grid_spec=grid_spec,
                               geopolygon=geopolygon,
                               **select_keys(merged, self._GEOBOX_KEYS))

        # group by time
        group_query = query_group_by(**select_keys(merged, self._GROUPING_KEYS))

        # information needed for Datacube.load_data
        return VirtualDatasetBox(Datacube.group_datasets(selected, group_query),
                                 geobox,
                                 datasets.product_definitions)

    elif 'transform' in self:
        return self._input.group(datasets, **search_terms)

    elif 'collate' in self:
        self._assert('collate' in datasets.pile and len(datasets.pile['collate']) == len(self._children),
                     "invalid dataset pile")

        def build(source_index, product, dataset_pile):
            grouped = product.group(VirtualDatasetBag(dataset_pile, datasets.grid_spec,
                                                      datasets.geopolygon, datasets.product_definitions),
                                    **search_terms)

            def tag(_, value):
                return {'collate': (source_index, value)}

            return grouped.map(tag)

        groups = [build(source_index, product, dataset_pile)
                  for source_index, (product, dataset_pile)
                  in enumerate(zip(self._children, datasets.pile['collate']))]

        return VirtualDatasetBox(xarray.concat([grouped.pile for grouped in groups], dim='time'),
                                 select_unique([grouped.geobox for grouped in groups]),
                                 merge_dicts([grouped.product_definitions for grouped in groups]))

    elif 'juxtapose' in self:
        self._assert('juxtapose' in datasets.pile and len(datasets.pile['juxtapose']) == len(self._children),
                     "invalid dataset pile")

        groups = [product.group(VirtualDatasetBag(dataset_pile, datasets.grid_spec,
                                                  datasets.geopolygon, datasets.product_definitions),
                                **search_terms)
                  for product, dataset_pile in zip(self._children, datasets.pile['juxtapose'])]

        aligned_piles = xarray.align(*[grouped.pile for grouped in groups])

        def tuplify(indexes, _):
            return {'juxtapose': [pile.sel(**indexes).item() for pile in aligned_piles]}

        return VirtualDatasetBox(xr_apply(aligned_piles[0], tuplify),
                                 select_unique([grouped.geobox for grouped in groups]),
                                 merge_dicts([grouped.product_definitions for grouped in groups]))

    else:
        raise VirtualProductException("virtual product was not validated")
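The 'juxtapose' branch above leans on `xarray.align` (whose default `join='inner'`) to restrict every child grouping to the timestamps they share before zipping cells together via `tuplify`. A self-contained sketch of that align-then-zip pattern with plain xarray, using toy string payloads in place of dataset piles, might be:

import numpy as np
import xarray as xr

# Two object-typed "piles" indexed by time, with partially overlapping timestamps.
times_a = np.array(['2020-01-01', '2020-01-02'], dtype='datetime64[ns]')
times_b = np.array(['2020-01-02', '2020-01-03'], dtype='datetime64[ns]')
pile_a = xr.DataArray(np.array(['a1', 'a2'], dtype=object), dims=['time'], coords={'time': times_a})
pile_b = xr.DataArray(np.array(['b2', 'b3'], dtype=object), dims=['time'], coords={'time': times_b})

# The default join='inner' keeps only the shared timestamps, mirroring the alignment step above.
aligned_a, aligned_b = xr.align(pile_a, pile_b)

# Zip the aligned cells into tuples, as tuplify() does for the 'juxtapose' key.
zipped = np.empty(aligned_a.shape, dtype=object)
for i, pair in enumerate(zip(aligned_a.values, aligned_b.values)):
    zipped[i] = pair
juxtaposed = xr.DataArray(zipped, dims=['time'], coords={'time': aligned_a.coords['time']})
print(juxtaposed.values.tolist())  # [('a2', 'b2')]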