def execute_task(task: StatsTask, output_driver, chunking) -> StatsTask:
    """
    Load data, run the statistical operations and write the results out to the filesystem.

    :param datacube_stats.models.StatsTask task:
    :type output_driver: OutputDriver
    :param chunking: dict of dimension sizes to chunk the computation by
    """
    timer = MultiTimer().start('total')
    datacube.set_options(reproject_threads=1)

    process_chunk = load_process_save_chunk_iteratively if task.is_iterative else load_process_save_chunk

    try:
        with output_driver(task=task) as output_files:
            # Currently, polygon processing loads the entire tile when no chunking is given
            if len(chunking) == 0:
                chunking = {'x': task.sample_tile.shape[2], 'y': task.sample_tile.shape[1]}
            for sub_tile_slice in tile_iter(task.sample_tile, chunking):
                process_chunk(output_files, sub_tile_slice, task, timer)
    except OutputFileAlreadyExists as e:
        _LOG.warning(str(e))
    except OutputDriverResult as e:
        # Was run interactively; re-raise the result so it can be caught
        # again by StatsApp.execute_task
        raise e
    except Exception as e:
        _LOG.error("Error processing task: %s", task)
        raise StatsProcessingException("Error processing task: %s" % task)

    timer.pause('total')
    _LOG.debug('Completed %s %s task with %s data sources; %s',
               task.tile_index,
               [d.strftime('%Y-%m-%d') for d in task.time_period],
               task.data_sources_length(),
               timer)
    return task
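A small standalone illustration of the chunking fallback above, assuming sample_tile.shape is ordered (time, y, x) as the indexing implies; the shape value here is made up:

# Hypothetical tile shape ordered (time, y, x); with an empty chunking dict
# the whole spatial extent becomes a single chunk.
sample_shape = (1, 4000, 4000)
chunking = {}
if len(chunking) == 0:
    chunking = {'x': sample_shape[2], 'y': sample_shape[1]}
print(chunking)  # {'x': 4000, 'y': 4000}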
def assert_same_read_results(source, dst_shape, dst_dtype, dst_transform, dst_nodata, dst_projection, resampling):
    expected = np.empty(dst_shape, dtype=dst_dtype)
    with source.open() as src:
        rasterio.warp.reproject(src.data,
                                expected,
                                src_transform=src.transform,
                                src_crs=str(src.crs),
                                src_nodata=src.nodata,
                                dst_transform=dst_transform,
                                dst_crs=str(dst_projection),
                                dst_nodata=dst_nodata,
                                resampling=resampling)

    result = np.full(dst_shape, dst_nodata, dtype=dst_dtype)
    H, W = dst_shape
    dst_gbox = GeoBox(W, H, dst_transform, dst_projection)
    with datacube.set_options(reproject_threads=1):
        with source.open() as rdr:
            read_time_slice(rdr,
                            result,
                            dst_gbox,
                            dst_nodata=dst_nodata,
                            resampling=resampling)

    assert np.isclose(result, expected, atol=0, rtol=0.05, equal_nan=True).all()
    return result
def ingest_work(config, source_type, output_type, tile, tile_index):
    _LOG.info('Starting task %s', tile_index)
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements, fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources, version=config['taskfile_version'])

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, file_path, global_attributes, variable_params)
    _LOG.info('Finished task %s', tile_index)
    return datasets
def data(self, datasets, mask=False, manual_merge=False, skip_corrections=False, **kwargs):
    # pylint: disable=too-many-locals, consider-using-enumerate
    if mask:
        prod = self._product.pq_product
        measurements = [prod.measurements[self._product.pq_band].copy()]
    else:
        prod = self._product.product
        measurements = [prod.measurements[name].copy() for name in self.needed_bands()]

    with datacube.set_options(reproject_threads=1, fast_load=True):
        if manual_merge:
            return self.manual_data_stack(datasets, measurements, mask, skip_corrections, **kwargs)
        elif self._product.solar_correction and not mask and not skip_corrections:
            # Merge performed already by dataset extent, but we need to
            # process the data for the datasets individually to do solar correction.
            merged = None
            for ds in datasets:
                d = read_data(ds, measurements, self._geobox, **kwargs)
                for band in self.needed_bands():
                    if band != self._product.pq_band:
                        d[band] = solar_correct_data(d[band], ds)
                if merged is None:
                    merged = d
                else:
                    merged = merged.combine_first(d)
            return merged
        else:
            data = read_data(datasets, measurements, self._geobox, self._resampling, **kwargs)
            return data
def ingest_work(config, source_type, output_type, index, sources, geobox):
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.product_data(sources, geobox, measurements, fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, index, sources)

    def _make_dataset(labels, sources):
        sources_union = union_points(*[source.extent.to_crs(geobox.crs).points for source in sources])
        valid_data = intersect_points(geobox.extent.points, sources_union)
        dataset = make_dataset(dataset_type=output_type,
                               sources=sources,
                               extent=geobox.extent,
                               center_time=labels['time'],
                               uri=file_path.absolute().as_uri(),
                               app_info=get_app_metadata(config, config['filename']),
                               valid_data=GeoPolygon(valid_data, geobox.crs))
        return dataset

    datasets = xr_apply(sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    write_dataset_to_netcdf(nudata, global_attributes, variable_params, file_path)
    return datasets
def _test_helper(source, dst_shape, dst_dtype, dst_transform, dst_nodata, dst_projection, resampling):
    expected = numpy.empty(dst_shape, dtype=dst_dtype)
    with source.open() as src:
        rasterio.warp.reproject(src.data,
                                expected,
                                src_transform=src.transform,
                                src_crs=str(src.crs),
                                src_nodata=src.nodata,
                                dst_transform=dst_transform,
                                dst_crs=str(dst_projection),
                                dst_nodata=dst_nodata,
                                resampling=resampling)

    result = numpy.empty(dst_shape, dtype=dst_dtype)
    with datacube.set_options(reproject_threads=1):
        read_from_source(source,
                         result,
                         dst_transform=dst_transform,
                         dst_nodata=dst_nodata,
                         dst_projection=dst_projection,
                         resampling=resampling)

    assert numpy.isclose(result, expected, atol=0, rtol=0.05, equal_nan=True).all()
    return result
def ingest_work(config, source_type, output_type, tile, tile_index):
    # pylint: disable=too-many-locals
    _LOG.info('Starting task %s', tile_index)
    driver = storage_writer_by_name(config['storage']['driver'])

    if driver is None:
        _LOG.error('Failed to load storage driver %s', config['storage']['driver'])
        raise ValueError('Could not find the storage driver named by the storage.driver option')

    namemap = get_namemap(config)
    # TODO: get_measurements possibly changes dtype; not sure load_data would like that
    measurements = get_measurements(source_type, config)
    resampling = get_resampling(config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements,
                                  resampling=resampling,
                                  fuse_func=fuse_func)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def mk_uri(file_path):
        if driver.uri_scheme == "file":
            return file_path.absolute().as_uri()
        return '{}://{}'.format(driver.uri_scheme, file_path)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=mk_uri(file_path),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    variable_params['dataset'] = {
        'chunksizes': (1,),
        'zlib': True,
        'complevel': 9,
    }

    storage_metadata = driver.write_dataset_to_storage(nudata, file_path,
                                                       global_attributes=global_attributes,
                                                       variable_params=variable_params,
                                                       storage_config=config['storage'])

    if (storage_metadata is not None) and len(storage_metadata) > 0:
        datasets.attrs['storage_metadata'] = storage_metadata

    _LOG.info('Finished task %s', tile_index)
    return datasets
def data(self, datasets):
    holder = numpy.empty(shape=tuple(), dtype=object)
    holder[()] = datasets
    sources = xarray.DataArray(holder)

    prod = datasets[0].type
    measurements = [self._set_resampling(prod.measurements[name]) for name in self._bands]

    with datacube.set_options(reproject_threads=1, fast_load=True):
        return datacube.Datacube.load_data(sources, self._geobox, measurements)
def _load_data(dc, geobox, product, bands, time_):
    to_load = _get_datasets(dc, geobox, product, time_)
    holder = numpy.empty(shape=tuple(), dtype=object)
    holder[()] = to_load
    sources = xarray.DataArray(holder)

    prod = dc.index.products.get_by_name(product)
    measurements = [_set_resampling(m, 'cubic')
                    for name, m in prod.measurements.items()
                    if name in bands]

    with datacube.set_options(reproject_threads=1):
        return dc.load_data(sources, geobox, measurements)
def data(self, datasets, mask=False, manual_merge=False, skip_corrections=False, use_overviews=False, **kwargs):
    # pylint: disable=too-many-locals, consider-using-enumerate
    if mask:
        prod = self._product.pq_product
        measurements = [prod.measurements[self._product.pq_band].copy()]
    else:
        prod = self._product.product
        measurements = [prod.measurements[name].copy() for name in self.needed_bands()]

    with datacube.set_options(reproject_threads=1, fast_load=True):
        if manual_merge:
            return self.manual_data_stack(datasets, measurements, mask, skip_corrections, use_overviews, **kwargs)
        elif self._product.solar_correction and not mask and not skip_corrections:
            # Merge performed already by dataset extent, but we need to
            # process the data for the datasets individually to do solar correction.
            merged = None
            for i in range(0, len(datasets)):
                holder = numpy.empty(shape=tuple(), dtype=object)
                ds = datasets[i]
                d = read_data(ds, measurements, self._geobox, use_overviews, **kwargs)
                for band in self.needed_bands():
                    if band != self._product.pq_band:
                        d[band] = solar_correct_data(d[band], ds)
                if merged is None:
                    merged = d
                else:
                    merged = merged.combine_first(d)
            return merged
        else:
            # Merge performed already by dataset extent
            if isinstance(datasets, xarray.DataArray):
                sources = datasets
            else:
                holder = numpy.empty(shape=tuple(), dtype=object)
                holder[()] = datasets
                sources = xarray.DataArray(holder)
            data = read_data(datasets, measurements, self._geobox, use_overviews, **kwargs)
            return data
def data(self, datasets, mask=False, manual_merge=False):
    if mask:
        prod = self._product.pq_product
        measurements = [self._set_resampling(prod.measurements[self._product.pq_band])]
    else:
        prod = self._product.product
        measurements = [self._set_resampling(prod.measurements[name]) for name in self.needed_bands()]

    with datacube.set_options(reproject_threads=1, fast_load=True):
        if manual_merge:
            datas = []
            for i in range(0, len(datasets)):
                j = i + 1
                holder = numpy.empty(shape=tuple(), dtype=object)
                holder[()] = datasets[i:j]
                sources = xarray.DataArray(holder)
                datas.append(datacube.Datacube.load_data(sources, self._geobox, measurements))

            merged = None
            if mask:
                band = self._product.pq_band
            else:
                for band in self.needed_bands():
                    break
            for d in datas:
                extent_mask = self._product.extent_mask_func(d, band)
                dm = d.where(extent_mask)
                if merged is None:
                    merged = dm
                else:
                    merged = merged.combine_first(dm)

            if mask:
                merged = merged.astype('uint8', copy=True)
                merged[band].attrs = d[band].attrs
            return merged
        else:
            if isinstance(datasets, xarray.DataArray):
                sources = datasets
            else:
                holder = numpy.empty(shape=tuple(), dtype=object)
                holder[()] = datasets
                sources = xarray.DataArray(holder)
            return datacube.Datacube.load_data(sources, self._geobox, measurements)
def ingest_work(driver_manager, config, source_type, output_type, tile, tile_index):
    _LOG.info('Starting task %s', tile_index)
    namemap = get_namemap(config)
    measurements = get_measurements(source_type, config)
    variable_params = get_variable_params(config)
    global_attributes = config['global_attributes']

    with datacube.set_options(reproject_threads=1):
        fuse_func = {'copy': None}[config.get(FUSER_KEY, 'copy')]
        data = Datacube.load_data(tile.sources, tile.geobox, measurements,
                                  fuse_func=fuse_func,
                                  driver_manager=driver_manager)

    nudata = data.rename(namemap)
    file_path = get_filename(config, tile_index, tile.sources)

    def _make_dataset(labels, sources):
        return make_dataset(product=output_type,
                            sources=sources,
                            extent=tile.geobox.extent,
                            center_time=labels['time'],
                            uri=file_path.absolute().as_uri(),
                            app_info=get_app_metadata(config, config['filename']),
                            valid_data=GeoPolygon.from_sources_extents(sources, tile.geobox))

    datasets = xr_apply(tile.sources, _make_dataset, dtype='O')  # Store in DataArray to associate Time -> Dataset
    nudata['dataset'] = datasets_to_doc(datasets)

    # Until ingest becomes a class and the DriverManager an instance variable,
    # the constructor is called each time. Since DriverManager is a singleton,
    # there is little overhead, though.
    datasets.attrs['storage_output'] = driver_manager.write_dataset_to_storage(nudata, file_path,
                                                                               global_attributes,
                                                                               variable_params)
    _LOG.info('Finished task %s', tile_index)

    # When using the multiproc executor, the DriverManager is a clone.
    if driver_manager.is_clone:
        driver_manager.close()

    return datasets
def data(self, datasets, mask=False):
    if isinstance(datasets, xarray.DataArray):
        sources = datasets
    else:
        holder = numpy.empty(shape=tuple(), dtype=object)
        holder[()] = datasets
        sources = xarray.DataArray(holder)

    if mask:
        prod = self._product.pq_product
        measurements = [self._set_resampling(prod.measurements[self._product.pq_band])]
    else:
        prod = self._product.product
        measurements = [self._set_resampling(prod.measurements[name]) for name in self.needed_bands()]

    with datacube.set_options(reproject_threads=1, fast_load=True):
        return datacube.Datacube.load_data(sources, self._geobox, measurements)
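Several of the snippets above wrap a plain Python list of datasets as the single element of a zero-dimensional object array before handing it to load_data. A standalone sketch of just that wrapping, with strings standing in for Dataset objects:

import numpy
import xarray

def wrap_datasets(datasets):
    # A 0-d object array stores the whole list as one element, so the
    # resulting DataArray has a single cell containing the list.
    holder = numpy.empty(shape=tuple(), dtype=object)
    holder[()] = datasets
    return xarray.DataArray(holder)

sources = wrap_datasets(['dataset-a', 'dataset-b'])  # stand-ins for Dataset objects
assert sources.shape == ()
assert sources.values[()] == ['dataset-a', 'dataset-b']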
Dependencies in this code:
- csv file with the lat/lon coordinates of the case study bounding box/es

Accompanying code:
- Extract_AGDC_for_study_sites.ipynb - steps through this code with explanations of how it is put together.
  See the accompanying code for more detailed explanations and examples.
- Run_AGDC_extraction - PBS submission code to generate single-CPU jobs for each study site
'''

# Import the libraries we need in the code and tell matplotlib to display the plots here
import fiona
import shapely.geometry
import rasterio
import rasterio.features
import datacube
datacube.set_options(reproject_threads=1)
import numpy as np
from datacube.storage import masking
import matplotlib.pyplot as plt
import xarray as xr
import scipy.stats
import pandas
import os
import sys
from affine import Affine

# Set up some functions to use later in the code


def warp_geometry(geom, src_crs, dst_crs):
    """
    warp geometry from src_crs to dst_crs