def collect_tasks(self, workflow, time_period, sources_spec, tile_index=None):
    """
    Build the set of StatsTasks covering one time period.

    Tasks are grouped by tile index and may contain sources from multiple
    source specs; each source may in turn be masked by multiple masks.
    """
    # pylint: disable=too-many-locals
    tasks_by_tile = {}

    for source_index, source_spec in enumerate(sources_spec):
        epoch_range = filter_time_by_source(source_spec.get('time'), time_period)
        if epoch_range is None:
            _LOG.info("Datasets not included for %s and time range for %s",
                      source_spec['product'], time_period)
            continue

        grouping = source_spec.get('group_by', DEFAULT_GROUP_BY)

        # Main product first, followed by the product of each mask.
        products = [source_spec['product']]
        products += [mask['product'] for mask in source_spec.get('masks', [])]

        # Only the main product takes a source filter.
        product_query = {products[0]: {'source_filter': source_spec.get('source_filter', None)}}

        (data, *mask_cells), unmatched = multi_product_list_cells(
            products, workflow,
            product_query=product_query,
            cell_index=tile_index,
            time=epoch_range,
            group_by=grouping,
            geopolygon=self.geopolygon)

        self._total_unmatched += report_unmatched_datasets(unmatched[0], _LOG.warning)

        for tile, cell_sources in data.items():
            spatial_id = {'x': tile[0], 'y': tile[1]}
            task = tasks_by_tile.setdefault(
                tile, StatsTask(time_period=epoch_range, spatial_id=spatial_id))
            task.sources.append(
                DataSource(data=cell_sources,
                           masks=[m.get(tile) for m in mask_cells],
                           spec=source_spec,
                           source_index=source_index))

    return list(tasks_by_tile.values())
def __call__(self, index, sources_spec, date_ranges) -> Iterator[StatsTask]:
    """
    Generate one StatsTask per input region and time period.

    :param index: database index
    :param sources_spec: source specifications (product, masks, optional
        time range and group_by) to load for each task
    :param date_ranges: time periods to generate tasks for
    :return: iterator of StatsTask objects with their sources attached
    """
    for input_region in self.input_region:
        make_tile = ArbitraryTileMaker(index, input_region, self.storage)

        for time_period in date_ranges:
            task = StatsTask(time_period=time_period)
            _LOG.info('Making output product tasks for time period: %s', time_period)

            for source_index, source_spec in enumerate(sources_spec):
                epoch_range = filter_time_by_source(source_spec.get('time'), time_period)
                if epoch_range is None:
                    _LOG.info("Datasets not included for %s and time range for %s",
                              source_spec['product'], time_period)
                    continue

                grouping = source_spec.get('group_by', DEFAULT_GROUP_BY)

                # Build the data tile plus one tile per mask product.
                tile_data = make_tile(product=source_spec['product'],
                                      time=epoch_range,
                                      group_by=grouping)
                mask_tiles = [make_tile(product=mask['product'],
                                        time=epoch_range,
                                        group_by=grouping)
                              for mask in source_spec.get('masks', [])]

                if len(tile_data.sources.time) == 0:
                    _LOG.info("No matched for product %s", source_spec['product'])
                    continue

                task.sources.append(DataSource(data=tile_data,
                                               masks=mask_tiles,
                                               spec=source_spec,
                                               source_index=source_index))
                _LOG.info("make tile finished")

            if task.sources:
                # filter_task takes a Tile containing sources and returns a new 'filtered' Tile
                task = self.filter_task(task, input_region, date_ranges)
                _LOG.info('Created task for time period: %s', time_period)
                yield task
def execute_task(task: StatsTask, output_driver, chunking) -> StatsTask:
    """
    Load data, run the statistical operations and write results out to the filesystem.

    :param datacube_stats.models.StatsTask task:
    :type output_driver: OutputDriver
    :param chunking: dict of dimension sizes to chunk the computation by
    :raises StatsProcessingException: on any unexpected error while processing
        the task (the original exception is attached as the cause)
    """
    timer = MultiTimer().start('total')
    # Iterative tasks accumulate chunk results incrementally; others process in one pass.
    process_chunk = load_process_save_chunk_iteratively if task.is_iterative else load_process_save_chunk

    try:
        with output_driver(task=task) as output_files:
            # currently for polygons process will load entirely
            if len(chunking) == 0:
                chunking = {
                    'x': task.sample_tile.shape[2],
                    'y': task.sample_tile.shape[1]
                }
            for sub_tile_slice in tile_iter(task.sample_tile, chunking):
                process_chunk(output_files, sub_tile_slice, task, timer)
    except OutputFileAlreadyExists as e:
        _LOG.warning(str(e))
    except OutputDriverResult as e:
        # was run interactively
        # re-raise result to be caught again by StatsApp.execute_task
        raise e
    except Exception as e:
        # Log with the traceback and chain the original cause so callers
        # catching StatsProcessingException can still inspect what failed.
        _LOG.exception("Error processing task: %s", task)
        raise StatsProcessingException("Error processing task: %s" % task) from e

    timer.pause('total')
    _LOG.debug('Completed %s %s task with %s data sources; %s',
               task.spatial_id,
               [d.strftime('%Y-%m-%d') for d in task.time_period],
               task.data_sources_length(), timer)
    return task
def __call__(self, index, sources_spec, date_ranges) -> Iterator[StatsTask]:
    """
    :param index: database index
    :return: an iterator of StatTask objects to execute
    """
    # When the input region is not from a shapefile there are no features;
    # run the loop once with a placeholder feature.
    features = self.features if self.features is not None else [None]

    for feature in features:
        feature_id = ('(none)' if feature is None or feature.id is None
                      else str(feature.id))

        for time_period in date_ranges:
            task = StatsTask(time_period=time_period,
                             spatial_id={'feature_id': feature_id},
                             feature=feature)
            _LOG.info('Making output product tasks for time period: %s, feature: %s',
                      time_period, feature_id)

            for source_index, source_spec in enumerate(sources_spec):
                epoch_range = filter_time_by_source(source_spec.get('time'), time_period)
                if epoch_range is None:
                    _LOG.info("Datasets not included for %s and time range for %s",
                              source_spec['product'], time_period)
                    continue

                # One tile maker, pre-bound with everything but the product.
                make_tile = partial(
                    ArbitraryTileMaker(self.input_region, feature, self.storage),
                    index=index,
                    time=epoch_range,
                    group_by=source_spec.get('group_by', DEFAULT_GROUP_BY))

                tile_data = make_tile(product=source_spec['product'])
                mask_tiles = [make_tile(product=mask['product'])
                              for mask in source_spec.get('masks', [])]

                if len(tile_data.sources.time) == 0:
                    _LOG.info("No matched for product %s", source_spec['product'])
                    continue

                task.sources.append(DataSource(data=tile_data,
                                               masks=mask_tiles,
                                               spec=source_spec,
                                               source_index=source_index))
                _LOG.info("make tile finished")

            if task.sources:
                # filter_task takes a Tile containing sources and returns a new 'filtered' Tile
                task = self.filter_task(task, feature, date_ranges)
                _LOG.info('Created task for time period: %s', time_period)
                yield task