def query(self, dc: Datacube, **search_terms: Dict[str, Any]) -> VirtualDatasetBag:
    product = dc.index.products.get_by_name(self._product)
    if product is None:
        raise VirtualProductException("could not find product {}".format(self._product))

    originals = Query(dc.index, **reject_keys(self, self._NON_QUERY_KEYS))
    overrides = Query(dc.index, **reject_keys(search_terms, self._NON_QUERY_KEYS))
    query = Query(dc.index, **merge_search_terms(originals.search_terms, overrides.search_terms))
    self._assert(query.product == self._product,
                 "query for {} returned another product {}".format(self._product, query.product))

    # find the datasets
    datasets = dc.index.datasets.search(**query.search_terms)
    if query.geopolygon is not None:
        datasets = select_datasets_inside_polygon(datasets, query.geopolygon)

    # should we put it in the Transformation class?
    if self.get('dataset_predicate') is not None:
        datasets = [dataset
                    for dataset in datasets
                    if self['dataset_predicate'](dataset)]

    return VirtualDatasetBag(list(datasets), query.geopolygon,
                             {product.name: product})
def test_query_kwargs():
    from mock import MagicMock

    mock_index = MagicMock()
    mock_index.datasets.get_field_names = lambda: {u'product', u'lat', u'sat_path', 'type_id', u'time', u'lon',
                                                   u'orbit', u'instrument', u'sat_row', u'platform',
                                                   'metadata_type', u'gsi', 'type', 'id'}

    query = Query(index=mock_index, product='ls5_nbar_albers')
    assert str(query)
    assert query.product == 'ls5_nbar_albers'
    assert query.search_terms['product'] == 'ls5_nbar_albers'

    query = Query(index=mock_index, latitude=(-35, -36), longitude=(148, 149))
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, latitude=-35, longitude=148)
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, y=(-4174726, -4180011), x=(1515184, 1523263), crs='EPSG:3577')
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, y=-4174726, x=1515184, crs='EPSG:3577')
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, y=-4174726, x=1515184, crs='EPSG:3577')
    assert query.geopolygon
    assert 'lat' in query.search_terms
    assert 'lon' in query.search_terms

    query = Query(index=mock_index, time='2001')
    assert 'time' in query.search

    query = Query(index=mock_index, time=('2001', '2002'))
    assert 'time' in query.search

    with pytest.raises(ValueError):
        Query(index=mock_index,
              y=-4174726, coordinate_reference_system='WGS84',
              x=1515184, crs='EPSG:3577')

    with pytest.raises(LookupError):
        Query(index=mock_index, y=-4174726, x=1515184, crs='EPSG:3577', made_up_key='NotReal')

    with pytest.raises(LookupError):
        query_group_by(group_by='magic')

    gb = query_group_by('time')
    assert isinstance(gb, GroupBy)
    assert query_group_by(group_by=gb) is gb
def query(self, dc: Datacube, **search_terms: Dict[str, Any]) -> VirtualDatasetBag: """ Collection of datasets that match the query. """ get = self.get if 'product' in self: product = dc.index.products.get_by_name(self._product) if product is None: raise VirtualProductException( "could not find product {}".format(self._product)) originals = Query(dc.index, **reject_keys(self, self._NON_QUERY_KEYS)) overrides = Query( dc.index, **reject_keys(search_terms, self._NON_QUERY_KEYS)) query = Query( dc.index, **merge_search_terms(originals.search_terms, overrides.search_terms)) self._assert( query.product == self._product, "query for {} returned another product {}".format( self._product, query.product)) # find the datasets datasets = dc.index.datasets.search(**query.search_terms) if query.geopolygon is not None: datasets = select_datasets_inside_polygon( datasets, query.geopolygon) # should we put it in the Transformation class? if get('dataset_predicate') is not None: datasets = [ dataset for dataset in datasets if self['dataset_predicate'](dataset) ] return VirtualDatasetBag(list(datasets), product.grid_spec, query.geopolygon, {product.name: product}) elif 'transform' in self: return self._input.query(dc, **search_terms) elif 'collate' in self or 'juxtapose' in self: result = [ child.query(dc, **search_terms) for child in self._children ] return VirtualDatasetBag( {self._kind: [datasets.pile for datasets in result]}, select_unique([datasets.grid_spec for datasets in result]), select_unique([datasets.geopolygon for datasets in result]), merge_dicts( [datasets.product_definitions for datasets in result])) else: raise VirtualProductException("virtual product was not validated")
def query(self, dc, **search_terms):
    # type: (Datacube, Dict[str, Any]) -> QueryResult
    """ Collection of datasets that match the query. """
    get = self.get

    if 'product' in self:
        originals = Query(dc.index, **reject_keys(self, self._NON_QUERY_KEYS))
        overrides = Query(dc.index, **reject_keys(search_terms, self._NON_QUERY_KEYS))
        query = Query(dc.index, **merge_search_terms(originals.search_terms, overrides.search_terms))
        self._assert(query.product == self._product,
                     "query for {} returned another product {}".format(self._product, query.product))

        # find the datasets
        datasets = select_datasets_inside_polygon(dc.index.datasets.search(**query.search_terms),
                                                  query.geopolygon)
        if get('dataset_predicate') is not None:
            datasets = [dataset
                        for dataset in datasets
                        if get('dataset_predicate')(dataset)]

        # gather information from the index before it disappears from sight
        # this could possibly also be extracted from the product definitions, but this is easier
        grid_spec = dc.index.products.get_by_name(self._product).grid_spec

        return QueryResult(datasets, grid_spec)

    elif 'transform' in self:
        return self._input.query(dc, **search_terms)

    elif 'collate' in self or 'juxtapose' in self:
        result = [child.query(dc, **search_terms) for child in self._children]
        grid_spec = select_unique([datasets.grid_spec for datasets in result])
        return QueryResult(result, grid_spec)

    else:
        raise VirtualProductException("virtual product was not validated")
def _find_periods_with_data(index, product_names, period_duration='1 day',
                            start_date='1985-01-01', end_date='2000-01-01'):
    """
    Search the datacube and find which periods contain data.

    This is very useful when running stats in the `daily` mode (which outputs a file for each day).
    It is very slow to create an output for every day regardless of data availability, so it is
    better to find only the useful days at the beginning.

    :return: sequence of (start_date, end_date) tuples
    """
    # TODO: Read 'simple' job configuration from file
    # TODO: need to get rid of the hard-coded query
    query = dict(y=(-41 * (40000 - 1600), -41 * 40000),
                 x=(15 * 40000, 15 * (40000 + 1600)),
                 crs='EPSG:3577',
                 time=(start_date, end_date))

    valid_dates = set()
    for product in product_names:
        counts = index.datasets.count_product_through_time(period_duration, product=product,
                                                           **Query(**query).search_terms)
        for time_range, count in counts:
            if count > 0:
                time_range = Range(time_range.begin.astimezone(timezone.utc),
                                   time_range.end.astimezone(timezone.utc))
                valid_dates.add(time_range)

    for time_range in sorted(valid_dates):
        yield time_range.begin, time_range.end
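# A minimal usage sketch (not from the original source), assuming a Datacube index is
# available and that a product named 'ls8_nbar_albers' exists (a hypothetical placeholder).
# It lists the daily periods that actually contain data before any per-day outputs are made.
def _example_find_periods_with_data(index):
    for start, end in _find_periods_with_data(index, ['ls8_nbar_albers'],
                                              period_duration='1 day',
                                              start_date='1990-01-01',
                                              end_date='1991-01-01'):
        print(start, end)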
def test_convert_descriptor_query_to_search_query():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': (-35.5, -36.5),
            },
            'longitude': {
                'range': (148.3, 149.9)
            },
            'time': {
                'range': (datetime.datetime(2001, 5, 7), datetime.datetime(2002, 3, 9))
            }
        }
    }
    descriptor_query_dimensions = descriptor_query['dimensions']
    query = Query.from_descriptor_request(descriptor_query)
    search_query = query.search_terms
    assert min(descriptor_query_dimensions['latitude']['range']) == search_query['lat'].begin
    assert max(descriptor_query_dimensions['latitude']['range']) == search_query['lat'].end
    assert min(descriptor_query_dimensions['longitude']['range']) == search_query['lon'].begin
    assert max(descriptor_query_dimensions['longitude']['range']) == search_query['lon'].end
    assert datetime.datetime(2001, 5, 7, tzinfo=tz.tzutc()) == search_query['time'].begin
    assert datetime.datetime(2002, 3, 9, tzinfo=tz.tzutc()) == search_query['time'].end
def chop_query_by_time(q: Query, freq: str = "m") -> Iterator[Query]:
    """Given a query over a longer period of time, chop it up along the time dimension
    into smaller queries, each covering a shorter time period (year, month, week or day).
    """
    qq = dict(**q.search_terms)
    time = qq.pop("time", None)
    if time is None:
        raise ValueError("Need time range in the query")

    for t0, t1 in time_range(time.begin, time.end, freq=freq):
        yield Query(**qq, time=Range(t0, t1))
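# A hedged usage sketch (not part of the original source): chop a year-long query into
# month-sized queries. The product name 'ls8_nbar_albers' is a hypothetical placeholder.
def _example_chop_query_by_time():
    q = Query(product='ls8_nbar_albers', time=('2018-01-01', '2018-12-31'))
    for monthly in chop_query_by_time(q, freq='m'):
        print(monthly.search_terms['time'])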
def chopped_dss(dc: Datacube, freq: str = "m", **query):
    """Emulate a streaming interface for datacube queries.

    The basic idea is to perform a lot of smaller queries (shorter time periods).
    """
    qq = Query(**query)

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets_lazy(**q.search_terms)
        yield from dss
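# A minimal usage sketch (assumptions: a configured Datacube instance and a product named
# 'ls8_nbar_albers'; both are placeholders, not from the source). Datasets are produced
# lazily, one month-sized query at a time.
def _example_chopped_dss(dc: Datacube):
    for ds in chopped_dss(dc, freq='m',
                          product='ls8_nbar_albers',
                          time=('2018-01-01', '2018-12-31')):
        print(ds.id, ds.center_time)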
def ordered_dss(dc: Datacube, freq: str = 'm', **query):
    """Emulate an "order by time" streaming interface for datacube queries.

    The basic idea is to perform a lot of smaller queries (shorter time periods),
    sort the results, then yield them to the calling code.
    """
    qq = Query(**query)

    for q in chop_query_by_time(qq, freq=freq):
        dss = dc.find_datasets(**q.search_terms)
        dss.sort(key=lambda ds: ds.center_time)
        yield from dss
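# A hedged usage sketch (not from the original source): stream datasets sorted by
# center_time within each week-sized chunk. Product name and time range are placeholders.
def _example_ordered_dss(dc: Datacube):
    for ds in ordered_dss(dc, freq='w',
                          product='ls8_nbar_albers',
                          time=('2018-01-01', '2018-03-01')):
        print(ds.center_time, ds.id)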
def find_datasets(self, limit=None, **search_terms):
    '''
    Finds datasets matching the search terms in the local index or in the GEE catalog.

    Args:
        limit (int): Optional; limit the maximum number of datasets returned.
        search_terms (dict): Search parameters to be passed to datacube.api.query.Query.

    Returns:
        A generator of datacube.model.Dataset objects.
    '''
    query = Query(**search_terms)
    if query.product and not isinstance(query.product, datacube.model.DatasetType):
        query.product = self.index.products.get_by_name(query.product)
        query.asset = query.product.metadata_doc.get('properties').get('gee:asset')
    elif search_terms.get('asset'):
        query.product = self.generate_product(**search_terms)
        query.asset = search_terms.pop('asset')

    product_measurements = query.product.measurements.keys()

    if hasattr(query, 'asset'):
        images = self.get_images(self.build_parameters(query))
        for document in generate_documents(query.asset, images, query.product):
            if limit != 0:
                limit = limit - 1 if limit is not None else limit
                if set(product_measurements) == set(document['measurements'].keys()):
                    yield datacube.model.Dataset(query.product, document,
                                                 uris=f'EEDAI://{query.asset}')
            else:
                break
    else:
        for dataset in super().find_datasets(limit=limit, search_terms=search_terms):
            yield dataset
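# A hedged usage sketch (not from the original source). `gee_dc` is assumed to be an
# instance of the Datacube subclass that defines find_datasets above, and 'gee_product'
# is a hypothetical product name backed by a GEE asset.
def _example_find_gee_datasets(gee_dc):
    for ds in gee_dc.find_datasets(product='gee_product', limit=5):
        print(ds.id)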
def list_gqa_filtered_cells(index, gw, pix_th=None, cell_index=None, **indexers):
    geobox = gw.grid_spec.tile_geobox(cell_index)
    query = Query(index=index, geopolygon=None, **indexers)
    observations = index.datasets.search_eager(**query.search_terms)

    # filter now with pixel threshold value
    datasets = {}
    if pix_th is None:
        pix_th = 1
    print("pix_th value", str(pix_th))
    for dataset in observations:
        if check_intersect(geobox.extent, dataset.extent.to_crs(gw.grid_spec.crs)):
            if get_gqa(index, dataset.id) < pix_th:
                # datasets.append(dataset)
                datasets.setdefault(cell_index, {'datasets': [], 'geobox': geobox})['datasets'].append(dataset)

    return gw.cell_sources(datasets, query_group_by(**indexers))
def test_convert_descriptor_query_to_search_query_with_groupby():
    descriptor_query = {
        'dimensions': {
            'time': {
                'range': (datetime.datetime(2001, 5, 7), datetime.datetime(2002, 3, 9)),
                'group_by': 'solar_day'
            }
        }
    }
    query = Query.from_descriptor_request(descriptor_query)
    assert query.group_by
    assert callable(query.group_by.group_by_func)
    assert query.group_by.dimension == 'time'
    assert query.group_by.units == 'seconds since 1970-01-01 00:00:00'
def submit(index: Index,
           app_config: str,
           project: str,
           queue: str,
           no_qsub: bool,
           time_range: Tuple[datetime, datetime],
           tag: str):
    _LOG.info('Tag: %s', tag)

    app_config_path = Path(app_config).resolve()
    app_config = paths.read_document(app_config_path)

    task_desc, task_path = init_task_app(
        job_type="fc",
        source_products=[app_config['source_product']],
        output_products=[app_config['output_product']],
        # TODO: Use @datacube.ui.click.parsed_search_expressions to allow params other than time from the cli?
        datacube_query_args=Query(index=index, time=time_range).search_terms,
        app_config_path=app_config_path,
        pbs_project=project,
        pbs_queue=queue
    )
    _LOG.info("Created task description: %s", task_path)

    if no_qsub:
        _LOG.info('Skipping submission due to --no-qsub')
        return 0

    submit_subjob(
        name='generate',
        task_desc=task_desc,
        command=[
            'generate', '-v', '-v',
            '--task-desc', str(task_path),
            '--tag', tag
        ],
        qsub_params=dict(
            mem='20G',
            wd=True,
            ncpus=1,
            walltime='1h',
            name='fc-generate-{}'.format(tag)
        )
    )
def ordered_dss(dc: Datacube, freq: str = "m", key=None, **query): """Emulate "order by time" streaming interface for datacube queries. Basic idea is to perform a lot of smaller queries (shorter time periods), sort results then yield them to the calling code. :param dc: Datacube instance :param freq: 'm' month sized chunks, 'w' week sized chunks, 'd' day :param key: Optional sorting function Dataset -> Comparable, for example ``lambda ds: (ds.center_time, ds.metadata.region_code)`` """ qq = Query(**query) if key is None: key = lambda ds: ds.center_time for q in chop_query_by_time(qq, freq=freq): dss = dc.find_datasets(**q.search_terms) dss.sort(key=key) yield from dss
def test_convert_descriptor_query_to_search_query_with_crs_conversion():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': (-3971790.0737348166, -4101004.3359463234),
                'crs': 'EPSG:3577',
            },
            'longitude': {
                'range': (1458629.8414059384, 1616407.8831088375),
                'crs': 'EPSG:3577',
            }
        }
    }
    expected_result = {
        'lat': Range(-36.6715565808, -35.3276413143),
        'lon': Range(148.145408153, 150.070966341),
    }
    query = Query.from_descriptor_request(descriptor_query)
    search_query = query.search_terms
    assert all(map(isclose, search_query['lat'], expected_result['lat']))
    assert all(map(isclose, search_query['lon'], expected_result['lon']))
def query(self, dc: Datacube, **search_terms: Dict[str, Any]) -> VirtualDatasetBag:
    product = dc.index.products.get_by_name(self._product)
    if product is None:
        raise VirtualProductException("could not find product {}".format(self._product))

    merged_terms = merge_search_terms(reject_keys(self, self._NON_QUERY_KEYS),
                                      reject_keys(search_terms, self._NON_QUERY_KEYS))

    query = Query(dc.index, **reject_keys(merged_terms, self._ADDITIONAL_SEARCH_KEYS))
    self._assert(query.product == self._product,
                 "query for {} returned another product {}".format(self._product, query.product))

    return VirtualDatasetBag(dc.find_datasets(**merged_terms), query.geopolygon,
                             {product.name: product})
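# A hedged usage sketch (not from the original source): query() above is a method of a
# virtual product, so it would normally be reached through a constructed product. The
# recipe and product name are placeholders, and construct_from_yaml is assumed to be
# importable from datacube.virtual.
def _example_virtual_product_query(dc: Datacube):
    from datacube.virtual import construct_from_yaml

    product = construct_from_yaml("""
        product: ls8_nbar_albers
        measurements: [red, nir]
    """)
    bag = product.query(dc, time=('2018-01-01', '2018-02-01'))
    print(bag.geopolygon, list(bag.product_definitions))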
def run_query(query, config=None, max_datasets=None):
    """
    Load and return the data.

    :param dict query: Query.
    :param str config: Datacube config filepath or None.
    :return: Data.
    :rtype: xarray.Dataset
    :raise NoDataError: No data found for query
    """
    # noinspection PyTypeChecker
    dc = datacube.Datacube(config=config, app='QGIS Plugin')

    test_query = {k: query[k] for k in ('product', 'time', 'x', 'y', 'crs') if k in query}
    test_query = Query(**test_query)
    datasets = dc.index.datasets.search_eager(**test_query.search_terms)
    if not datasets:
        raise NoDataError('No datasets found for query:\n{}'.format(str(query)))
    elif max_datasets and len(datasets) > max_datasets:
        msg = ('Number of datasets found ({}) exceeds maximum allowed ({}).\n'
               'Reduce your temporal or spatial extent, or increase the maximum in Settings.')
        raise TooManyDatasetsError(msg.format(len(datasets), max_datasets))

    data = dc.load(**query)
    if not data.variables:
        raise NoDataError('No data found for query:\n{}'.format(str(query)))
    return data
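# A hedged usage sketch (not from the original source): product name, extents and the
# dataset limit are placeholders. run_query builds a Query from a plain dict to count
# datasets first, then passes the same dict straight to dc.load.
def _example_run_query():
    query = {
        'product': 'ls8_nbar_albers',
        'time': ('2018-01-01', '2018-01-31'),
        'x': (148.0, 148.2),
        'y': (-35.2, -35.0),
    }
    data = run_query(query, config=None, max_datasets=500)
    print(data)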
def test_convert_descriptor_query_to_search_query_with_single_value():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': -3971790.0737348166,
                'crs': 'EPSG:3577',
            },
            'longitude': {
                'range': 1458629.8414059384,
                'crs': 'EPSG:3577',
            }
        }
    }
    expected_lat = -35.5160921229
    expected_lon = 148.145408153
    query = Query.from_descriptor_request(descriptor_query)
    search_query = query.search_terms
    assert min(*search_query['lat']) <= expected_lat <= max(*search_query['lat'])
    assert search_query['lat'].begin != search_query['lat'].end
    assert min(*search_query['lon']) <= expected_lon <= max(*search_query['lon'])
    assert search_query['lon'].begin != search_query['lon'].end
def test_convert_descriptor_query_to_search_query_with_slices():
    descriptor_query = {
        'dimensions': {
            'latitude': {
                'range': (-35.5, -36.5),
                'array_range': (100, 200)
            },
            'longitude': {
                'range': (148.3, 149.9),
                'array_range': (100, 200)
            },
            'time': {
                'range': (datetime.datetime(2001, 5, 7), datetime.datetime(2002, 3, 9)),
                'array_range': (5, 10)
            }
        }
    }
    query = Query.from_descriptor_request(descriptor_query)
    assert query.slices
    assert query.slices['latitude'] == slice(100, 200)
    assert query.slices['longitude'] == slice(100, 200)
    assert query.slices['time'] == slice(5, 10)
def test_descriptor_handles_bad_input():
    with pytest.raises(ValueError):
        descriptor_query = "Not a descriptor"
        Query.from_descriptor_request(descriptor_query)

    with pytest.raises(ValueError):
        descriptor_query = ["Not a descriptor"]
        Query.from_descriptor_request(descriptor_query)

    with pytest.raises(ValueError):
        descriptor_query = {
            'dimensions': {
                'latitude': {
                    'range': -35,
                    'crs': 'EPSG:4326',
                },
                'longitude': {
                    'range': 1458629.8414059384,
                    'crs': 'EPSG:3577',
                }
            }
        }
        Query.from_descriptor_request(descriptor_query)
def test_time_handling(time_param, expected):
    query = Query(time=time_param)
    assert 'time' in query.search_terms
    assert query.search_terms['time'] == expected
def _dataset_count(index, **query):
    """Return number of datasets matching a query."""
    return index.datasets.count(**Query(**query).search_terms)
def _query_polygon(**kw):
    return Query(**kw).geopolygon
def submit(index: Index,
           app_config: str,
           project: str,
           queue: str,
           no_qsub: bool,
           time_range: Tuple[datetime, datetime],
           tag: str,
           email_options: str,
           email_id: str,
           dry_run: bool):
    """
    Kick off a two stage PBS job.

    Stage 1 (Generate task file):
        The task-app machinery loads a config file, from a path specified on the command line, into a dict.
        If dry run is enabled, a dummy DatasetType is created for task generation without indexing
        the product in the database. If dry run is disabled, tasks are generated into a file and a
        PBS job is queued to process them.

    Stage 2 (Run):
        During a normal run the following are performed:
            1) Tasks are yielded for dispatch to workers.
            2) Load data
            3) Run the FC algorithm
            4) Attach metadata
            5) Write output files and
            6) Finally index the newly created FC output netCDF files

        If dry run is enabled, the application only prepares a list of output files to be created
        and does not record anything in the database.
    """
    _LOG.info('Tag: %s', tag)

    app_config_path = Path(app_config).resolve()
    app_config = paths.read_document(app_config_path)

    if not time_range or not all(time_range):
        query_args = Query(index=index).search_terms
    else:
        query_args = Query(index=index, time=time_range).search_terms

    task_desc, task_path = init_task_app(
        job_type="fc",
        source_products=[app_config['source_product']],
        output_products=[app_config['output_product']],
        # TODO: Use @datacube.ui.click.parsed_search_expressions to allow params other than time from the cli?
        datacube_query_args=query_args,
        app_config_path=app_config_path,
        pbs_project=project,
        pbs_queue=queue)
    _LOG.info("Created task description: %s", task_path)

    if no_qsub:
        _LOG.info('Skipping submission due to --no-qsub')
        return 0

    # If dry run is not enabled just pass the verbose option
    dry_run_option = '--dry-run' if dry_run else '-v'

    extra_qsub_args = '-M {0} -m {1}'.format(email_id, email_options)

    # Append email options and email id to the PbsParameters dict key, extra_qsub_args
    task_desc.runtime_state.pbs_parameters.extra_qsub_args.extend(extra_qsub_args.split(' '))

    submit_subjob(name='generate',
                  task_desc=task_desc,
                  command=[
                      'generate', '-vv',
                      '--task-desc', str(task_path),
                      '--tag', tag,
                      '--log-queries',
                      '--email-id', email_id,
                      '--email-options', email_options,
                      dry_run_option,
                  ],
                  qsub_params=dict(name='fc-generate-{}'.format(tag),
                                   mem='medium',
                                   wd=True,
                                   nodes=1,
                                   walltime='1h'))
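# A small illustration (not from the original source) of how the optional time_range above
# becomes index search terms: with no range the query carries no constraints, otherwise it
# carries a single 'time' Range. The dates are placeholders.
def _example_submit_query_args(index: Index):
    print(Query(index=index).search_terms)                    # no search constraints
    print(Query(index=index, time=(datetime(2019, 1, 1),
                                   datetime(2019, 2, 1))).search_terms['time'])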
def test_query_multiple_products(mock_index):
    q = Query(index=mock_index, product=['ls5_nbar_albers', 'ls7_nbar_albers'])
    assert q.product == ['ls5_nbar_albers', 'ls7_nbar_albers']
def dataset_count(index, **query):
    return index.datasets.count(**Query(**query).search_terms)
def test_query_issue_1146():
    q = Query(k='AB')
    assert q.search['k'] == 'AB'
def test_dateline_query_building():
    lon = Query(x=(618300, 849000),
                y=(-1876800, -1642500),
                crs='EPSG:32660').search_terms['lon']

    assert lon.begin < 180 < lon.end