def test_index_dataset_with_sources(index, default_metadata_type):
    type_ = index.products.add_document(_pseudo_telemetry_dataset_type)

    parent_doc = _telemetry_dataset.copy()
    parent = Dataset(type_, parent_doc, None, sources={})

    child_doc = _telemetry_dataset.copy()
    child_doc['lineage'] = {'source_datasets': {'source': _telemetry_dataset}}
    child_doc['id'] = '051a003f-5bba-43c7-b5f1-7f1da3ae9cfb'
    child = Dataset(type_, child_doc, local_uri=None, sources={'source': parent})

    with pytest.raises(MissingRecordError):
        index.datasets.add(child, sources_policy='skip')

    index.datasets.add(child, sources_policy='ensure')
    assert index.datasets.get(parent.id)
    assert index.datasets.get(child.id)

    index.datasets.add(child, sources_policy='skip')
    index.datasets.add(child, sources_policy='ensure')
    index.datasets.add(child, sources_policy='verify')
    # Deprecated property, but it should still work until we remove it completely.
    index.datasets.add(child, sources_policy='skip')

    parent_doc['platform'] = {'code': 'LANDSAT_9'}
    index.datasets.add(child, sources_policy='ensure')
    index.datasets.add(child, sources_policy='skip')

    with pytest.raises(DocumentMismatchError):
        index.datasets.add(child, sources_policy='verify')
def test_index_dataset_with_location(index, default_metadata_type):
    """
    :type index: datacube.index._api.Index
    :type default_metadata_type: datacube.model.MetadataType
    """
    first_file = '/tmp/first/something.yaml'
    second_file = '/tmp/second/something.yaml'

    type_ = index.datasets.types.add_document(_pseudo_telemetry_dataset_type)
    dataset = Dataset(type_, _telemetry_dataset, Path(first_file).absolute().as_uri())
    dataset = index.datasets.add(dataset)

    assert dataset.id == _telemetry_uuid
    # TODO: Dataset types?
    assert dataset.type.id == type_.id
    assert dataset.metadata_type.id == default_metadata_type.id
    assert dataset.local_path.absolute() == Path(first_file).absolute()

    # Ingesting again should have no effect.
    index.datasets.add(dataset)
    locations = index.datasets.get_locations(dataset)
    assert len(locations) == 1

    first_as_uri = Path(first_file).absolute().as_uri()
    second_as_uri = Path(second_file).absolute().as_uri()

    # Ingesting with a new path should add the second one too.
    dataset.local_uri = second_as_uri
    index.datasets.add(dataset)
    locations = index.datasets.get_locations(dataset)
    assert len(locations) == 2
    # Newest to oldest.
    assert locations == [second_as_uri, first_as_uri]
    # And the second one is newer, so it should be returned as the default local path:
    assert dataset.local_path.absolute() == Path(second_file).absolute()
def test_index_dataset_with_sources(index, default_metadata_type):
    type_ = index.products.add_document(_pseudo_telemetry_dataset_type)

    parent_doc = _telemetry_dataset.copy()
    parent = Dataset(type_, parent_doc, None, sources={})

    child_doc = _telemetry_dataset.copy()
    child_doc['lineage'] = {'source_datasets': {'source': _telemetry_dataset}}
    child_doc['id'] = '051a003f-5bba-43c7-b5f1-7f1da3ae9cfb'
    child = Dataset(type_, child_doc, sources={'source': parent})

    with pytest.raises(MissingRecordError):
        index.datasets.add(child, with_lineage=False)

    index.datasets.add(child)
    assert index.datasets.get(parent.id)
    assert index.datasets.get(child.id)
    assert len(index.datasets.bulk_get([parent.id, child.id])) == 2

    index.datasets.add(child, with_lineage=False)
    index.datasets.add(child, with_lineage=True)

    parent_doc['platform'] = {'code': 'LANDSAT_9'}
    index.datasets.add(child, with_lineage=True)
    index.datasets.add(child, with_lineage=False)

    # backwards compatibility code path checks, don't use this in normal code
    for p in ('skip', 'ensure', 'verify'):
        index.datasets.add(child, sources_policy=p)
def mk_sample_dataset(bands,
                      uri='file:///tmp',
                      product_name='sample',
                      format='GeoTiff',
                      timestamp=None,
                      id='3a1df9e0-8484-44fc-8102-79184eab85dd'):  # pylint: disable=redefined-builtin
    image_bands_keys = 'path layer band'.split(' ')
    measurement_keys = 'dtype units nodata aliases name'.split(' ')

    def with_keys(d, keys):
        return dict((k, d[k]) for k in keys if k in d)

    measurements = [with_keys(m, measurement_keys) for m in bands]
    image_bands = dict((m['name'], with_keys(m, image_bands_keys)) for m in bands)

    ds_type = mk_sample_product(product_name, measurements=measurements)

    if timestamp is None:
        timestamp = '2018-06-29'

    return Dataset(ds_type, {
        'id': id,
        'format': {'name': format},
        'image': {'bands': image_bands},
        'time': timestamp,
    }, uris=[uri])
def mk_sample_dataset(bands,
                      uri='file:///tmp',
                      product_name='sample',
                      format='GeoTiff',
                      id='12345678123456781234567812345678'):
    image_bands_keys = 'path layer band'.split(' ')
    measurement_keys = 'dtype units nodata aliases name'.split(' ')

    def with_keys(d, keys):
        return dict((k, d[k]) for k in keys if k in d)

    measurements = [with_keys(m, measurement_keys) for m in bands]
    image_bands = dict((m['name'], with_keys(m, image_bands_keys)) for m in bands)

    ds_type = mk_sample_product(product_name, measurements=measurements)

    return Dataset(ds_type, {
        'id': id,
        'format': {'name': format},
        'image': {'bands': image_bands},
    }, uris=[uri])
def mk_ds(zone, datum="GDA94"):
    return Dataset(product, {
        "grid_spatial": {
            "projection": {
                "zone": zone,
                "datum": datum,
                "ellipsoid": "GRS80",
                "orientation": "NORTH_UP",
                "geo_ref_points": {
                    "ll": {"x": 537437.5, "y": 5900512.5},
                    "lr": {"x": 781687.5, "y": 5900512.5},
                    "ul": {"x": 537437.5, "y": 6117112.5},
                    "ur": {"x": 781687.5, "y": 6117112.5},
                },
                "map_projection": "UTM",
                "resampling_option": "CUBIC_CONVOLUTION",
            }
        }
    })
def make_dataset(product, sources, extent, center_time, valid_data=None, uri=None, app_info=None):
    """
    Create :class:`datacube.model.Dataset` for the data

    :param DatasetType product: Product the dataset is part of
    :param list[:class:`Dataset`] sources: datasets used to produce the dataset
    :param Geometry extent: extent of the dataset
    :param Geometry valid_data: extent of the valid data
    :param center_time: time of the central point of the dataset
    :param str uri: The uri of the dataset
    :param dict app_info: Additional metadata to be stored about the generation of the product
    :rtype: :class:`Dataset`
    """
    document = {}
    merge(document, product.metadata_doc)
    merge(document, new_dataset_info())
    merge(document, machine_info())
    merge(document, band_info(product.measurements.keys()))
    merge(document, source_info(sources))
    merge(document, geobox_info(extent, valid_data))
    merge(document, time_info(center_time))
    merge(document, app_info or {})
    return Dataset(product,
                   document,
                   local_uri=uri,
                   sources={str(idx): dataset for idx, dataset in enumerate(sources)})
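# Hedged usage sketch for make_dataset above (not from the original source).
# It assumes 'product' is an existing DatasetType, 'srcs' is a list of source
# Datasets, and 'geobox' is a datacube GeoBox whose .extent gives the output
# footprint; all names and values here are illustrative only.
def _example_make_dataset_usage(product, srcs, geobox):
    from datetime import datetime
    return make_dataset(product=product,
                        sources=srcs,
                        extent=geobox.extent,
                        center_time=datetime(2018, 6, 29),
                        uri='file:///tmp/output.nc')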
def resolve_no_lineage(ds, uri):
    doc = ds.doc_without_lineage_sources
    try:
        product = match_product(doc)
    except BadMatch as e:
        return None, e
    return Dataset(product, doc, uris=[uri], sources={}), None
def match_dataset(dataset_doc, uri, rules):
    """
    :rtype: datacube.model.Dataset
    """
    rule = match_doc(rules, dataset_doc)
    sources = {cls: match_dataset(source_doc, None, rules)
               for cls, source_doc in rule['type'].dataset_reader(dataset_doc).sources.items()}
    return Dataset(rule['type'], dataset_doc, uri, sources=sources)
def create_dataset(dataset_doc, uri, rules):
    """
    :rtype: datacube.model.Dataset
    """
    dataset_type = find_matching_product(rules, dataset_doc)
    sources = {cls: create_dataset(source_doc, None, rules)
               for cls, source_doc in dataset_type.dataset_reader(dataset_doc).sources.items()}
    return Dataset(dataset_type, dataset_doc, uris=[uri] if uri else None, sources=sources)
def doc2ds(doc, products):
    if doc is None:
        return None

    p = products.get(doc['product'], None)
    if p is None:
        raise ValueError('No product named: %s' % doc['product'])
    return Dataset(p, doc['metadata'], uris=doc['uris'])
def _build_dataset(doc):
    sources = {name: _build_dataset(src)
               for name, src in doc['lineage']['source_datasets'].items()}
    return Dataset(_EXAMPLE_DATASET_TYPE, doc, uris=['file://test.zzz'], sources=sources)
def doc2ds(doc: Optional[Document],
           products: Dict[str, DatasetType]) -> Optional[Dataset]:
    if doc is None:
        return None

    p = products.get(doc['product'], None)
    if p is None:
        raise ValueError('No product named: %s' % doc['product'])
    return Dataset(p, doc['metadata'], uris=doc['uris'])
def doc2ds(
    doc: Optional[Document], products: Dict[str, DatasetType]
) -> Optional[Dataset]:
    if doc is None:
        return None

    p = products.get(doc["product"], None)
    if p is None:
        raise ValueError("No product named: %s" % doc["product"])
    return Dataset(p, doc["metadata"], uris=doc["uris"])
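# Hedged usage sketch for doc2ds above (not from the original source). The
# document keys ('product', 'metadata', 'uris') mirror what doc2ds reads;
# 'sample_product' and the metadata contents are illustrative assumptions.
def _example_doc2ds_usage(sample_product):
    doc = {
        'product': 'sample',
        'metadata': {'id': '3a1df9e0-8484-44fc-8102-79184eab85dd'},
        'uris': ['file:///tmp/sample.yaml'],
    }
    ds = doc2ds(doc, {'sample': sample_product})
    assert ds is not None
    assert ds.uris == ['file:///tmp/sample.yaml']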
def _make(self, dataset_res, full_info=False):
    """
    :rtype: datacube.model.Dataset

    :param bool full_info: Include all available fields
    """
    return Dataset(self.types.get(dataset_res.dataset_type_ref),
                   dataset_res.metadata,
                   dataset_res.local_uri,
                   indexed_by=dataset_res.added_by if full_info else None,
                   indexed_time=dataset_res.added if full_info else None)
def test_multiband_support_in_datasetsource(example_gdal_path):
    defn = {
        "id": '12345678123456781234567812345678',
        "format": {"name": "GeoTiff"},
        "image": {
            "bands": {
                'green': {
                    'type': 'reflective',
                    'cell_size': 25.0,
                    'path': example_gdal_path,
                    'label': 'Coastal Aerosol',
                    'number': '1',
                },
            }
        }
    }

    # Without new band attribute, default to band number 1
    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])
    ds = RasterDatasetDataSource(BandInfo(d, 'green'))

    bandnum = ds.get_bandnumber(None)
    assert bandnum == 1

    with ds.open() as foo:
        data = foo.read()
        assert isinstance(data, np.ndarray)

    #############
    # With new 'image.bands.[band].band' attribute
    band_num = 3
    defn['image']['bands']['green']['band'] = band_num

    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])
    ds = RasterDatasetDataSource(BandInfo(d, 'green'))

    assert ds.get_bandnumber(None) == band_num
def test_add_eo3(sample_doc, sample_doc_180, eo3_product):
    doc = add_eo3_parts(sample_doc)
    assert doc is not sample_doc

    ds = Dataset(eo3_product, doc)
    assert ds.crs == 'EPSG:3857'
    assert ds.extent is not None
    assert ds.extent.crs == 'EPSG:3857'
    assert ds.metadata.lat.begin < ds.metadata.lat.end
    assert ds.metadata.lon.begin < ds.metadata.lon.end

    doc = dict(**sample_doc, geometry=ds.extent.buffer(-1).json)

    ds2 = Dataset(eo3_product, add_eo3_parts(doc))
    assert ds2.crs == 'EPSG:3857'
    assert ds2.extent is not None
    assert ds2.extent.crs == 'EPSG:3857'
    assert ds2.metadata.lat.begin < ds2.metadata.lat.end
    assert ds2.metadata.lon.begin < ds2.metadata.lon.end
    assert ds.extent.contains(ds2.extent)

    doc = add_eo3_parts(sample_doc_180)
    assert doc is not sample_doc_180

    ds = Dataset(eo3_product, doc)
    assert ds.crs == 'EPSG:32660'
    assert ds.extent is not None
    assert ds.extent.crs == 'EPSG:32660'
    assert ds.metadata.lat.begin < ds.metadata.lat.end
    assert ds.metadata.lon.begin < 180 < ds.metadata.lon.end

    doc = dict(**sample_doc)
    doc.pop('crs')
    with pytest.raises(ValueError):
        add_eo3_parts(doc)

    doc = dict(**sample_doc)
    doc.pop('grids')
    with pytest.raises(ValueError):
        add_eo3_parts(doc)

    with pytest.raises(ValueError):
        eo3_lonlat_bbox({})
def mk_dataset(ds, uri): uuid = ds.id if uuid is None: return None, None, "Metadata document it missing id field" existing = index.datasets.get(uuid) if existing is None: return None, None, "No such dataset in the database: {}".format(uuid) return Dataset(existing.type, ds.doc_without_lineage_sources, uris=[uri]), existing, None
def test_multiband_support_in_datasetsource():
    defn = {
        "id": '12345678123456781234567812345678',
        "format": {"name": "hdf"},
        'measurements': {'green': {'nodata': -999}},
        "image": {
            "bands": {
                'green': {
                    'type': 'reflective',
                    'cell_size': 25.0,
                    'path': 'product/LS8_OLITIRS_NBAR_P54_GALPGS01-002_112_079_20140126_B1.tif',
                    'label': 'Coastal Aerosol',
                    'number': '1',
                },
            }
        }
    }

    # Without new band attribute, default to band number 1
    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])
    ds = RasterDatasetSource(d, measurement_id='green')

    bandnum = ds.get_bandnumber(None)
    assert bandnum == 1

    #############
    # With new 'image.bands.[band].band' attribute
    band_num = 3
    defn['image']['bands']['green']['band'] = band_num

    d = Dataset(_EXAMPLE_DATASET_TYPE, defn, uris=['file:///tmp'])
    ds = RasterDatasetSource(d, measurement_id='green')

    assert ds.get_bandnumber(None) == band_num
def to_dc_dataset(
    dc: Datacube,
    rendered: Dict[str, Any],
    ds_type: Optional[DatasetType] = None,
    transform: Callable = stac_transform,
    product_name: str = "crop_mask",
) -> Dataset:
    """
    Build a Dataset from a STAC-transformed document.
    """
    if not ds_type:
        ds_type = dict((d.name, d) for d in dc.index.datasets.types.get_all())[product_name]

    return Dataset(ds_type, transform(rendered))
def _make(self, dataset_res, full_info=False):
    """
    :rtype: Dataset

    :param bool full_info: Include all available fields
    """
    uris = dataset_res.uris
    if uris:
        uris = [uri for uri in uris if uri]

    return Dataset(type_=self.types.get(dataset_res.dataset_type_ref),
                   metadata_doc=dataset_res.metadata,
                   uris=uris,
                   indexed_by=dataset_res.added_by if full_info else None,
                   indexed_time=dataset_res.added if full_info else None,
                   archived_time=dataset_res.archived)
def _make(self, dataset_res, full_info=False):
    """
    :rtype: datacube.model.Dataset

    :param bool full_info: Include all available fields
    """
    uri = dataset_res.uri
    return Dataset(
        self.types.get(dataset_res.dataset_type_ref),
        dataset_res.metadata,
        # We guarantee that this property on the class is only a local uri.
        uri if uri and uri.startswith('file:') else None,
        indexed_by=dataset_res.added_by if full_info else None,
        indexed_time=dataset_res.added if full_info else None,
        archived_time=dataset_res.archived)
def mk_dataset(ds, uri): uuid = ds.id if uuid is None: return None, None, "Metadata document it missing id field" existing = index.datasets.get(uuid) if existing is None: return None, None, "No such dataset in the database: {}".format(uuid) ds = SimpleDocNav(prep_eo3(ds.doc, auto_skip=True)) # TODO: what about sources=? return Dataset(existing.type, ds.doc_without_lineage_sources, uris=[uri]), existing, None
def resolve_ds(ds, sources, cache=None):
    cached = cache.get(ds.id)
    if cached is not None:
        return cached

    uris = [uri] if ds.id == main_uuid else []
    doc = ds.doc

    db_ds = db_dss.get(ds.id)
    if db_ds:
        product = db_ds.type
    else:
        product = match_product(doc)

    return with_cache(Dataset(product, doc, uris=uris, sources=sources), ds.id, cache)
def mk_sample_dataset(bands,
                      uri='file:///tmp',
                      product_name='sample',
                      format='GeoTiff',
                      timestamp=None,
                      id='3a1df9e0-8484-44fc-8102-79184eab85dd',
                      geobox=None,
                      product_opts=None):  # pylint: disable=redefined-builtin
    image_bands_keys = 'path layer band'.split(' ')
    measurement_keys = 'dtype units nodata aliases name'.split(' ')

    def with_keys(d, keys):
        return dict((k, d[k]) for k in keys if k in d)

    measurements = [with_keys(m, measurement_keys) for m in bands]
    image_bands = dict((m['name'], with_keys(m, image_bands_keys)) for m in bands)

    if product_opts is None:
        product_opts = {}

    ds_type = mk_sample_product(product_name,
                                measurements=measurements,
                                **product_opts)

    if timestamp is None:
        timestamp = '2018-06-29'

    if uri is None:
        uris = []
    elif isinstance(uri, list):
        uris = uri.copy()
    else:
        uris = [uri]

    return Dataset(ds_type, {
        'id': id,
        'format': {'name': format},
        'image': {'bands': image_bands},
        'time': timestamp,
        **geobox_to_gridspatial(geobox),
    }, uris=uris)
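# Hedged usage sketch for mk_sample_dataset above (not from the original
# source). The band dictionary keys match the measurement/image-band keys
# extracted by the helper; the 'red' band values are illustrative only.
def _example_mk_sample_dataset_usage():
    bands = [{'name': 'red', 'dtype': 'int16', 'units': '1', 'nodata': -999,
              'path': 'red.tif', 'layer': 'red', 'band': 1}]
    ds = mk_sample_dataset(bands, uri='file:///tmp/sample', timestamp='2018-06-29')
    assert ds.uris == ['file:///tmp/sample']
    assert 'red' in ds.metadata_doc['image']['bands']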
def generate_dataset(data, sources, prod_info, uri):
    nudata = data.copy()
    datasets = []

    for idx, (time, sources) in enumerate(zip(sources.time.values, sources.values)):
        document = {
            'id': str(uuid.uuid4()),
            'image': {
                'bands': {name: {'path': '', 'layer': name} for name in nudata.data_vars}
            },
            'lineage': {
                'source_datasets': {str(idx): dataset.metadata_doc
                                    for idx, dataset in enumerate(sources)}
            }
        }
        # TODO: extent is a bad thing to store - it duplicates coordinates
        set_geobox_info(document, data.crs, data.extent)
        document['extent']['from_dt'] = str(time)
        document['extent']['to_dt'] = str(time)
        document['extent']['center_dt'] = str(time)
        document.update(prod_info.metadata)
        dataset = Dataset(prod_info,
                          document,
                          local_uri=uri,
                          sources={str(idx): dataset for idx, dataset in enumerate(sources)})
        datasets.append(dataset)

    nudata['dataset'] = (['time'],
                         numpy.array([yaml.dump(dataset.metadata_doc, Dumper=SafeDumper, encoding='utf-8')
                                      for dataset in datasets], dtype='S'))

    return nudata, datasets
def on_success(dataset: DatasetDoc, dataset_path: Path):
    """
    Index the dataset
    """
    product_name = dataset.product.name
    product = products.get(product_name)
    if not product:
        product = index.products.get_by_name(product_name)
        if not product:
            raise ValueError(f"Product {product_name} not found in ODC index")
        products[product_name] = product

    index.datasets.add(
        Dataset(product, serialise.to_doc(dataset), uris=dataset.locations))
    _LOG.debug("Indexed dataset", dataset_id=dataset.id, dataset_path=dataset_path)
def all_datasets(dc: Datacube,
                 product: str,
                 read_chunk: int = 1000,
                 limit: Optional[int] = None):
    """
    Like dc.find_datasets_lazy(product=product) but actually lazy, using db cursors
    """
    import psycopg2
    from random import randint

    assert isinstance(limit, (int, type(None)))
    db = psycopg2.connect(str(dc.index.url))

    _limit = "" if limit is None else f"LIMIT {limit}"
    _product = dc.index.products.get_by_name(product)

    query = f"""select jsonb_build_object(
       'product', %(product)s,
       'uris', array((select _loc_.uri_scheme ||':'||_loc_.uri_body
                      from agdc.dataset_location as _loc_
                      where _loc_.dataset_ref = agdc.dataset.id and _loc_.archived is null
                      order by _loc_.added desc, _loc_.id desc)),
       'metadata', metadata) as dataset
    from agdc.dataset
    where archived is null
      and dataset_type_ref = (select id from agdc.dataset_type where name = %(product)s)
    {_limit};
    """

    cursor_name = "c{:04X}".format(randint(0, 0xFFFF))
    with db.cursor(name=cursor_name) as cursor:
        cursor.execute(query, dict(product=product))

        while True:
            chunk = cursor.fetchmany(read_chunk)
            if not chunk:
                break
            for (ds,) in chunk:
                yield Dataset(_product, ds["metadata"], ds["uris"])
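# Hedged usage sketch for all_datasets above (not from the original source).
# It assumes a configured datacube database is reachable; the product name
# 'ls8_nbar_albers' and the chunk/limit values are illustrative only.
def _example_all_datasets_usage():
    dc = Datacube()
    for ds in all_datasets(dc, 'ls8_nbar_albers', read_chunk=500, limit=10):
        print(ds.id, ds.uris)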
def update(self, dataset: Dataset, updates_allowed=None):
    """
    Update dataset metadata and location

    :param Dataset dataset: Dataset to update
    :param updates_allowed: Allowed updates
    :rtype: Dataset
    """
    existing = self.get(dataset.id)
    can_update, safe_changes, unsafe_changes = self.can_update(dataset, updates_allowed)

    if not safe_changes and not unsafe_changes:
        self._ensure_new_locations(dataset, existing)
        _LOG.info("No changes detected for dataset %s", dataset.id)
        return dataset

    for offset, old_val, new_val in safe_changes:
        _LOG.info("Safe change in %s from %r to %r", _readable_offset(offset), old_val, new_val)

    for offset, old_val, new_val in unsafe_changes:
        _LOG.warning("Unsafe change in %s from %r to %r", _readable_offset(offset), old_val, new_val)

    if not can_update:
        raise ValueError(f"Unsafe changes in {dataset.id}: " +
                         (", ".join(_readable_offset(offset) for offset, _, _ in unsafe_changes)))

    _LOG.info("Updating dataset %s", dataset.id)

    product = self.types.get_by_name(dataset.type.name)
    with self._db.begin() as transaction:
        if not transaction.update_dataset(dataset.metadata_doc_without_lineage(), dataset.id, product.id):
            raise ValueError("Failed to update dataset %s..." % dataset.id)

    self._ensure_new_locations(dataset, existing)

    return dataset
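# Hedged usage sketch for the update() resource method above (not from the
# original source): explicitly permitting an otherwise-unsafe change via
# updates_allowed. The offset tuple shown is illustrative only; allow_any is
# assumed to come from datacube.utils.changes.
def _example_update_usage(index, dataset):
    from datacube.utils.changes import allow_any
    index.datasets.update(dataset, updates_allowed={('platform',): allow_any})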
def add_dataset(pr, dt, metadict, file):
    """Add a dataset to the datacube database

    It's added to 2 tables:
      - dataset: with all the metadata
      - dataset_location

    Args:
        pr (ProductResource): A ProductResource object, contained in the return of ``add_product``
        dt (DatasetType): A DatasetType object, contained in the return of ``add_product``
        metadict (dict): Dictionary containing dataset metadata, generally generated
            by ``metadict_from_netcdf``
        file (str): Path of the file to add to the index

    Return:
        No return, the function is used for its side effect of adding a dataset
        to the datacube
    """
    db = PostgresDb.from_config(CONFIG)
    dataset_resource = DatasetResource(db, pr)
    dataset = Dataset(dt, metadict, sources={})
    dataset_resource.add(dataset)
    uid = metadict['id']
    dataset_resource.add_location(uid, file)