def datasets_by_region(engine, index, product_name, region_code, time_range, limit):
    """
    Return the datasets of a product that have the given region code,
    optionally limited to a time range.
    """
    product = index.products.get_by_name(product_name)
    query = (
        select(postgres_api._DATASET_SELECT_FIELDS)
        .select_from(
            DATASET_SPATIAL.join(DATASET, DATASET_SPATIAL.c.id == DATASET.c.id)
        )
        .where(DATASET_SPATIAL.c.region_code == bindparam("region_code", region_code))
        .where(
            DATASET_SPATIAL.c.dataset_type_ref
            == bindparam("dataset_type_ref", product.id)
        )
    )
    if time_range:
        query = query.where(
            DATASET_SPATIAL.c.center_time > bindparam("from_time", time_range.begin)
        ).where(DATASET_SPATIAL.c.center_time < bindparam("to_time", time_range.end))

    query = query.order_by(DATASET_SPATIAL.c.center_time).limit(
        bindparam("limit", limit)
    )
    return (
        index.datasets._make(res, full_info=True)
        for res in engine.execute(query).fetchall()
    )
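

# Illustrative usage sketch (not part of the original module): how
# datasets_by_region might be called. The product name, region code and date
# range below are hypothetical; adjust them to your own deployment.
def _example_datasets_by_region(index: Index):
    from datetime import datetime

    from datacube.model import Range

    engine = alchemy_engine(index)
    for dataset in datasets_by_region(
        engine,
        index,
        product_name="ls8_nbar_scene",  # hypothetical product name
        region_code="090084",  # hypothetical region code (WRS path/row)
        time_range=Range(datetime(2019, 1, 1), datetime(2019, 2, 1)),
        limit=10,
    ):
        print(dataset.id)
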
def refresh_product(index: Index, product: DatasetType):
    """
    Fill in any missing spatial extents for a product's datasets, synthesizing
    footprints from WRS path/row when the product has no native spatial fields.
    """
    engine: Engine = alchemy_engine(index)
    insert_count = _populate_missing_dataset_extents(engine, product)

    # If we inserted data...
    if insert_count:
        # And it's a non-spatial product...
        if get_dataset_extent_alchemy_expression(product.metadata_type) is None:
            # And it has WRS path/rows...
            if "sat_path" in product.metadata_type.dataset_fields:
                # We can synthesize the polygons!
                _LOG.debug("spatial_synthesizing.start", product_name=product.name)

                shapes = _get_path_row_shapes()
                rows = [
                    row
                    for row in index.datasets.search_returning(
                        ("id", "sat_path", "sat_row"), product=product.name
                    )
                    if row.sat_path.lower is not None
                ]
                if rows:
                    engine.execute(
                        DATASET_SPATIAL.update()
                        .where(DATASET_SPATIAL.c.id == bindparam("dataset_id"))
                        .values(footprint=bindparam("footprint")),
                        [
                            dict(
                                dataset_id=id_,
                                footprint=from_shape(
                                    shapely.ops.unary_union(
                                        [
                                            shapes[(int(sat_path.lower), row)]
                                            for row in range(
                                                int(sat_row.lower),
                                                int(sat_row.upper) + 1,
                                            )
                                        ]
                                    ),
                                    srid=4326,
                                    extended=True,
                                ),
                            )
                            for id_, sat_path, sat_row in rows
                        ],
                    )
                _LOG.debug("spatial_synthesizing.done", product_name=product.name)

    return insert_count
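

# Minimal driver sketch (an assumption, not part of the original module):
# running refresh_product over every product in an index.
def _example_refresh_all_products(index: Index):
    for product in index.products.get_all():
        inserted = refresh_product(index, product)
        _LOG.info(
            "refresh_product.done", product_name=product.name, inserted=inserted
        )
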
def _populate_missing_dataset_extents(
    engine: Engine, product: DatasetType, force_update_all=False
):
    """
    Insert extent rows for any of the product's (unarchived) datasets that are
    missing from the spatial table, or rewrite all of its rows if
    force_update_all is set.
    """
    columns = {c.name: c for c in _select_dataset_extent_columns(product)}

    if force_update_all:
        query = (
            DATASET_SPATIAL.update()
            .values(**columns)
            .where(DATASET_SPATIAL.c.id == columns["id"])
            .where(
                DATASET.c.dataset_type_ref
                == bindparam("product_ref", product.id, type_=SmallInteger)
            )
            .where(DATASET.c.archived.is_(None))
        )
    else:
        query = (
            postgres.insert(DATASET_SPATIAL)
            .from_select(
                columns.keys(),
                select(columns.values())
                .where(
                    DATASET.c.dataset_type_ref
                    == bindparam("product_ref", product.id, type_=SmallInteger)
                )
                .where(DATASET.c.archived.is_(None))
                .order_by(columns["center_time"]),
            )
            .on_conflict_do_nothing(index_elements=["id"])
        )
    # print(as_sql(query))

    _LOG.debug(
        "spatial_insert_query.start",
        product_name=product.name,
        force_update_all=force_update_all,
    )
    changed = engine.execute(query).rowcount
    _LOG.debug(
        "spatial_insert_query.end", product_name=product.name, change_count=changed
    )
    return changed
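

# Sketch only (an assumption, not part of the original module): forcing a full
# rewrite of extent rows for one product via the private helper above. The
# product name is hypothetical.
def _example_force_recompute_extents(index: Index):
    product = index.products.get_by_name("ls8_nbar_scene")  # hypothetical name
    engine = alchemy_engine(index)
    # force_update_all rewrites every existing (unarchived) row instead of
    # only inserting the missing ones.
    changed = _populate_missing_dataset_extents(
        engine, product, force_update_all=True
    )
    print(f"{changed} extent rows rewritten for {product.name}")
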
def refresh_spatial_extents(
    index: Index,
    product: DatasetType,
    clean_up_deleted=False,
    assume_after_date: datetime = None,
):
    """
    Update the spatial extents to match any changes upstream in ODC.

    :param assume_after_date: Only scan datasets that have changed after the given
        (db server) time. If None, all datasets will be regenerated.
    :param clean_up_deleted: Scan for any manually deleted rows too. Slow.
    """
    engine: Engine = alchemy_engine(index)
    log = _LOG.bind(product_name=product.name, after_date=assume_after_date)

    # First, remove any archived datasets from our spatial table.
    datasets_to_delete = (
        select([DATASET.c.id])
        .where(DATASET.c.archived.isnot(None))
        .where(DATASET.c.dataset_type_ref == product.id)
    )
    if assume_after_date is not None:
        # Note that we use "dataset_changed_expression" to scan the datasets,
        # rather than "where archived > date", because the latter has no index!
        # (... and we're using dataset_changed_expression's index everywhere else,
        # so it's probably still in memory and super fast!)
        datasets_to_delete = datasets_to_delete.where(
            dataset_changed_expression() > assume_after_date
        )
    log.info("spatial_archival")
    changed = engine.execute(
        DATASET_SPATIAL.delete().where(DATASET_SPATIAL.c.id.in_(datasets_to_delete))
    ).rowcount
    log.info("spatial_archival.end", change_count=changed)

    # Forcing? Check every other dataset for removal, so we catch
    # manually-deleted rows from the table.
    if clean_up_deleted:
        log.warning("spatial_deletion_full_scan")
        changed += engine.execute(
            DATASET_SPATIAL.delete()
            .where(DATASET_SPATIAL.c.dataset_type_ref == product.id)
            # Where it doesn't exist in the ODC dataset table.
            .where(
                ~DATASET_SPATIAL.c.id.in_(
                    select([DATASET.c.id]).where(
                        DATASET.c.dataset_type_ref == product.id
                    )
                )
            )
        ).rowcount
        log.info("spatial_deletion_scan.end", change_count=changed)

    # We'll update first, then insert new records.
    # -> We do it in this order so that inserted records aren't immediately updated.
    # (Note: why don't we do this in one upsert? Because we get our sqlalchemy
    # expressions through ODC's APIs and can't choose alternative table aliases
    # to make sub-queries. Maybe you can figure out a workaround, though?)
    column_values = {c.name: c for c in _select_dataset_extent_columns(product)}

    only_where = [
        DATASET.c.dataset_type_ref
        == bindparam("product_ref", product.id, type_=SmallInteger),
        DATASET.c.archived.is_(None),
    ]
    if assume_after_date is not None:
        only_where.append(dataset_changed_expression() > assume_after_date)
    else:
        log.warning("spatial_update.recreating_everything")

    # Update any changed datasets.
    log.info(
        "spatial_update", product_name=product.name, after_date=assume_after_date
    )
    changed += engine.execute(
        DATASET_SPATIAL.update()
        .values(**column_values)
        .where(DATASET_SPATIAL.c.id == column_values["id"])
        .where(and_(*only_where))
    ).rowcount
    log.info("spatial_update.end", product_name=product.name, change_count=changed)

    # ... and insert new ones.
    log.info(
        "spatial_insert", product_name=product.name, after_date=assume_after_date
    )
    changed += engine.execute(
        postgres.insert(DATASET_SPATIAL)
        .from_select(
            column_values.keys(),
            select(column_values.values())
            .where(and_(*only_where))
            .order_by(column_values["center_time"]),
        )
        .on_conflict_do_nothing(index_elements=["id"])
    ).rowcount
    log.info("spatial_insert.end", product_name=product.name, change_count=changed)

    # If we changed data...
    if changed:
        # And it's a non-spatial product...
        if get_dataset_extent_alchemy_expression(product.metadata_type) is None:
            # And it has WRS path/rows...
if "sat_path" in product.metadata_type.dataset_fields: # We can synthesize the polygons! log.info("spatial_synthesizing", ) shapes = _get_path_row_shapes() rows = [ row for row in index.datasets.search_returning( ("id", "sat_path", "sat_row"), product=product.name) if row.sat_path.lower is not None ] if rows: engine.execute( DATASET_SPATIAL.update().where( DATASET_SPATIAL.c.id == bindparam("dataset_id")). values(footprint=bindparam("footprint")), [ dict( dataset_id=id_, footprint=from_shape( shapely.ops.unary_union([ shapes[(int(sat_path.lower), row)] for row in range( int(sat_row.lower), int(sat_row.upper) + 1, ) ]), srid=4326, extended=True, ), ) for id_, sat_path, sat_row in rows ], ) log.info("spatial_synthesizing.end", ) return changed
def search_items(
    self,
    *,
    product_name: Optional[str] = None,
    time: Optional[Tuple[datetime, datetime]] = None,
    bbox: Tuple[float, float, float, float] = None,
    limit: int = 500,
    offset: int = 0,
    full_dataset: bool = False,
    dataset_ids: Sequence[UUID] = None,
    require_geometry=True,
    ordered=True,
) -> Generator[DatasetItem, None, None]:
    """
    Search datasets using Cubedash's spatial table.

    Returned as DatasetItem records, with optional embedded full Datasets
    (if full_dataset==True).

    Returned results are always sorted by (center_time, id).
    """
    geom = func.ST_Transform(DATASET_SPATIAL.c.footprint, 4326)

    columns = [
        geom.label("geometry"),
        func.Box2D(geom).label("bbox"),
        # TODO: dataset label?
        DATASET_SPATIAL.c.region_code.label("region_code"),
        DATASET_SPATIAL.c.creation_time,
        DATASET_SPATIAL.c.center_time,
    ]

    # If fetching the whole dataset, we need to join the ODC dataset table.
    if full_dataset:
        query: Select = select(
            (*columns, *_utils.DATASET_SELECT_FIELDS)
        ).select_from(
            DATASET_SPATIAL.join(
                ODC_DATASET, onclause=ODC_DATASET.c.id == DATASET_SPATIAL.c.id
            )
        )
    # Otherwise query purely from the spatial table.
    else:
        query: Select = select(
            (*columns, DATASET_SPATIAL.c.id, DATASET_SPATIAL.c.dataset_type_ref)
        ).select_from(DATASET_SPATIAL)

    if time:
        query = query.where(
            func.tstzrange(
                _utils.default_utc(time[0]),
                _utils.default_utc(time[1]),
                "[]",
                type_=TSTZRANGE,
            ).contains(DATASET_SPATIAL.c.center_time)
        )

    if bbox:
        query = query.where(
            func.ST_Transform(DATASET_SPATIAL.c.footprint, 4326).intersects(
                func.ST_MakeEnvelope(*bbox)
            )
        )

    if product_name:
        query = query.where(
            DATASET_SPATIAL.c.dataset_type_ref
            == select([ODC_DATASET_TYPE.c.id]).where(
                ODC_DATASET_TYPE.c.name == product_name
            )
        )

    if dataset_ids:
        query = query.where(DATASET_SPATIAL.c.id.in_(dataset_ids))

    if require_geometry:
        query = query.where(DATASET_SPATIAL.c.footprint.isnot(None))

    if ordered:
        query = query.order_by(DATASET_SPATIAL.c.center_time, DATASET_SPATIAL.c.id)

    query = query.limit(limit).offset(
        # TODO: Offset/limit isn't particularly efficient for paging...
        offset
    )

    for r in self._engine.execute(query):
        yield DatasetItem(
            dataset_id=r.id,
            bbox=_box2d_to_bbox(r.bbox) if r.bbox else None,
            product_name=self.index.products.get(r.dataset_type_ref).name,
            geometry=_get_shape(r.geometry),
            region_code=r.region_code,
            creation_time=r.creation_time,
            center_time=r.center_time,
            odc_dataset=(
                _utils.make_dataset_from_select_fields(self.index, r)
                if full_dataset
                else None
            ),
        )
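

# Illustrative call sketch (an assumption, not part of the original class):
# querying the spatial table for a product over a bounding box and month.
# "store" stands for whatever object provides the search_items method above;
# the product name and coordinates are hypothetical.
def _example_search_items(store):
    from datetime import datetime

    for item in store.search_items(
        product_name="ls8_nbar_scene",
        time=(datetime(2019, 1, 1), datetime(2019, 2, 1)),
        bbox=(148.0, -36.0, 150.0, -34.0),  # min lon, min lat, max lon, max lat
        limit=10,
    ):
        print(item.dataset_id, item.region_code, item.center_time)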