def intersect_grids(geom_expr, geom_tables: List[Table]):
    """Generate Intersection Query Conditions with Grid Tables."""
    sub_where = []
    outer_join = []
    for geom_table in geom_tables:
        if not hasattr(geom_table.c, "tile"):
            break
        get_srid_expr = func.ST_SRID(geom_table.c.geom)
        sub_where.append(
            and_(
                func.ST_Intersects(func.ST_Transform(geom_expr, get_srid_expr),
                                   geom_table.c.geom),
                Tile.name == geom_table.c.tile,
            ))
        outer_join.append((geom_table, [Tile.name == geom_table.c.tile]))
    return [or_(*sub_where)], outer_join
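# Usage sketch (not from the original module) of how the pair returned by
# ``intersect_grids`` is consumed: the joins make ``Tile.name`` resolvable and
# the OR-ed clause restricts items to intersecting grid cells.
# ``example_grid_table`` is a hypothetical reflected grid geometry table.
def _example_intersect_grids_usage(session, example_grid_table):
    geom_expr = func.ST_GeomFromGeoJSON('{"type": "Point", "coordinates": [-54.0, -12.0]}')
    grid_where, joins = intersect_grids(geom_expr, geom_tables=[example_grid_table])
    q = session.query(Item)
    for table, conditions in joins:
        q = q.outerjoin(table, *conditions)
    return q.filter(*grid_where)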
def crs(self) -> Union[str, None]:
    """Retrieve the Coordinate Reference System (CRS) from the GRID."""
    spatial_ref_sys = Table('spatial_ref_sys', db.metadata, schema='public',
                            autoload=True, autoload_with=db.engine)

    geom_table = self.geom_table
    if geom_table is None:
        return None

    res = db.session.query(spatial_ref_sys.c.proj4text)\
        .filter(spatial_ref_sys.c.srid == func.ST_SRID(geom_table.c.geom))\
        .first()

    crs = None
    if res is not None:
        crs = res.proj4text
    return crs
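# Optional follow-up sketch: the returned proj4 string can be promoted to a
# rich CRS object. The ``pyproj`` dependency is an assumption of this sketch,
# not something the function above requires.
def _proj4_to_crs(proj4text):
    from pyproj import CRS  # assumed extra dependency for this sketch
    return CRS.from_proj4(proj4text) if proj4text else None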
def get_collection_items(
    collection_id=None,
    roles=[],
    item_id=None,
    bbox=None,
    datetime=None,
    ids=None,
    collections=None,
    intersects=None,
    page=1,
    limit=10,
    query=None,
    **kwargs,
):
    """Retrieve a list of collection items based on filters.

    :param collection_id: Single Collection ID to include in the search for items.
                          Only Items in the provided Collection will be searched, defaults to None
    :type collection_id: str, optional
    :param item_id: item identifier, defaults to None
    :type item_id: str, optional
    :param bbox: bounding box for intersection [west, north, east, south], defaults to None
    :type bbox: list, optional
    :param datetime: Single date+time, or a range ('/' separator), formatted to RFC 3339, section 5.6.
                     Use double dots '..' for open date ranges, defaults to None.
                     If the start or end date of an image generated by a temporal composition
                     intersects the given datetime or range it will be included in the result.
    :type datetime: str, optional
    :param ids: Array of Item ids to return. All other filter parameters that further
                restrict the number of search results are ignored, defaults to None
    :type ids: list, optional
    :param collections: Array of Collection IDs to include in the search for items.
                        Only Items in one of the provided Collections will be searched, defaults to None
    :type collections: list, optional
    :param intersects: Searches items by performing intersection between their geometry
                       and provided GeoJSON geometry. All GeoJSON geometry types must be
                       supported, defaults to None
    :type intersects: dict, optional
    :param page: The page offset of results, defaults to 1
    :type page: int, optional
    :param limit: The maximum number of results to return (page size), defaults to 10
    :type limit: int, optional
    :return: list of collection items
    :rtype: list
    """
    columns = [
        func.concat(Collection.name, "-", Collection.version).label("collection"),
        Collection.collection_type,
        Collection._metadata.label("meta"),
        Item._metadata.label("item_meta"),
        Item.name.label("item"),
        Item.id,
        Item.collection_id,
        Item.start_date.label("start"),
        Item.end_date.label("end"),
        Item.assets,
        Item.created,
        Item.updated,
        cast(Item.cloud_cover, Float).label("cloud_cover"),
        func.ST_AsGeoJSON(Item.geom).label("geom"),
        func.Box2D(Item.geom).label("bbox"),
        Tile.name.label("tile"),
    ]

    where = [
        Collection.id == Item.collection_id,
        or_(Collection.is_public.is_(True),
            Collection.id.in_([int(r.split(":")[0]) for r in roles])),
    ]

    if ids is not None:
        where += [Item.name.in_(ids.split(","))]
    else:
        if collections is not None:
            where += [func.concat(Collection.name, "-",
                                  Collection.version).in_(collections.split(","))]
        elif collection_id is not None:
            where += [func.concat(Collection.name, "-",
                                  Collection.version) == collection_id]

        if item_id is not None:
            where += [Item.name.like(item_id)]

        if query:
            filters = create_query_filter(query)
            if filters:
                where += filters

        if intersects is not None:
            where += [func.ST_Intersects(func.ST_GeomFromGeoJSON(str(intersects)),
                                         Item.geom)]
        elif bbox is not None:
            try:
                split_bbox = [float(x) for x in bbox.split(",")]
                if split_bbox[0] == split_bbox[2] or split_bbox[1] == split_bbox[3]:
                    raise InvalidBoundingBoxError("")

                where += [
                    func.ST_Intersects(
                        func.ST_MakeEnvelope(
                            split_bbox[0],
                            split_bbox[1],
                            split_bbox[2],
                            split_bbox[3],
                            func.ST_SRID(Item.geom),
                        ),
                        Item.geom,
                    )
                ]
            except (ValueError, InvalidBoundingBoxError):
                raise InvalidBoundingBoxError(f"'{bbox}' is not a valid bbox.")

        if datetime is not None:
            date_filter = None
            if "/" in datetime:
                matches_open = ("..", "")
                time_start, time_end = datetime.split("/")
                if time_start in matches_open:  # open start
                    date_filter = [or_(Item.start_date <= time_end,
                                       Item.end_date <= time_end)]
                elif time_end in matches_open:  # open end
                    date_filter = [or_(Item.start_date >= time_start,
                                       Item.end_date >= time_start)]
                else:  # closed range
                    date_filter = [
                        or_(
                            and_(Item.start_date >= time_start, Item.start_date <= time_end),
                            and_(Item.end_date >= time_start, Item.end_date <= time_end),
                            and_(Item.start_date < time_start, Item.end_date > time_end),
                        )
                    ]
            else:
                date_filter = [and_(Item.start_date <= datetime,
                                    Item.end_date >= datetime)]
            where += date_filter

    outer = [Item.tile_id == Tile.id]
    query = session.query(*columns).outerjoin(Tile, *outer)\
        .filter(*where).order_by(Item.start_date.desc(), Item.id)

    result = query.paginate(page=int(page),
                            per_page=int(limit),
                            error_out=False,
                            max_per_page=BDC_STAC_MAX_LIMIT)

    return result
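# Illustrative calls (the collection names are hypothetical). The ``datetime``
# argument accepts a single RFC 3339 instant or a '/'-separated range, with
# '..' (or an empty string) marking an open end:
#
#   get_collection_items(collection_id='S2-16D-2', datetime='2020-01-01/2020-12-31')
#   get_collection_items(collections='S2-16D-2,LC8-30D-1', datetime='2020-06-01/..')
#   get_collection_items(ids='S2-16D_V2_012014_20200101')  # other filters are ignored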
class Place(Base):
    __tablename__ = 'place'

    place_id = Column(BigInteger, primary_key=True, autoincrement=False)
    osm_type = Column(osm_type_enum, nullable=False)
    osm_id = Column(BigInteger, nullable=False)
    radius = Column(Integer)  # in metres, only for nodes
    display_name = Column(String, nullable=False)
    category = Column(String, nullable=False)
    type = Column(String, nullable=False)
    place_rank = Column(Integer, nullable=False)
    icon = Column(String)
    geom = Column(Geography(spatial_index=True))
    south = Column(Float, nullable=False)
    west = Column(Float, nullable=False)
    north = Column(Float, nullable=False)
    east = Column(Float, nullable=False)
    extratags = deferred(Column(JSON))
    address = deferred(Column(JSON))
    namedetails = deferred(Column(JSON))
    item_count = Column(Integer)
    candidate_count = Column(Integer)
    state = Column(String, index=True)
    override_name = Column(String)
    lat = Column(Float)
    lon = Column(Float)
    added = Column(DateTime, default=now_utc())
    wikidata_query_timeout = Column(Boolean, default=False)
    wikidata = Column(String)
    item_types_retrieved = Column(Boolean, default=False)
    index_hide = Column(Boolean, default=False)
    overpass_is_in = deferred(Column(JSON))
    existing_wikidata = deferred(Column(JSON))

    area = column_property(func.ST_Area(geom))
    geometry_type = column_property(func.GeometryType(geom))
    geojson = column_property(func.ST_AsGeoJSON(geom, 4), deferred=True)
    srid = column_property(func.ST_SRID(geom))
    npoints = column_property(func.ST_NPoints(cast(geom, Geometry)), deferred=True)
    # match_ratio = column_property(candidate_count / item_count)
    num_geom = column_property(func.ST_NumGeometries(cast(geom, Geometry)),
                               deferred=True)

    items = relationship('Item',
                         secondary='place_item',
                         lazy='dynamic',
                         backref=backref('places', lazy='dynamic'))

    __table_args__ = (UniqueConstraint('osm_type', 'osm_id'),)

    @property
    def osm_url(self):
        return f'{base_osm_url}/{self.osm_type}/{self.osm_id}'

    @classmethod
    def get_by_osm(cls, osm_type, osm_id):
        return cls.query.filter_by(osm_type=osm_type, osm_id=osm_id).one_or_none()

    @classmethod
    def from_osm(cls, osm_type, osm_id):
        place = cls.get_by_osm(osm_type, osm_id)
        if place:
            return place
        hit = nominatim.reverse(osm_type, osm_id)
        try:
            place = Place.from_nominatim(hit)
        except KeyError:
            return None
        session.add(place)
        session.commit()
        return place

    @property
    def type_label(self):
        t = self.type.replace('_', ' ')
        cat = self.category.replace('_', ' ')
        if cat == 'place':
            return t
        if t == 'yes':
            return cat
        return t + ' ' + cat

    @classmethod
    def get_by_wikidata(cls, qid):
        q = cls.query.filter_by(wikidata=qid)
        try:
            return q.one_or_none()
        except MultipleResultsFound:
            return None

    def get_address_key(self, key):
        if isinstance(self.address, dict):
            return self.address.get(key)
        for line in self.address or []:
            if line['type'] == key:
                return line['name']

    @property
    def country_code(self):
        return self.get_address_key('country_code')

    @property
    def country(self):
        return self.get_address_key('country')

    @classmethod
    def get_or_abort(cls, osm_type, osm_id):
        place = cls.get_by_osm(osm_type, osm_id)
        if place:
            return place
        abort(404)

    @hybrid_property
    def area_in_sq_km(self):
        return self.area / (1000 * 1000)

    @property
    def type_and_id(self):
        return (self.osm_type, self.osm_id)

    @property
    def too_big(self):
        max_area = current_app.config['PLACE_MAX_AREA']
        return self.area_in_sq_km > max_area

    @property
    def too_complex(self):
        return self.npoints > current_app.config['PLACE_MAX_NPOINTS']

    @property
    def bad_geom_type(self):
        return self.geometry_type in {'LINESTRING', 'MULTILINESTRING'}
    @property
    def area_in_range(self):
        min_area = current_app.config['PLACE_MIN_AREA']
        if g.user.is_authenticated:
            max_area = current_app.config['PLACE_MAX_AREA']
        else:
            max_area = current_app.config['PLACE_MAX_AREA_ANON']
        return min_area < self.area_in_sq_km < max_area

    @property
    def allowed_cat(self):
        cats = {'place', 'boundary', 'natural', 'leisure', 'amenity', 'landuse'}
        return self.category in cats

    @property
    def matcher_allowed(self):
        '''Are we allowed to run the matcher for this place?'''
        allow_node = bool(current_app.config.get('ALLOW_NODE_MATCH'))
        if self.osm_type == 'node':
            return allow_node
        return (not self.bad_geom_type
                and self.allowed_cat
                and self.area_in_range
                and not self.too_complex)

    def update_from_nominatim(self, hit):
        if self.place_id != int(hit['place_id']):
            print((self.place_id, hit['place_id']))
            self.place_id = hit['place_id']
        keys = ('lat', 'lon', 'display_name', 'place_rank', 'category', 'type',
                'icon', 'extratags', 'namedetails')
        assert all(hit[n] is not None for n in ('lat', 'lon'))
        for n in keys:
            setattr(self, n, hit.get(n))
        bbox = hit['boundingbox']
        assert all(i is not None for i in bbox)
        (self.south, self.north, self.west, self.east) = bbox
        self.address = [dict(name=n, type=t) for t, n in hit['address'].items()]
        self.wikidata = hit['extratags'].get('wikidata')
        self.geom = hit['geotext']

    def change_comment(self, item_count):
        if item_count == 1:
            return g.user.single or default_change_comments['single']
        comment = getattr(g.user, 'multi', None) or default_change_comments['multi']
        return comment.replace('PLACE', self.name_for_change_comment)

    @property
    def name_for_changeset(self):
        address = self.address
        n = self.name
        if not address:
            return self.name
        if isinstance(address, list):
            d = {a['type']: a['name'] for a in address}
        elif isinstance(address, dict):
            d = address

        if d.get('country_code') == 'us':
            state = d.get('state')
            if state and n != state:
                return n + ', ' + state

        country = d.get('country')
        if country and self.name != country:
            return '{} ({})'.format(self.name, country)

        return self.name

    def update_address(self):
        hit = nominatim.reverse(self.osm_type, self.osm_id, polygon_text=0)
        self.address = [dict(name=n, type=t) for t, n in hit['address'].items()]
        session.commit()

    @property
    def name_for_change_comment(self):
        n = self.name
        if self.address:
            if isinstance(self.address, dict):
                self.update_address()
            address = {a['type']: a['name'] for a in self.address}

            parts = []
            country_code = address.get('country_code')
            skip = {'country_code', 'postcode'}
            if country_code in {'us'}:
                skip.add('county')
            if country_code in {'gb', 'us'} and 'state' in address:
                skip.add('country')
            if self.type in {'university', 'hospital', 'administrative'}:
                skip |= {'path', 'footway', 'road', 'neighbourhood'}

            if (country_code == 'gb' and self.category == 'boundary'
                    and self.type in {'traditional', 'ceremonial', 'historic'}):
                parts = [a for a in self.address
                         if a['type'] in {'state_district', 'state'}]
            else:
                parts = [a for a in self.address if a['type'] not in skip]

            name_parts = [n]
            prev_part = n
            for part in parts:
                if part['name'] == prev_part or (
                        part['type'] != 'city'
                        and (part['name'] in prev_part
                             or prev_part in part['name'])):
                    continue
                name_parts.append(part['name'])
                prev_part = part['name']
            n = ', '.join(name_parts)

        if (' of ' in n or 'national park' in n.lower()) and ', ' not in n:
            return 'the ' + n
        else:
            return n

    @classmethod
    def from_nominatim(cls, hit):
        keys = ('place_id', 'osm_type', 'osm_id', 'lat', 'lon', 'display_name',
                'place_rank', 'category', 'type', 'icon', 'extratags',
                'namedetails')
        n = {k: hit[k] for k in keys if k in hit}
        bbox = hit['boundingbox']
        (n['south'], n['north'], n['west'], n['east']) = bbox
        n['geom'] = hit['geotext']
        n['address'] = [dict(name=n, type=t) for t, n in hit['address'].items()]
        if 'extratags' in hit:
            n['wikidata'] = hit['extratags'].get('wikidata')
        return cls(**n)

    @classmethod
    def get_or_add_place(cls, hit):
        place = cls.query.filter_by(osm_type=hit['osm_type'],
                                    osm_id=hit['osm_id']).one_or_none()

        if place and place.place_id != hit['place_id']:
            place.update_from_nominatim(hit)
        elif not place:
            place = Place.query.get(hit['place_id'])
            if place:
                place.update_from_nominatim(hit)
            else:
                place = cls.from_nominatim(hit)
                session.add(place)
        session.commit()
        return place

    @property
    def match_ratio(self):
        if self.item_count:
            return self.candidate_count / self.item_count

    @property
    def bbox(self):
        return (self.south, self.north, self.west, self.east)

    @property
    def is_point(self):
        return self.osm_type == 'node'

    @property
    def display_area(self):
        return '{:.1f} km²'.format(self.area_in_sq_km)

    def get_wikidata_query(self):
        # this is an old function, it isn't used by the matcher
        if self.osm_type == 'node':
            radius = self.radius or radius_default
            query = wikidata.get_point_query(self.lat, self.lon, radius)
        else:
            query = wikidata.get_enwiki_query(*self.bbox)
        return query

    def point_wikidata_items(self):
        radius = self.radius or radius_default
        query_map = wikidata.point_query_map(self.lat, self.lon, radius)
        return self.items_from_wikidata(query_map)

    def bbox_wikidata_items(self, bbox=None):
        if bbox is None:
            bbox = self.bbox

        query_map = wikidata.bbox_query_map(*bbox)
        items = self.items_from_wikidata(query_map)

        # Would be nice to include OSM chunk information with each
        # item. Not doing it at this point because it means lots
        # of queries. Easier once the items are loaded into the database.

        return {k: v for k, v in items.items() if self.covers(v)}

    def items_from_wikidata(self, query_map):
        rows = wikidata.run_query(query_map['enwiki'])
        items = wikidata.parse_enwiki_query(rows)

        try:
            # add items with the coordinates in the HQ field
            rows = wikidata.run_query(query_map['hq_enwiki'])
            items.update(wikidata.parse_enwiki_query(rows))
        except wikidata_api.QueryError:
            pass  # HQ query timeout isn't fatal

        rows = wikidata.run_query(query_map['item_tag'])
        wikidata.parse_item_tag_query(rows, items)

        try:
            # add items with the coordinates in the HQ field
            rows = wikidata.run_query(query_map['hq_item_tag'])
            wikidata.parse_item_tag_query(rows, items)
        except wikidata_api.QueryError:
            pass  # HQ query timeout isn't fatal

        return items

    def covers(self, item):
        '''Is the given item within the geometry of this place?'''
        q = (select([func.ST_Covers(Place.geom, item['location'])])
             .where(Place.place_id == self.place_id))
        return object_session(self).scalar(q)

    def add_tags_to_items(self):
        for item in self.items.filter(Item.categories != '{}'):
            # if wikidata says this is a place then adding tags
            # from wikipedia can just confuse things
            if any(t.startswith('place') for t in item.tags):
                continue
            for t in matcher.categories_to_tags(item.categories):
                item.tags.add(t)

    @property
    def prefix(self):
        return f'osm_{self.place_id}'

    @property
    def identifier(self):
        return f'{self.osm_type}/{self.osm_id}'

    @property
    def overpass_filename(self):
        overpass_dir = current_app.config['OVERPASS_DIR']
        return os.path.join(overpass_dir, '{}.xml'.format(self.place_id))

    def is_overpass_filename(self, f):
        '''Does the overpass filename belong to this place?'''
        place_id = str(self.place_id)
        return f == place_id + '.xml' or f.startswith(place_id + '_')

    def delete_overpass(self):
        for f in os.scandir(current_app.config['OVERPASS_DIR']):
            if self.is_overpass_filename(f.name):
                os.remove(f.path)

    def clean_up(self):
        place_id = self.place_id

        engine = session.bind
        for t in get_tables():
            if not t.startswith(self.prefix):
                continue
            engine.execute(f'drop table if exists {t}')
        engine.execute('commit')

        overpass_dir = current_app.config['OVERPASS_DIR']
        for f in os.listdir(overpass_dir):
            if not any(f.startswith(str(place_id) + end) for end in ('_', '.')):
                continue
            os.remove(os.path.join(overpass_dir, f))

    @property
    def overpass_done(self):
        return os.path.exists(self.overpass_filename)

    def items_with_candidates(self):
        return self.items.join(ItemCandidate)

    def items_with_candidates_count(self):
        if self.state != 'ready':
            return
        return (session.query(Item.item_id)
                .join(PlaceItem)
                .join(Place)
                .join(ItemCandidate)
                .filter(Place.place_id == self.place_id)
                .group_by(Item.item_id)
                .count())

    def items_without_candidates(self):
        return self.items.outerjoin(ItemCandidate).filter(
            ItemCandidate.item_id.is_(None))

    def items_with_multiple_candidates(self):
        # select count(*) from (select 1 from item, item_candidate
        #     where item.item_id=item_candidate.item_id) x;
        q = (self.items.join(ItemCandidate)
             .group_by(Item.item_id)
             .having(func.count(Item.item_id) > 1)
             .with_entities(Item.item_id))
        return q

    @property
    def name(self):
        if self.override_name:
            return self.override_name

        name = self.namedetails.get('name:en') or self.namedetails.get('name')
        display = self.display_name

        if not name:
            return display

        for short in ('City', '1st district'):
            start = len(short) + 2
            if (name == short and display.startswith(short + ', ')
                    and ', ' in display[start:]):
                name = display[:display.find(', ', start)]
                break

        return name

    @property
    def name_extra_detail(self):
        for n in 'name:en', 'name':
            if n not in self.namedetails:
                continue
            start = self.namedetails[n] + ', '
            if self.display_name.startswith(start):
                return self.display_name[len(start):]

    @property
    def export_name(self):
        return self.name.replace(':', '').replace(' ', '_')

    def items_with_instanceof(self):
        return [item for item in self.items if item.instanceof()]

    def osm2pgsql_cmd(self, filename=None):
        if filename is None:
            filename = self.overpass_filename
        style = os.path.join(current_app.config['DATA_DIR'], 'matcher.style')
        return ['osm2pgsql', '--create', '--drop', '--slim',
                '--hstore-all', '--hstore-add-index',
                '--prefix', self.prefix,
                '--cache', '500',
                '--style', style,
                '--multi-geometry',
                '--host', current_app.config['DB_HOST'],
                '--username', current_app.config['DB_USER'],
                '--database', current_app.config['DB_NAME'],
                filename]

    def load_into_pgsql(self, filename=None, capture_stderr=True):
        if filename is None:
            filename = self.overpass_filename
        if not os.path.exists(filename):
            return 'no data from overpass to load with osm2pgsql'

        if os.stat(filename).st_size == 0:
            return 'no data from overpass to load with osm2pgsql'

        cmd = self.osm2pgsql_cmd(filename)

        if not capture_stderr:
            p = subprocess.run(cmd,
                               env={'PGPASSWORD': current_app.config['DB_PASS']})
            return
        p = subprocess.run(cmd,
                           stderr=subprocess.PIPE,
                           env={'PGPASSWORD': current_app.config['DB_PASS']})

        if p.returncode != 0:
            if b'Out of memory' in p.stderr:
                return 'out of memory'
            else:
                return p.stderr.decode('utf-8')

    def save_overpass(self, content):
        with open(self.overpass_filename, 'wb') as out:
            out.write(content)

    @property
    def all_tags(self):
        tags = set()
        for item in self.items:
            tags |= set(item.tags)
            tags |= item.disused_tags()
        tags.difference_update(skip_tags)
        return matcher.simplify_tags(tags)

    @property
    def overpass_type(self):
        return overpass_types[self.osm_type]

    @property
    def overpass_filter(self):
        return 'around:{0.radius},{0.lat},{0.lon}'.format(self)

    @property
    def wikidata_item_id(self):
        if self.wikidata:
            return int(self.wikidata[1:])

    def building_names(self):
        re_paren = re.compile(r'\(.+\)')
        re_drop = re.compile(r'\b(the|and|at|of|de|le|la|les|von)\b')
        names = set()
        for building in (item for item in self.items if 'building' in item.tags):
            for n in building.names():
                if n[0].isdigit() and ',' in n:
                    continue
                n = n.lower()
                comma = n.rfind(', ')
                if comma != -1 and not n[0].isdigit():
                    n = n[:comma]
                n = re_paren.sub('', n).replace("'s", "('s)?")
                n = n.replace('(', '').replace(')', '').replace('.', r'\.')
                names.add(n)
                names.add(re_drop.sub('', n))

        names = sorted(n.replace(' ', r'\W*') for n in names)
        if names:
            return '({})'.format('|'.join(names))

    def get_point_oql(self, buildings_special=False):
        tags = self.all_tags

        if buildings_special and 'building' in tags:
            buildings = self.building_names()
            tags.remove('building')
        else:
            buildings = None

        radius = self.radius or radius_default
        return overpass.oql_for_point(self.lat, self.lon, radius, tags, buildings)

    def get_bbox_oql(self, buildings_special=False):
        bbox = f'{self.south:f},{self.west:f},{self.north:f},{self.east:f}'

        tags = self.all_tags

        if buildings_special and 'building' in tags:
            buildings = self.building_names()
            tags.remove('building')
        else:
            buildings = None

        return overpass.oql_for_area(self.overpass_type, self.osm_id,
                                     tags, bbox, buildings)

        # everything below the return is unreachable legacy code, kept as-is
        union = ['{}({});'.format(self.overpass_type, self.osm_id)]

        for tag in self.all_tags:
            u = (oql_from_tag(tag, filters=self.overpass_filter)
                 if self.osm_type == 'node'
                 else oql_from_tag(tag))
            if u:
                union += u

        if self.osm_type == 'node':
            oql = ('[timeout:300][out:xml];\n' +
                   '({});\n' +
                   '(._;>;);\n' +
                   'out qt;').format(''.join(union))
            return oql

        bbox = '{:f},{:f},{:f},{:f}'.format(self.south, self.west,
                                            self.north, self.east)
        offset = {'way': 2400000000, 'relation': 3600000000}
        area_id = offset[self.osm_type] + int(self.osm_id)

        oql = ('[timeout:300][out:xml][bbox:{}];\n' +
               'area({})->.a;\n' +
               '({});\n' +
               '(._;>;);\n' +
               'out qt;').format(bbox, area_id, ''.join(union))
        return oql

    def get_oql(self, buildings_special=False):
        # forward the flag to the helpers instead of hard-coding False
        if self.is_point:
            return self.get_point_oql(buildings_special=buildings_special)
        else:
            return self.get_bbox_oql(buildings_special=buildings_special)

    def candidates_url(self, **kwargs):
        if g.get('filter'):
            kwargs['name_filter'] = g.filter
            endpoint = 'candidates_with_filter'
        else:
            endpoint = 'candidates'

        return self.place_url(endpoint, **kwargs)

    def place_url(self, endpoint, **kwargs):
        return url_for(endpoint,
                       osm_type=self.osm_type,
                       osm_id=self.osm_id,
                       **kwargs)

    def browse_url(self):
        if self.wikidata:
            return url_for('browse_page', item_id=self.wikidata_item_id)

    def next_state_url(self):
        return (self.candidates_url()
                if self.state == 'ready'
                else self.matcher_progress_url())

    def matcher_progress_url(self):
        return self.place_url('matcher.matcher_progress')

    def matcher_done_url(self, start):
        return self.place_url('matcher.matcher_done', start=start)

    def item_list(self):
        lang = self.most_common_language() or 'en'
        q = self.items.filter(Item.entity.isnot(None)).order_by(Item.item_id)
        return [{'id': i.item_id, 'name': i.label(lang=lang)} for i in q]

    def save_items(self, items, debug=None):
        if debug is None:
            def debug(msg):
                pass
        debug('save items')
        seen = {}
        for qid, v in items.items():
            wikidata_id = int(qid[1:])
            item = Item.query.get(wikidata_id)
            debug(f'saving: {qid}')
            if item:
                item.location = v['location']
            else:
                item = Item(item_id=wikidata_id, location=v['location'])
                session.add(item)
            for k in 'enwiki', 'categories', 'query_label':
                if k in v:
                    setattr(item, k, v[k])

            tags = set(v['tags'])
            # if wikidata says this is a place then adding tags
            # from wikipedia can just confuse things
            # Wikipedia articles sometimes combine a village and a windmill
            # or a neighbourhood and a light rail station.
            # Exception for place tags, we always add place tags from
            # Wikipedia categories.
            if 'categories' in v:
                is_place = any(t.startswith('place') for t in tags)
                for t in matcher.categories_to_tags(v['categories']):
                    if t.startswith('place') or not is_place:
                        tags.add(t)

            # drop_building_tag(tags)
            tags -= skip_tags
            item.tags = tags
            if qid in seen:
                continue
            seen[qid] = item

            existing = PlaceItem.query.filter_by(item=item, place=self).one_or_none()
            if not existing:
                place_item = PlaceItem(item=item, place=self)
                session.add(place_item)
            debug(f'saved: {qid}')

        for item in self.items:
            if item.qid in seen:
                continue
            link = PlaceItem.query.filter_by(item=item, place=self).one()
            session.delete(link)
        debug('done')
        return seen

    def load_items(self, bbox=None, debug=False):
        if bbox is None:
            bbox = self.bbox

        items = self.bbox_wikidata_items(bbox)
        if debug:
            print('{:d} items'.format(len(items)))

        wikipedia.add_enwiki_categories(items)

        self.save_items(items)
        session.commit()

    def load_extracts(self, debug=False, progress=None):
        for code, _ in self.languages_wikidata():
            self.load_extracts_wiki(debug=debug, progress=progress, code=code)

    def load_extracts_wiki(self, debug=False, progress=None, code='en'):
        wiki = code + 'wiki'
        by_title = {item.sitelinks()[wiki]['title']: item
                    for item in self.items
                    if wiki in (item.sitelinks() or {})}

        query_iter = wikipedia.get_extracts(by_title.keys(), code=code)
        for title, extract in query_iter:
            item = by_title[title]
            if debug:
                print(title)
            item.extracts[wiki] = extract
            if wiki == 'enwiki':
                item.extract_names = wikipedia.html_names(extract)
            if progress:
                progress(item)

    def wbgetentities(self, debug=False):
        sub = (session.query(Item.item_id)
               .join(ItemTag)
               .group_by(Item.item_id)
               .subquery())
        q = (self.items.filter(Item.item_id == sub.c.item_id)
             .options(load_only(Item.qid)))

        if debug:
            print('running wbgetentities query')
            print(q)
            print(q.count())

        items = {i.qid: i for i in q}
        if debug:
            print('{} items'.format(len(items)))
        for qid, entity in wikidata_api.entity_iter(items.keys(), debug=debug):
            if debug:
                print(qid)
            items[qid].entity = entity

    def languages_osm(self):
        lang_count = Counter()
        candidate_count = 0
        candidate_has_language_count = 0
        for c in self.items_with_candidates().with_entities(ItemCandidate):
            candidate_count += 1
            candidate_has_language = False
            for lang in c.languages():
                lang_count[lang] += 1
                candidate_has_language = True
            if candidate_has_language:
                candidate_has_language_count += 1

        return sorted(lang_count.items(), key=lambda i: i[1], reverse=True)

    def languages_wikidata(self):
        lang_count = Counter()
        item_count = self.items.count()
        count_sv = self.country_code in {'se', 'fi'}
        for item in self.items:
            if item.entity and 'labels' in item.entity:
                keys = item.entity['labels'].keys()
                if not count_sv and keys == {'ceb', 'sv'}:
                    continue
                for lang in keys:
                    if '-' in lang or lang == 'ceb':
                        continue
                    lang_count[lang] += 1

        if item_count > 10:
            # truncate the long tail of languages
            lang_count = {key: count
                          for key, count in lang_count.items()
                          if key == 'en' or count / item_count > 0.1}
        if self.country_code == 'us':
            lang_count = {key: count
                          for key, count in lang_count.items()
                          if key in {'en', 'es'}}

        if self.country_code == 'gb':
            lang_count = {key: count
                          for key, count in lang_count.items()
                          if key in {'en', 'fr', 'de', 'cy'}}

        return sorted(lang_count.items(), key=lambda i: i[1], reverse=True)[:10]

    def languages(self):
        wikidata = self.languages_wikidata()
        osm = dict(self.languages_osm())

        return [{'code': code, 'wikidata': count, 'osm': osm.get(code)}
                for code, count in wikidata]

    def most_common_language(self):
        lang_count = Counter()
        for item in self.items:
            if item.entity and 'labels' in item.entity:
                for lang in item.entity['labels'].keys():
                    lang_count[lang] += 1
        try:
            return lang_count.most_common(1)[0][0]
        except IndexError:
            return None

    def reset_all_items_to_not_done(self):
        place_items = (PlaceItem.query
                       .join(Item)
                       .filter(Item.entity.isnot(None),
                               PlaceItem.place == self,
                               PlaceItem.done == true())
                       .order_by(PlaceItem.item_id))

        for place_item in place_items:
            place_item.done = False
        session.commit()

    def matcher_query(self):
        return (PlaceItem.query
                .join(Item)
                .filter(Item.entity.isnot(None),
                        PlaceItem.place == self,
                        or_(PlaceItem.done.is_(None),
                            PlaceItem.done != true()))
                .order_by(PlaceItem.item_id))

    def run_matcher(self, debug=False, progress=None):
        if progress is None:
            def progress(candidates, item):
                pass

        conn = session.bind.raw_connection()
        cur = conn.cursor()

        self.existing_wikidata = matcher.get_existing(cur, self.prefix)

        place_items = self.matcher_query()
        total = place_items.count()
        # too many items means something has gone wrong
        assert total < 60_000

        for num, place_item in enumerate(place_items):
            item = place_item.item
            if debug:
                print('searching for', item.label())
                print(item.tags)
            if item.skip_item_during_match():
                candidates = []
            else:
                t0 = time()
                candidates = matcher.find_item_matches(cur, item,
                                                       self.prefix,
                                                       debug=debug)
                seconds = time() - t0
                if debug:
                    print('find_item_matches took {:.1f}'.format(seconds))
                    print('{}: {}'.format(len(candidates), item.label()))
            progress(candidates, item)

            # if this is a refresh we remove candidates that no longer match
            as_set = {(i['osm_type'], i['osm_id']) for i in candidates}
            for c in item.candidates[:]:
                if c.edits.count():
                    continue  # foreign keys mean we can't remove saved candidates
                if (c.osm_type, c.osm_id) not in as_set:
                    c.bad_matches.delete()
                    session.delete(c)

            if not candidates:
                continue

            for i in candidates:
                c = ItemCandidate.query.get((item.item_id, i['osm_id'], i['osm_type']))
                if c:
                    c.update(i)
                else:
                    c = ItemCandidate(**i, item=item)
                    session.add(c)
            place_item.done = True
            if num % 100 == 0:
                session.commit()

        self.item_count = self.items.count()
        self.candidate_count = self.items_with_candidates_count()
        session.commit()

        conn.close()

    def load_isa(self, progress=None):
        if progress is None:
            def progress(msg):
                pass

        isa_map = {item.qid: [isa_qid for isa_qid in item.instanceof()]
                   for item in self.items}
        isa_map = {qid: l for qid, l in isa_map.items() if l}
        if not isa_map:
            return

        download_isa = set()
        isa_obj_map = {}
        for qid, isa_list in isa_map.items():
            isa_objects = []
            # some Wikidata items feature two 'instance of' statements that
            # point to the same item
            # Example: Cambridge University Museum of Zoology (Q5025605)
            # https://www.wikidata.org/wiki/Q5025605
            seen_isa_qid = set()
            for isa_qid in isa_list:
                if isa_qid in seen_isa_qid:
                    continue
                seen_isa_qid.add(isa_qid)
                item_id = int(isa_qid[1:])
                isa = IsA.query.get(item_id)
                if not isa or not isa.entity:
                    download_isa.add(isa_qid)
                if not isa:
                    isa = IsA(item_id=item_id)
                    session.add(isa)
                isa_obj_map[isa_qid] = isa
                isa_objects.append(isa)
            item = Item.query.get(qid[1:])
            item.isa = isa_objects

        for qid, entity in wikidata_api.entity_iter(download_isa):
            isa_obj_map[qid].entity = entity
        session.commit()

    def do_match(self, debug=True):
        if self.state == 'ready':  # already done
            return

        if not self.state or self.state == 'refresh':
            print('load items')
            self.load_items()  # includes categories
            self.state = 'tags'
            session.commit()

        if self.state == 'tags':
            print('wbgetentities')
            self.wbgetentities(debug=debug)
            print('load extracts')
            self.load_extracts(debug=debug)
            self.state = 'wbgetentities'
            session.commit()

        if self.state in ('wbgetentities', 'overpass_error', 'overpass_timeout'):
            print('loading_overpass')
            self.get_overpass()
            self.state = 'postgis'
            session.commit()

        if self.state == 'postgis':
            print('running osm2pgsql')
            self.load_into_pgsql(capture_stderr=False)
            self.state = 'osm2pgsql'
            session.commit()

        if self.state == 'osm2pgsql':
            print('run matcher')
            self.run_matcher(debug=debug)
            self.state = 'load_isa'
            session.commit()

        if self.state == 'load_isa':
            print('load isa')
            self.load_isa()
            print('ready')
            self.state = 'ready'
            session.commit()

    def get_overpass(self):
        oql = self.get_oql()
        if self.area_in_sq_km < 800:
            r = overpass.run_query_persistent(oql)
            assert r
            self.save_overpass(r.content)
        else:
            self.chunk()

    def get_items(self):
        items = [item for item in self.items_with_candidates()
                 if all('wikidata' not in c.tags for c in item.candidates)]

        filter_list = matcher.filter_candidates_more(items, bad=get_bad(items))
        add_tags = []
        for item, match in filter_list:
            picked = match.get('candidate')
            if not picked:
                continue

            dist = picked.dist
            intersection = set()
            for k, v in picked.tags.items():
                tag = k + '=' + v
                if k in item.tags or tag in item.tags:
                    intersection.add(tag)

            if dist < 400:
                symbol = '+'
            elif dist < 4000 and intersection == {'place=island'}:
                symbol = '+'
            elif dist < 3000 and intersection == {'natural=wetland'}:
                symbol = '+'
            elif dist < 2000 and intersection == {'natural=beach'}:
                symbol = '+'
            elif dist < 2000 and intersection == {'natural=bay'}:
                symbol = '+'
            elif dist < 2000 and intersection == {'aeroway=aerodrome'}:
                symbol = '+'
            elif dist < 1000 and intersection == {'amenity=school'}:
                symbol = '+'
            elif dist < 800 and intersection == {'leisure=park'}:
                symbol = '+'
            elif dist < 2000 and intersection == {'landuse=reservoir'}:
                symbol = '+'
            elif dist < 3000 and item.tags == {'place', 'admin_level'}:
                symbol = '+'
            elif dist < 3000 and item.tags == {'place', 'place=town', 'admin_level'}:
                symbol = '+'
            elif (dist < 3000
                  and item.tags == {'admin_level', 'place', 'place=neighbourhood'}
                  and 'place' in picked.tags):
                symbol = '+'
            else:
                symbol = '?'
            print('{:1s} {:9s} {:5.0f} {!r} {!r}'.format(
                symbol, item.qid, picked.dist, item.tags, intersection))
            if symbol == '+':
                add_tags.append((item, picked))
        return add_tags

    def chunk_n(self, n):
        n = max(1, n)
        (south, north, west, east) = self.bbox
        ns = (north - south) / n
        ew = (east - west) / n

        chunks = []
        for row in range(n):
            for col in range(n):
                chunk = (south + ns * row, south + ns * (row + 1),
                         west + ew * col, west + ew * (col + 1))
                want_chunk = func.ST_Intersects(Place.geom, envelope(chunk))
                want = (session.query(want_chunk)
                        .filter(Place.place_id == self.place_id)
                        .scalar())
                if want:
                    chunks.append(chunk)

        return chunks

    def get_chunks(self):
        bbox_chunks = list(self.polygon_chunk(size=place_chunk_size))

        chunks = []
        need_self = True  # include self in first non-empty chunk
        for num, chunk in enumerate(bbox_chunks):
            filename = self.chunk_filename(num, bbox_chunks)
            oql = self.oql_for_chunk(chunk, include_self=need_self)
            chunks.append({
                'num': num,
                'oql': oql,
                'filename': filename,
            })
            if need_self and oql:
                need_self = False
        return chunks

    def chunk_filename(self, num, chunks):
        if len(chunks) == 1:
            return '{}.xml'.format(self.place_id)
        return '{}_{:03d}_{:03d}.xml'.format(self.place_id, num, len(chunks))

    def chunk(self):
        chunk_size = utils.calc_chunk_size(self.area_in_sq_km)
        chunks = self.chunk_n(chunk_size)

        print('chunk size:', chunk_size)

        files = []
        for num, chunk in enumerate(chunks):
            # chunk_filename expects the list of chunks, not their count
            filename = self.chunk_filename(num, chunks)
            # print(num, q.count(), len(tags), filename, list(tags))
            full = os.path.join('overpass', filename)
            files.append(full)
            if os.path.exists(full):
                continue
            oql = self.oql_for_chunk(chunk, include_self=(num == 0))

            r = overpass.run_query_persistent(oql)
            if not r:
                print(oql)
            assert r
            with open(full, 'wb') as out:
                out.write(r.content)

        cmd = ['osmium', 'merge'] + files + ['-o', self.overpass_filename]
        print(' '.join(cmd))
        subprocess.run(cmd)

    def oql_for_chunk(self, chunk, include_self=False):
        q = self.items.filter(
            cast(Item.location, Geometry).contained(envelope(chunk)))

        tags = set()
        for item in q:
            tags |= set(item.tags)
        tags.difference_update(skip_tags)
        tags = matcher.simplify_tags(tags)
        if not tags:
            print('no tags, skipping')
            return

        ymin, ymax, xmin, xmax = chunk
        bbox = '{:f},{:f},{:f},{:f}'.format(ymin, xmin, ymax, xmax)

        oql = overpass.oql_for_area(self.overpass_type, self.osm_id,
                                    tags, bbox, None,
                                    include_self=include_self)
        return oql

    def chunk_count(self):
        return sum(1 for _ in self.polygon_chunk(size=place_chunk_size))

    def geojson_chunks(self):
        chunks = []
        for chunk in self.polygon_chunk(size=place_chunk_size):
            clip = func.ST_Intersection(Place.geom, envelope(chunk))

            geojson = (session.query(func.ST_AsGeoJSON(clip, 4))
                       .filter(Place.place_id == self.place_id)
                       .scalar())
            chunks.append(geojson)
        return chunks

    def wikidata_chunk_size(self, size=22):
        if self.osm_type == 'node':
            return 1
        area = self.area_in_sq_km
        if area < 3000 and not self.wikidata_query_timeout:
            return 1
        return utils.calc_chunk_size(area, size=size)

    def polygon_chunk(self, size=64):
        stmt = (session.query(
                    func.ST_Dump(Place.geom.cast(Geometry())).label('x'))
                .filter_by(place_id=self.place_id)
                .subquery())
        q = session.query(stmt.c.x.path[1],
                          func.ST_Area(stmt.c.x.geom.cast(Geography)) / (1000 * 1000),
                          func.Box2D(stmt.c.x.geom))
        for num, area, box2d in q:
            chunk_size = utils.calc_chunk_size(area, size=size)
            west, south, east, north = map(float, re_box.match(box2d).groups())
            for chunk in bbox_chunk((south, north, west, east), chunk_size):
                yield chunk

    def latest_matcher_run(self):
        return self.matcher_runs.order_by(PlaceMatcher.start.desc()).first()

    def obj_for_json(self, include_geom=False):
        keys = [
            'osm_type', 'osm_id', 'display_name', 'name', 'extratags',
            'address', 'namedetails', 'state', 'lat', 'lon', 'area_in_sq_km',
            'name_for_changeset', 'name_for_change_comment', 'bbox',
        ]
        out = {key: getattr(self, key) for key in keys}
        out['added'] = str(self.added)
        if include_geom:
            out['geom'] = json.loads(self.geojson)

        items = []
        for item in self.items:
            if not item.sitelinks():
                continue
            cur = {
                'labels': item.labels,
                'qid': item.qid,
                'url': item.wikidata_uri,
                'item_identifiers': item.get_item_identifiers(),
                'names': item.names(),
                'sitelinks': item.sitelinks(),
                'location': item.get_lat_lon(),
            }
            if item.categories:
                cur['categories'] = item.categories

            matches = [{'osm_type': m.osm_type,
                        'osm_id': m.osm_id,
                        'dist': m.dist,
                        'label': m.label} for m in item.candidates]
            if matches:
                cur['matches'] = matches
            items.append(cur)
        out['items'] = items

        return out

    def refresh_nominatim(self):
        hit = nominatim.reverse(self.osm_type, self.osm_id)
        self.update_from_nominatim(hit)
        session.commit()

    def is_in(self):
        if self.overpass_is_in:
            return self.overpass_is_in

        # self.overpass_is_in = overpass.is_in(self.overpass_type, self.osm_id)
        self.overpass_is_in = overpass.is_in_lat_lon(self.lat, self.lon)
        if self.overpass_is_in:
            session.commit()
        return self.overpass_is_in

    def suggest_larger_areas(self):
        ret = []
        for e in reversed(self.is_in() or []):
            osm_type, osm_id, bounds = e['type'], e['id'], e['bounds']
            if osm_type == self.osm_type and osm_id == self.osm_id:
                continue

            box = func.ST_MakeEnvelope(bounds['minlon'], bounds['minlat'],
                                       bounds['maxlon'], bounds['maxlat'],
                                       4326)
            q = func.ST_Area(box.cast(Geography))
            bbox_area = session.query(q).scalar()
            area_in_sq_km = bbox_area / (1000 * 1000)

            if area_in_sq_km < 10 or area_in_sq_km > 40_000:
                continue
            place = Place.from_osm(osm_type, osm_id)
            if not place:
                continue
            place.admin_level = ((e['tags'].get('admin_level') or None)
                                 if 'tags' in e else None)
            ret.append(place)

        ret.sort(key=lambda place: place.area_in_sq_km)
        return ret

    def get_candidate_items(self):
        items = self.items_with_candidates()

        if self.existing_wikidata:
            existing = {qid: set(tuple(i) for i in osm_list)
                        for qid, osm_list in self.existing_wikidata.items()}
        else:
            existing = {}

        items = [item for item in items
                 if item.qid not in existing
                 and all('wikidata' not in c.tags for c in item.candidates)]

        need_commit = False
        for item in items:
            for c in item.candidates:
                if c.set_match_detail():
                    need_commit = True
        if need_commit:
            session.commit()

        return items
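# End-to-end sketch (not in the source) of driving the pipeline above: fetch
# the place from Nominatim, then let ``do_match`` walk the state machine from
# item loading through 'ready'. The OSM relation id is illustrative only.
def _example_match_place():
    place = Place.from_osm('relation', 62422)  # hypothetical relation id
    if place and place.matcher_allowed:
        place.do_match(debug=False)
    return place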
def calculate_summary(self, product_name: str, time: Range) -> TimePeriodOverview:
    """Create a summary of the given product/time range."""
    log = self.log.bind(product_name=product_name, time=time)
    log.debug("summary.query")

    begin_time, end_time, where_clause = self._where(product_name, time)
    select_by_srid = (
        select((
            func.ST_SRID(DATASET_SPATIAL.c.footprint).label("srid"),
            func.count().label("dataset_count"),
            func.ST_Transform(
                func.ST_Union(DATASET_SPATIAL.c.footprint),
                self._target_srid(),
                type_=Geometry(),
            ).label("footprint_geometry"),
            func.sum(DATASET_SPATIAL.c.size_bytes).label("size_bytes"),
            func.max(DATASET_SPATIAL.c.creation_time).label(
                "newest_dataset_creation_time"),
        ))
        .where(where_clause)
        .group_by("srid")
        .alias("srid_summaries")
    )

    # Union all srid groups into one summary.
    result = self._engine.execute(
        select((
            func.sum(select_by_srid.c.dataset_count).label("dataset_count"),
            func.array_agg(select_by_srid.c.srid).label("srids"),
            func.sum(select_by_srid.c.size_bytes).label("size_bytes"),
            func.ST_Union(
                func.ST_Buffer(select_by_srid.c.footprint_geometry, 0),
                type_=Geometry(srid=self._target_srid()),
            ).label("footprint_geometry"),
            func.max(select_by_srid.c.newest_dataset_creation_time).label(
                "newest_dataset_creation_time"),
            func.now().label("summary_gen_time"),
        )))

    rows = result.fetchall()
    log.debug("summary.query.done", srid_rows=len(rows))

    assert len(rows) == 1
    row = dict(rows[0])
    row["dataset_count"] = int(row["dataset_count"]) if row["dataset_count"] else 0
    if row["footprint_geometry"] is not None:
        row["footprint_crs"] = self._get_srid_name(row["footprint_geometry"].srid)
        row["footprint_geometry"] = geo_shape.to_shape(row["footprint_geometry"])
    else:
        row["footprint_crs"] = None
    row["crses"] = None
    if row["srids"] is not None:
        row["crses"] = {self._get_srid_name(s) for s in row["srids"]}
    del row["srids"]

    # Convert from Python Decimal
    if row["size_bytes"] is not None:
        row["size_bytes"] = int(row["size_bytes"])

    has_data = row["dataset_count"] > 0

    log.debug("counter.calc")

    # Initialise all requested days as zero
    day_counts = Counter({
        d.date(): 0
        for d in pd.date_range(begin_time, end_time, closed="left")
    })
    region_counts = Counter()
    if has_data:
        day_counts.update(
            Counter({
                day.date(): count
                for day, count in self._engine.execute(
                    select([
                        func.date_trunc(
                            "day",
                            DATASET_SPATIAL.c.center_time.op("AT TIME ZONE")(
                                self.grouping_time_zone),
                        ).label("day"),
                        func.count(),
                    ]).where(where_clause).group_by("day"))
            }))
        region_counts = Counter({
            item: count
            for item, count in self._engine.execute(
                select([
                    DATASET_SPATIAL.c.region_code.label("region_code"),
                    func.count(),
                ]).where(where_clause).group_by("region_code"))
        })

    summary = TimePeriodOverview(
        **row,
        timeline_period="day",
        time_range=Range(begin_time, end_time),
        timeline_dataset_counts=day_counts,
        region_dataset_counts=region_counts,
        # TODO: filter invalid from the counts?
        footprint_count=row["dataset_count"] or 0,
    )

    log.debug(
        "summary.calc.done",
        dataset_count=summary.dataset_count,
        footprints_missing=summary.dataset_count - summary.footprint_count,
    )
    return summary
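# For readers, an assumed plain-SQL rendering of the two-stage aggregation
# above: footprints are unioned per source SRID and transformed to the target
# SRID first, then the per-SRID unions are cleaned with ST_Buffer(..., 0) and
# unioned again into a single footprint.
#
#   WITH srid_summaries AS (
#       SELECT ST_SRID(footprint) AS srid,
#              count(*)           AS dataset_count,
#              ST_Transform(ST_Union(footprint), :target_srid) AS footprint_geometry
#       FROM dataset_spatial
#       WHERE <product/time clause>
#       GROUP BY srid
#   )
#   SELECT sum(dataset_count),
#          ST_Union(ST_Buffer(footprint_geometry, 0))
#   FROM srid_summaries;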
def get_collection_items(
    collection_id=None,
    roles=None,
    item_id=None,
    bbox=None,
    datetime=None,
    ids=None,
    collections=None,
    intersects=None,
    page=1,
    limit=10,
    query=None,
    **kwargs,
) -> Pagination:
    """Retrieve a list of collection items based on filters.

    :param collection_id: Single Collection ID to include in the search for items.
                          Only Items in the provided Collection will be searched, defaults to None
    :type collection_id: str, optional
    :param item_id: item identifier, defaults to None
    :type item_id: str, optional
    :param bbox: bounding box for intersection [west, north, east, south], defaults to None
    :type bbox: list, optional
    :param datetime: Single date+time, or a range ('/' separator), formatted to RFC 3339, section 5.6.
                     Use double dots '..' for open date ranges, defaults to None.
                     If the start or end date of an image generated by a temporal composition
                     intersects the given datetime or range it will be included in the result.
    :type datetime: str, optional
    :param ids: Array of Item ids to return. All other filter parameters that further
                restrict the number of search results are ignored, defaults to None
    :type ids: list, optional
    :param collections: Array of Collection IDs to include in the search for items.
                        Only Items in one of the provided Collections will be searched, defaults to None
    :type collections: list, optional
    :param intersects: Searches items by performing intersection between their geometry
                       and provided GeoJSON geometry. All GeoJSON geometry types must be
                       supported, defaults to None
    :type intersects: dict, optional
    :param page: The page offset of results, defaults to 1
    :type page: int, optional
    :param limit: The maximum number of results to return (page size), defaults to 10
    :type limit: int, optional
    :return: list of collection items
    :rtype: Pagination
    """
    columns = [
        func.concat(Collection.name, "-", Collection.version).label("collection"),
        Collection.collection_type,
        Collection._metadata.label("meta"),
        Item._metadata.label("item_meta"),
        Item.name.label("item"),
        Item.id,
        Item.collection_id,
        Item.start_date.label("start"),
        Item.end_date.label("end"),
        Item.assets,
        Item.created,
        Item.updated,
        cast(Item.cloud_cover, Float).label("cloud_cover"),
        func.ST_AsGeoJSON(Item.geom).label("geom"),
        func.ST_XMin(Item.geom).label("xmin"),
        func.ST_XMax(Item.geom).label("xmax"),
        func.ST_YMin(Item.geom).label("ymin"),
        func.ST_YMax(Item.geom).label("ymax"),
        Tile.name.label("tile"),
    ]

    if roles is None:
        roles = []

    where = [
        Collection.id == Item.collection_id,
        or_(Collection.is_public.is_(True),
            Collection.id.in_([int(r.split(":")[0]) for r in roles])),
    ]

    collections_where = _where_collections(collection_id, collections)
    collections_where.append(
        or_(Collection.is_public.is_(True),
            Collection.id.in_([int(r.split(":")[0]) for r in roles])))

    outer_join = [(Tile, [Item.tile_id == Tile.id])]
    _geom_tables = []
    _collections = Collection.query().filter(*collections_where).all()
    if bbox or intersects:
        grids = GridRefSys.query().filter(
            GridRefSys.id.in_([c.grid_ref_sys_id for c in _collections])).all()

        for grid in grids:
            geom_table = grid.geom_table
            if geom_table is None:
                continue
            _geom_tables.append(geom_table)

    if ids is not None:
        if isinstance(ids, str):
            ids = ids.split(",")
        where += [Item.name.in_(ids)]
    else:
        where += _where_collections(collection_id, collections)

        if item_id is not None:
            where += [Item.name.like(item_id)]

        if query:
            filters = create_query_filter(query)
            if filters:
                where += filters

        if intersects is not None:
            # Intersect with the native grid if there is one
            geom_expr = func.ST_GeomFromGeoJSON(str(intersects))
            grids_where, joins = intersect_grids(geom_expr, geom_tables=_geom_tables)

            where += grids_where
            outer_join += joins
        elif bbox is not None:
            try:
                if isinstance(bbox, str):
                    bbox = bbox.split(",")
                bbox = [float(x) for x in bbox]

                if bbox[0] == bbox[2] or bbox[1] == bbox[3]:
                    raise InvalidBoundingBoxError("")

                geom_expr = func.ST_MakeEnvelope(bbox[0], bbox[1], bbox[2], bbox[3],
                                                 func.ST_SRID(Item.geom))
                grid_where, joins = intersect_grids(geom_expr, geom_tables=_geom_tables)

                where += grid_where
                outer_join += joins
            except (ValueError, InvalidBoundingBoxError):
                abort(400, f"'{bbox}' is not a valid bbox.")

        if datetime is not None:
            if "/" in datetime:
                matches_open = ("..", "")
                time_start, time_end = datetime.split("/")
                if time_start in matches_open:  # open start
                    date_filter = [or_(Item.start_date <= time_end,
                                       Item.end_date <= time_end)]
                elif time_end in matches_open:  # open end
                    date_filter = [or_(Item.start_date >= time_start,
                                       Item.end_date >= time_start)]
                else:  # closed range
                    date_filter = [
                        or_(
                            and_(Item.start_date >= time_start, Item.start_date <= time_end),
                            and_(Item.end_date >= time_start, Item.end_date <= time_end),
                            and_(Item.start_date < time_start, Item.end_date > time_end),
                        )
                    ]
            else:
                date_filter = [and_(Item.start_date <= datetime,
                                    Item.end_date >= datetime)]
            where += date_filter

    query = session.query(*columns)
    for entity, join_conditions in outer_join:
        query = query.outerjoin(entity, *join_conditions)

    try:
        query = query.filter(*where).order_by(Item.start_date.desc(), Item.id)
        result = query.paginate(page=int(page),
                                per_page=int(limit),
                                error_out=False,
                                max_per_page=BDC_STAC_MAX_LIMIT)
        return result
    except Exception as err:
        msg = str(err)
        if hasattr(err, "orig"):
            msg = str(err.orig)
        abort(400, msg.rstrip())
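# Illustrative call (the collection name is hypothetical): page through items
# whose geometry intersects a GeoJSON polygon, letting the native grid tables
# from ``intersect_grids`` narrow the search.
#
#   pagination = get_collection_items(
#       collections='S2-16D-2',
#       intersects={'type': 'Polygon',
#                   'coordinates': [[[-54, -12], [-53, -12], [-53, -11],
#                                    [-54, -11], [-54, -12]]]},
#       datetime='2021-01-01/2021-03-31',
#       limit=50,
#   )
#   for item in pagination.items:
#       print(item.item, item.start, item.tile)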
def check_scenes(cls, collections: str, start_date: datetime, end_date: datetime,
                 catalog: str = None, dataset: str = None,
                 grid: str = None, tiles: list = None, bbox: list = None,
                 catalog_kwargs=None, only_tiles=False):
    """Check for the scenes in remote provider and compare with the Collection Builder."""
    bbox_list = []
    if grid and tiles:
        grid = GridRefSys.query().filter(GridRefSys.name == grid).first_or_404(
            f'Grid "{grid}" not found.')
        geom_table = grid.geom_table
        rows = db.session.query(
            geom_table.c.tile,
            func.ST_Xmin(func.ST_Transform(geom_table.c.geom, 4326)).label('xmin'),
            func.ST_Ymin(func.ST_Transform(geom_table.c.geom, 4326)).label('ymin'),
            func.ST_Xmax(func.ST_Transform(geom_table.c.geom, 4326)).label('xmax'),
            func.ST_Ymax(func.ST_Transform(geom_table.c.geom, 4326)).label('ymax'),
        ).filter(geom_table.c.tile.in_(tiles)).all()
        for row in rows:
            bbox_list.append((row.tile, (row.xmin, row.ymin, row.xmax, row.ymax)))
    else:
        bbox_list.append(('', bbox))

    instance, provider = get_provider(catalog)

    collection_map = dict()
    collection_ids = list()

    for _collection in collections:
        collection, version = _collection.split('-')

        collection = Collection.query().filter(
            Collection.name == collection,
            Collection.version == version
        ).first_or_404(f'Collection "{collection}-{version}" not found.')

        collection_ids.append(collection.id)
        collection_map[_collection] = collection

    options = dict(start_date=start_date, end_date=end_date)
    if catalog_kwargs:
        options.update(catalog_kwargs)

    redis = current_app.redis

    output = dict(
        collections={cname: dict(total_scenes=0, total_missing=0, missing_external=[])
                     for cname in collections}
    )

    items = {cid: set() for cid in collection_ids}

    external_scenes = set()

    for tile, _bbox in bbox_list:
        with redis.pipeline() as pipe:
            if only_tiles:
                entry = tile
                options['tile'] = tile
            else:
                options['bbox'] = _bbox
                entry = _bbox

            periods = _generate_periods(start_date.replace(tzinfo=None),
                                        end_date.replace(tzinfo=None))

            for period_start, period_end in periods:
                _items = db.session.query(Item.name, Item.collection_id).filter(
                    Item.collection_id.in_(collection_ids),
                    func.ST_Intersects(
                        func.ST_MakeEnvelope(*_bbox, func.ST_SRID(Item.geom)),
                        Item.geom
                    ),
                    or_(
                        and_(Item.start_date >= period_start, Item.start_date <= period_end),
                        and_(Item.end_date >= period_start, Item.end_date <= period_end),
                        and_(Item.start_date < period_start, Item.end_date > period_end),
                    )
                ).order_by(Item.name).all()

                for item in _items:
                    items[item.collection_id].add(item.name)

                options['start_date'] = period_start.strftime('%Y-%m-%d')
                options['end_date'] = period_end.strftime('%Y-%m-%d')

                key = (f'scenes:{catalog}:{dataset}:'
                       f'{period_start.strftime("%Y%m%d")}_{period_end.strftime("%Y%m%d")}_{entry}')

                pipe.get(key)
                provider_scenes = []

                if not redis.exists(key):
                    provider_scenes = provider.search(dataset, **options)
                    provider_scenes = [s.scene_id for s in provider_scenes]

                    pipe.set(key, json.dumps(provider_scenes))

                external_scenes = external_scenes.union(set(provider_scenes))

            cached_scenes = pipe.execute()

            for cache in cached_scenes:
                # pipe.execute() yields True for the SET commands; only parse
                # entries that carry a cached JSON payload.
                if cache is not None and cache is not True:
                    external_scenes = external_scenes.union(set(json.loads(cache)))

    output['total_external'] = len(external_scenes)
    for _collection_name, _collection in collection_map.items():
        _items = set(items[_collection.id])
        diff = list(external_scenes.difference(_items))
        output['collections'][_collection_name]['total_scenes'] = len(_items)
        output['collections'][_collection_name]['total_missing'] = len(diff)
        output['collections'][_collection_name]['missing_external'] = diff

        for cname, _internal_collection in collection_map.items():
            if cname != _collection_name:
                diff = list(_items.difference(set(items[_internal_collection.id])))
                output['collections'][_collection_name][f'total_missing_{cname}'] = len(diff)
                output['collections'][_collection_name][f'missing_{cname}'] = diff

    return output
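# Illustrative invocation (the controller class, catalog, dataset, grid and
# tile names are all hypothetical): compare one quarter of a grid-tiled
# collection against the remote provider, then inspect the report.
#
#   report = CollectionController.check_scenes(
#       collections=['S2_L2A-1'],
#       start_date=datetime(2021, 1, 1),
#       end_date=datetime(2021, 3, 31),
#       catalog='SENTINEL_HUB',
#       dataset='S2MSI2A',
#       grid='BDC_SM_V2',
#       tiles=['021027', '021028'],
#   )
#   print(report['total_external'], report['collections'])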
def get_collection_items(collection_id=None, roles=[], item_id=None, bbox=None,
                         time=None, ids=None, collections=None, cubes=None,
                         intersects=None, page=1, limit=10, query=None, **kwargs):
    """Retrieve a list of collection items based on filters.

    :param collection_id: Single Collection ID to include in the search for items.
                          Only Items in the provided Collection will be searched, defaults to None
    :type collection_id: str, optional
    :param item_id: item identifier, defaults to None
    :type item_id: str, optional
    :param bbox: bounding box for intersection [west, north, east, south], defaults to None
    :type bbox: list, optional
    :param time: Single date+time, or a range ('/' separator), formatted to RFC 3339,
                 section 5.6, defaults to None
    :type time: str, optional
    :param ids: Array of Item ids to return. All other filter parameters that further
                restrict the number of search results are ignored, defaults to None
    :type ids: list, optional
    :param collections: Array of Collection IDs to include in the search for items.
                        Only Items in one of the provided Collections will be searched, defaults to None
    :type collections: list, optional
    :param cubes: Bool indicating if only cubes should be returned, defaults to None
    :type cubes: bool, optional
    :param intersects: Searches items by performing intersection between their geometry
                       and provided GeoJSON geometry. All GeoJSON geometry types must be
                       supported, defaults to None
    :type intersects: dict, optional
    :param page: The page offset of results, defaults to 1
    :type page: int, optional
    :param limit: The maximum number of results to return (page size), defaults to 10
    :type limit: int, optional
    :return: list of collection items
    :rtype: list
    """
    columns = [
        Collection.name.label('collection'),
        Item.name.label('item'),
        Item.start_date.label('start'),
        Item.end_date.label('end'),
        Item.assets,
        func.ST_AsGeoJSON(Item.geom).label('geom'),
        func.Box2D(Item.geom).label('bbox'),
        Tile.name.label('tile')
    ]

    where = [
        Collection.id == Item.collection_id,
        Item.tile_id == Tile.id,
        or_(Collection.is_public.is_(True),
            Collection.id.in_([int(r.split(':')[0]) for r in roles]))
    ]

    if ids is not None:
        where += [Item.id.in_(ids.split(','))]
    elif item_id is not None:
        where += [Item.id.like(item_id)]
    else:
        if collections is not None:
            where += [Collection.name.in_(collections.split(','))]
        elif collection_id is not None:
            where += [Collection.name.like(collection_id)]

    if intersects is not None:
        where += [func.ST_Intersects(func.ST_GeomFromGeoJSON(str(intersects)),
                                     Item.geom)]

    if query:
        filters = create_query_filter(query)
        if filters:
            where += filters

    if bbox is not None:
        try:
            split_bbox = [float(x) for x in bbox.split(',')]

            where += [
                func.ST_Intersects(
                    func.ST_MakeEnvelope(split_bbox[0], split_bbox[1],
                                         split_bbox[2], split_bbox[3],
                                         func.ST_SRID(Item.geom)),
                    Item.geom)
            ]
        except ValueError:
            raise InvalidBoundingBoxError(f"'{bbox}' is not a valid bbox.")

    if time is not None:
        if "/" in time:
            time_start, time_end = time.split("/")
            time_end = datetime.fromisoformat(time_end)
            where += [or_(Item.end_date <= time_end, Item.start_date <= time_end)]
        else:
            time_start = datetime.fromisoformat(time)
        where += [or_(Item.start_date >= time_start, Item.end_date >= time_start)]

    query = session.query(*columns).filter(*where).order_by(Item.start_date.desc())

    result = query.paginate(page=int(page),
                            per_page=int(limit),
                            error_out=False,
                            max_per_page=int(BDC_STAC_MAX_LIMIT))

    return result
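# Illustrative call for this older variant (collection name hypothetical):
# ``time`` takes an RFC 3339 instant or range, and ``bbox`` a comma-separated
# string in [west, south, east, north] order.
#
#   get_collection_items(collection_id='CB4_64_16D_STK',
#                        bbox='-54,-12,-53,-11',
#                        time='2020-01-01/2020-06-30',
#                        limit=20)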