def on_start(self):
    """Start the service and attach the datastore discovery backend.

    Runs the parent ``on_start`` first, then reads the container's default
    datastore server from the configuration. Only ``postgresql`` is
    accepted; for it a ``DatastoreDiscovery`` bound to this service is
    stored on ``self.ds_discovery``.

    @throws Exception if the configured datastore is not postgresql
    """
    super(DiscoveryService, self).on_start()

    default_server = CFG.get_safe('container.datastore.default_server')
    if default_server == "postgresql":
        self.ds_discovery = DatastoreDiscovery(self)
        return

    # Any other (or missing) datastore setting is rejected.
    raise Exception("Discovery service does not support datastores other than postgresql")
def on_start(self):  # pragma no cover
    """Service start hook: verify the datastore and create the query backend.

    After delegating to the parent ``on_start``, the configured default
    datastore server is checked. The service refuses to start unless it is
    ``postgresql``; otherwise ``self.ds_discovery`` is set to a
    ``DatastoreDiscovery`` constructed with this service instance.

    @throws Exception when the default datastore is anything but postgresql
    """
    super(DiscoveryService, self).on_start()

    datastore_kind = CFG.get_safe('container.datastore.default_server')
    supported = datastore_kind == "postgresql"
    if not supported:
        raise Exception("Discovery service does not support datastores other than postgresql")

    self.ds_discovery = DatastoreDiscovery(self)
def on_start(self):  # pragma no cover
    """Service start hook: load search configuration and set up helpers.

    Reads the ElasticSearch switch and connection settings from the
    configuration, creates the event publisher for
    ``SearchBufferExceededEvent`` events and records the configured default
    datastore. ``self.ds_discovery`` stays ``None`` for ``couchdb`` (also
    the default when nothing is configured); for any other datastore a
    ``DatastoreDiscovery`` bound to this service is created.
    """
    super(DiscoveryService, self).on_start()

    # ElasticSearch settings
    self.use_es = CFG.get_safe('system.elasticsearch', False)
    self.elasticsearch_host = CFG.get_safe('server.elasticsearch.host', 'localhost')
    self.elasticsearch_port = CFG.get_safe('server.elasticsearch.port', '9200')

    self.ep = EventPublisher(event_type='SearchBufferExceededEvent')

    # Cutoff consulted by the catalog matching heuristic of the service
    self.heuristic_cutoff = 4

    # Datastore backed discovery is only available for non-couchdb datastores
    self.cfg_datastore = CFG.get_safe('container.datastore.default_server', "couchdb")
    self.ds_discovery = None
    if self.cfg_datastore == "couchdb":
        return
    self.ds_discovery = DatastoreDiscovery(self)
class DiscoveryService(BaseDiscoveryService):
    """Discovery service implementation on top of the datastore query backend.

    Offers structured and DSL (string) queries that are executed by a
    ``DatastoreDiscovery`` instance, breadth-first traversals of the
    association graph through the resource registry, and management of
    View resources.
    """

    MAX_SEARCH_RESULTS = CFG.get_safe('service.discovery.max_search_results', 250)

    def on_start(self):  # pragma no cover
        """Start the service; only the postgresql datastore is supported.

        @throws Exception if the configured default datastore is not postgresql
        """
        super(DiscoveryService, self).on_start()

        cfg_datastore = CFG.get_safe('container.datastore.default_server')
        if cfg_datastore != "postgresql":
            raise Exception("Discovery service does not support datastores other than postgresql")

        self.ds_discovery = DatastoreDiscovery(self)

    #===================================================================
    # Query Methods
    #===================================================================

    def query(self, query=None, id_only=True):
        """Issue a query against the indexes as specified in the query, applying filters and operators accordingly.
        The query format is a structured dict.
        See the query format definition: https://confluence.oceanobservatories.org/display/CIDev/Discovery+Service+Query+Format

        @param query    dict
        @param id_only  bool
        @retval results list
        """
        validate_true(query, 'Invalid query')

        return self.request(query, id_only)

    def parse(self, search_request='', id_only=True):
        """Parses a given string request and assembles the query, processes the query and returns the results of the query.
        This is the primary means of interfacing with the search features in discovery.
        See the query language definition: https://confluence.oceanobservatories.org/display/CIDev/Discovery+Service+Query+Format

        @param search_request str
        @param id_only        bool
        @retval results       list
        """
        log.info("Search DSL: %s", search_request)
        query_request = self._parse_query_string(search_request)
        return self.request(query_request, id_only=id_only)

    def _parse_query_string(self, query_string):
        """Given a query string in Discovery service DSL, parse and return query structure"""
        parser = QueryLanguage()
        return parser.parse(query_string)

    def request(self, query=None, id_only=True):
        """Validate a query structure and execute it on the datastore backend.

        A query is accepted when it contains the key "QUERYEXP" (and a
        datastore discovery backend is set) or the key 'query'.

        @param query    dict
        @param id_only  bool
        @retval result of ds_discovery.execute_query
        @throws BadRequest if the query is empty or has neither accepted key
        """
        if not query:
            raise BadRequest('No request query provided')

        # "QUERYEXP" marks a datastore query expression
        is_datastore_query = "QUERYEXP" in query and self.ds_discovery
        if not is_datastore_query and 'query' not in query:
            raise BadRequest('Unsuported request. %s' % query)

        return self.ds_discovery.execute_query(query, id_only=id_only)

    #===================================================================
    # Special Query Methods
    #===================================================================

    def query_association(self, resource_id='', depth=0, id_only=False):
        """Find resources reachable from a resource, following associations from subject to object.

        @param resource_id str   Starting resource (required)
        @param depth       int   If non-zero, the traversal is bounded (limit depth-1); otherwise unbounded
        @param id_only     bool  Return ids instead of resource objects
        @retval list of resource ids or resources
        """
        validate_true(resource_id, 'Unspecified resource')
        if depth:
            resource_ids = self.iterative_traverse(resource_id, depth - 1)
        else:
            resource_ids = self.traverse(resource_id)
        return self._ids_or_resources(resource_ids, id_only)

    def query_owner(self, resource_id='', depth=0, id_only=False):
        """Find resources that reach a resource, following associations from object to subject.

        @param resource_id str   Starting resource (required)
        @param depth       int   If non-zero, the traversal is bounded (limit depth-1); otherwise unbounded
        @param id_only     bool  Return ids instead of resource objects
        @retval list of resource ids or resources
        """
        validate_true(resource_id, 'Unspecified resource')
        if depth:
            # The bounded search has to walk in the same (reverse) direction as
            # the unbounded one; the forward iterative_traverse used here
            # previously returned associated objects rather than owners.
            resource_ids = self.iterative_reverse_traverse(resource_id, depth - 1)
        else:
            resource_ids = self.reverse_traverse(resource_id)
        return self._ids_or_resources(resource_ids, id_only)

    def query_collection(self, collection_id='', id_only=False):
        """List the resources of a collection.

        @param collection_id str   Collection to list (required)
        @param id_only       bool  Return ids instead of resource objects
        @retval list of resource ids or resources
        """
        validate_true(collection_id, 'Unspecified collection id')
        resource_ids = self.clients.index_management.list_collection_resources(collection_id, id_only=True)
        if id_only:
            return resource_ids

        # One registry read per id, results in the order of the ids
        return [self.clients.resource_registry.read(rid) for rid in resource_ids]

    def _ids_or_resources(self, resource_ids, id_only):
        """Return the ids unchanged if id_only is set, otherwise read all resources in one call."""
        if id_only:
            return resource_ids
        if not isinstance(resource_ids, list):
            resource_ids = list(resource_ids)
        return self.clients.resource_registry.read_mult(resource_ids)

    #===================================================================
    # Association Graph Traversal
    #===================================================================

    def _object_ids(self, resource_ids):
        """Return the ids of all objects associated to the given subject ids."""
        if not isinstance(resource_ids, list):
            resource_ids = list(resource_ids)
        return self.clients.resource_registry.find_objects_mult(subjects=resource_ids, id_only=True)[0]

    def _subject_ids(self, resource_ids):
        """Return the ids of all subjects associated to the given object ids."""
        if not isinstance(resource_ids, list):
            resource_ids = list(resource_ids)
        return self.clients.resource_registry.find_subjects_mult(objects=resource_ids, id_only=True)[0]

    def _traverse_all(self, resource_id, edges):
        """Breadth-first expansion using edges() until a round finds nothing new."""
        visited_resources = deque(edges([resource_id]))
        traversal_queue = deque()
        while True:
            # First round expands everything found so far, later rounds only
            # the ids discovered in the previous round.
            sources = traversal_queue or deque(visited_resources)
            traversal_queue = deque()
            for e in edges(sources):
                if e not in visited_resources:
                    visited_resources.append(e)
                    traversal_queue.append(e)
            if not traversal_queue:
                break
        return list(visited_resources)

    def _traverse_limited(self, resource_id, limit, edges):
        """Breadth-first expansion using edges() for at most `limit` rounds."""
        visited_resources = deque(edges([resource_id]))
        while limit > 0:
            found_new = False
            # Every round expands from all ids visited so far; iterate over a
            # snapshot because visited_resources grows inside the loop.
            for e in edges(list(visited_resources)):
                if e not in visited_resources:
                    visited_resources.append(e)
                    found_new = True
            if not found_new:
                break
            limit -= 1
        return list(visited_resources)

    def traverse(self, resource_id=''):
        """Breadth-first traversal of the association graph for a specified resource.
        Follows associations from subject to object.

        @param resource_id str
        @retval resources list
        """
        return self._traverse_all(resource_id, self._object_ids)

    def reverse_traverse(self, resource_id=''):
        """Breadth-first traversal of the association graph for a specified resource.
        Follows associations from object to subject.

        @param resource_id str
        @retval resources list
        """
        return self._traverse_all(resource_id, self._subject_ids)

    def iterative_traverse(self, resource_id='', limit=-1):
        '''
        Iterative breadth first traversal of the resource associations
        (subject to object). The directly associated ids are always returned;
        `limit` is the maximum number of further expansion rounds, so the
        default of -1 performs none.
        '''
        return self._traverse_limited(resource_id, limit, self._object_ids)

    def iterative_reverse_traverse(self, resource_id='', limit=-1):
        '''
        Iterative breadth first traversal of the resource associations
        (object to subject). The directly associated ids are always returned;
        `limit` is the maximum number of further expansion rounds, so the
        default of -1 performs none.
        '''
        return self._traverse_limited(resource_id, limit, self._subject_ids)

    #===================================================================
    # View Management
    #===================================================================

    def create_view(self, view_name='', description='', fields=None, order=None, filters=''):
        """Creates a view which has the specified search fields, the order in which the search fields are presented
        to a query and a term filter.

        @param view_name   Name of the view
        @param description Simple descriptive sentence
        @param fields      Search fields
        @param order       List of fields to determine order of precedence in which the results are presented
        @param filters     Simple term filter
        @retval view_id    Id of the created View resource
        @throws BadRequest if a resource with that name exists or an ordering field is not a search field
        """
        res, _ = self.clients.resource_registry.find_resources(name=view_name, id_only=True)
        if len(res) > 0:
            raise BadRequest('The view resource with name: %s, already exists.' % view_name)

        #======================
        # Arg Validations
        #======================
        validate_is_instance(fields, list, 'Specified fields must be a list.')
        validate_true(len(fields) > 0, 'Specfied fields must be a list.')
        if order is not None:
            validate_is_instance(order, list, 'Specified order must be a list of fields')
            for field in order:
                if field not in fields:
                    raise BadRequest('The specified ordering field was not part of the search fields.')

        fields = set(fields)  # Convert fields to a set for aggregation across the catalogs

        #======================================================================================================
        # Priority Queue Index Matching
        #======================================================================================================
        pq = []  # Priority queue for matching, ordered by number of indexes
        catalog_id = None
        catalogs, _ = self.clients.resource_registry.find_resources(restype=RT.Catalog, id_only=False)
        for catalog in catalogs:
            if set(catalog.catalog_fields).issubset(fields):
                index_num = len(self.clients.catalog_management.list_indexes(catalog._id))
                heapq.heappush(pq, (index_num, catalog))

        if pq:
            # Reuse the matching catalog with the fewest indexes if it is below the cutoff
            weight, catalog = heapq.heappop(pq)
            # on_start of this class does not define heuristic_cutoff; default
            # to 4 when it is not provided elsewhere.
            # NOTE(review): confirm the intended default.
            if weight < getattr(self, 'heuristic_cutoff', 4):
                catalog_id = catalog._id

        if catalog_id is None:
            catalog_id = self.clients.catalog_management.create_catalog('%s_catalog' % view_name, keywords=list(fields))

        view_res = View(name=view_name, description=description)
        view_res.order = order
        view_res.filters = filters
        view_id, _ = self.clients.resource_registry.create(view_res)
        self.clients.resource_registry.create_association(subject=view_id, predicate=PRED.hasCatalog, object=catalog_id)
        return view_id

    def read_view(self, view_id=''):
        """Read and return the resource with the given id from the resource registry."""
        return self.clients.resource_registry.read(view_id)

    def update_view(self, view=None):
        """Update the given view resource in the resource registry; returns True."""
        self.clients.resource_registry.update(view)
        return True

    def delete_view(self, view_id=''):
        """Delete the associations that have the view as subject, then the view itself; returns True."""
        _, assocs = self.clients.resource_registry.find_objects_mult(subjects=[view_id])
        for assoc in assocs:
            self.clients.resource_registry.delete_association(assoc._id)
        self.clients.resource_registry.delete(view_id)
        return True
class DiscoveryService(BaseDiscoveryService): SEARCH_BUFFER_SIZE=CFG.get_safe('service.discovery.search_buffer_size', 1048576) MAX_SEARCH_RESULTS=CFG.get_safe('service.discovery.max_search_results', 250) """ class docstring """ def on_start(self): # pragma no cover super(DiscoveryService,self).on_start() self.use_es = CFG.get_safe('system.elasticsearch', False) self.elasticsearch_host = CFG.get_safe('server.elasticsearch.host','localhost') self.elasticsearch_port = CFG.get_safe('server.elasticsearch.port','9200') self.ep = EventPublisher(event_type = 'SearchBufferExceededEvent') self.heuristic_cutoff = 4 self.cfg_datastore = CFG.get_safe('container.datastore.default_server', "couchdb") self.ds_discovery = None if self.cfg_datastore != "couchdb": self.ds_discovery = DatastoreDiscovery(self) @staticmethod def es_cleanup(): es_host = CFG.get_safe('server.elasticsearch.host', 'localhost') es_port = CFG.get_safe('server.elasticsearch.port', '9200') es = ep.ElasticSearch( host=es_host, port=es_port, timeout=10 ) indexes = STD_INDEXES.keys() indexes.append('%s_resources_index' % get_sys_name().lower()) indexes.append('%s_events_index' % get_sys_name().lower()) for index in indexes: IndexManagementService._es_call(es.river_couchdb_delete,index) IndexManagementService._es_call(es.index_delete,index) #=================================================================== # Views #=================================================================== def create_view(self, view_name='', description='', fields=None, order=None, filters=''): """Creates a view which has the specified search fields, the order in which the search fields are presented to a query and a term filter. 
@param view_name Name of the view @param description Simple descriptive sentence @param fields Search fields @param order List of fields to determine order of precendence in which the results are presented @param filter Simple term filter @param view_name str @param description str @param fields list @param order list @param filters str @retval view_id str """ res, _ = self.clients.resource_registry.find_resources(name=view_name, id_only=True) if len(res) > 0: raise BadRequest('The view resource with name: %s, already exists.' % view_name) #====================== # Arg Validations #====================== validate_is_instance(fields,list, 'Specified fields must be a list.') validate_true(len(fields)>0, 'Specfied fields must be a list.') if order is not None: validate_is_instance(order,list, 'Specified order must be a list of fields') for field in order: if not field in fields: raise BadRequest('The specified ordering field was not part of the search fields.') fields = set(fields) # Convert fields to a set for aggregation across the catalogs #====================================================================================================== # Priorty Queue Index Matching #====================================================================================================== pq = [] # Priority queue for matching catalog_id = None catalogs, _ = self.clients.resource_registry.find_resources(restype=RT.Catalog, id_only=False) for catalog in catalogs: if set(catalog.catalog_fields).issubset(fields): index_num = len(self.clients.catalog_management.list_indexes(catalog._id)) heapq.heappush(pq, (index_num,catalog)) if pq: weight, catalog = heapq.heappop(pq) if weight < self.heuristic_cutoff: catalog_id = catalog._id if catalog_id is None: catalog_id = self.clients.catalog_management.create_catalog('%s_catalog'% view_name, keywords=list(fields)) view_res = View(name=view_name, description=description) view_res.order = order view_res.filters = filters view_id, _ = 
self.clients.resource_registry.create(view_res) self.clients.resource_registry.create_association(subject=view_id, predicate=PRED.hasCatalog,object=catalog_id) return view_id def read_view(self, view_id=''): return self.clients.resource_registry.read(view_id) def update_view(self, view=None): self.clients.resource_registry.update(view) return True def delete_view(self, view_id=''): _, assocs = self.clients.resource_registry.find_objects_mult(subjects=[view_id]) for assoc in assocs: self.clients.resource_registry.delete_association(assoc._id) self.clients.resource_registry.delete(view_id) return True def list_catalogs(self, view_id=''): catalogs, _ = self.clients.resource_registry.find_objects(subject=view_id, object_type=RT.Catalog, predicate=PRED.hasCatalog, id_only=True) return catalogs #=================================================================== # Helper Methods #=================================================================== def _match_query_sources(self, source_name): index = self.clients.index_management.find_indexes(source_name) if index: return index _, resources = self.clients.resource_registry.find_resources(name=source_name, id_only=True) for res in resources: t = res['type'] if t == 'View' or t == 'ElasticSearchIndex' or t == 'Catalog': return res['id'] return None #=================================================================== # Query Methods #=================================================================== def query(self, query=None, id_only=True): validate_true(query,'Invalid query') return self.request(query, id_only) def query_couch(self, index_id='', key='', limit=0, offset=0, id_only=True): raise BadRequest('Not Implemented Yet') # cc = self.container # # datastore_name = source.datastore_name # db = cc.datastore_manager.get_datastore(datastore_name) # view_name = source.view_name # opts = DotDict(include_docs=True) # opts.start_key = [query.query] # opts.end_key = [query.query,{}] # if query.results: # opts.limit = 
query.results # if query.offset: # opts.skip = query.offset # # return db.query_view(view_name,opts=opts) def traverse(self, resource_id=''): """Breadth-first traversal of the association graph for a specified resource. @param resource_id str @retval resources list """ def edges(resource_ids=[]): if not isinstance(resource_ids, list): resource_ids = list(resource_ids) return self.clients.resource_registry.find_objects_mult(subjects=resource_ids,id_only=True)[0] visited_resources = deque(edges([resource_id])) traversal_queue = deque() done = False t = None while not done: t = traversal_queue or deque(visited_resources) traversal_queue = deque() for e in edges(t): if not e in visited_resources: visited_resources.append(e) traversal_queue.append(e) if not len(traversal_queue): done = True return list(visited_resources) def reverse_traverse(self, resource_id=''): """Breadth-first traversal of the association graph for a specified resource. @param resource_id str @retval resources list """ def edges(resource_ids=[]): if not isinstance(resource_ids,list): resource_ids = list(resource_ids) return self.clients.resource_registry.find_subjects_mult(objects=resource_ids,id_only=True)[0] visited_resources = deque(edges([resource_id])) traversal_queue = deque() done = False t = None while not done: t = traversal_queue or deque(visited_resources) traversal_queue = deque() for e in edges(t): if not e in visited_resources: visited_resources.append(e) traversal_queue.append(e) if not len(traversal_queue): done = True return list(visited_resources) def iterative_traverse(self, resource_id='', limit=-1): ''' Iterative breadth first traversal of the resource associations ''' #-------------------------------------------------------------------------------- # Retrieve edges for this resource #-------------------------------------------------------------------------------- def edges(resource_ids=[]): if not isinstance(resource_ids, list): resource_ids = list(resource_ids) return 
self.clients.resource_registry.find_objects_mult(subjects=resource_ids,id_only=True)[0] gathered = deque() visited_resources = deque(edges([resource_id])) while limit>0: t = gathered or deque(visited_resources) for e in edges(t): if not e in visited_resources: visited_resources.append(e) gathered.append(e) if not len(gathered): break t = deque(gathered) gathered = deque() limit -= 1 return list(visited_resources) def iterative_reverse_traverse(self, resource_id='', limit=-1): ''' Iterative breadth first traversal of the resource associations ''' #-------------------------------------------------------------------------------- # Retrieve edges for this resource #-------------------------------------------------------------------------------- def edges(resource_ids=[]): if not isinstance(resource_ids, list): resource_ids = list(resource_ids) return self.clients.resource_registry.find_subjects_mult(objects=resource_ids,id_only=True)[0] gathered = deque() visited_resources = deque(edges([resource_id])) while limit>0: t = gathered or deque(visited_resources) for e in edges(t): if not e in visited_resources: visited_resources.append(e) gathered.append(e) if not len(gathered): break t = deque(gathered) gathered = deque() limit -= 1 return list(visited_resources) def intersect(self, left=[], right=[]): """The intersection between two sets of resources. 
@param left list @param right list @retval result list """ return list(set(left).intersection(right)) def union(self, left=[], right=[]): return list(set(left).union(right)) def parse(self, search_request='', id_only=True): parser = QueryLanguage() query_request = parser.parse(search_request) return self.request(query_request, id_only=id_only) def query_request(self, query=None, limit=0, id_only=False): validate_is_instance(query,dict, 'invalid query') #--------------------------------------------- # Term Search #--------------------------------------------- if QueryLanguage.query_is_term_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id= source_id, field = query['field'], value = query['value'], limit = limit, id_only = id_only ) if query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_term(**kwargs) #--------------------------------------------- # Fuzzy searching (phrases and such) #--------------------------------------------- elif QueryLanguage.query_is_fuzzy_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id= source_id, fuzzy = True, field = query['field'], value = query['fuzzy'], limit = limit, id_only = id_only ) if query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_term(**kwargs) #--------------------------------------------- # Match searching (phrases and such) #--------------------------------------------- elif QueryLanguage.query_is_match_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id= source_id, match = True, field = query['field'], value = query['match'], limit = limit, id_only = id_only ) if 
query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_term(**kwargs) #--------------------------------------------- # Association Search #--------------------------------------------- elif QueryLanguage.query_is_association_search(query): kwargs = dict( resource_id = query['association'], id_only = id_only ) if query.get('depth'): kwargs['depth'] = query['depth'] return self.query_association(**kwargs) elif QueryLanguage.query_is_owner_search(query): kwargs = dict( resource_id = query['owner'], id_only = id_only ) if query.get('depth'): kwargs['depth'] = query['depth'] return self.query_owner(**kwargs) #--------------------------------------------- # Range Search #--------------------------------------------- elif QueryLanguage.query_is_range_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id = source_id, field = query['field'], limit = limit, id_only = id_only ) if get_safe(query,'range.from') is not None: kwargs['from_value'] = query['range']['from'] if get_safe(query,'range.to') is not None: kwargs['to_value'] = query['range']['to'] if query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_range(**kwargs) #--------------------------------------------- # Time Search #--------------------------------------------- elif QueryLanguage.query_is_time_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id = source_id, field = query['field'], limit = limit, id_only = id_only ) if get_safe(query,'time.from') is not None: kwargs['from_value'] = query['time']['from'] if get_safe(query,'time.to') is not None: kwargs['to_value'] = query['time']['to'] if query.get('limit'): kwargs['limit'] = 
query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_time(**kwargs) #--------------------------------------------- # Time Bounds Search #--------------------------------------------- elif QueryLanguage.query_is_time_bounds_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id = source_id, field = query['field'], limit = limit, id_only = id_only ) if get_safe(query,'time_bounds.from') is not None: kwargs['from_value'] = query['time_bounds']['from'] if get_safe(query,'time_bounds.to') is not None: kwargs['to_value'] = query['time_bounds']['to'] if query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_time_bounds(**kwargs) #--------------------------------------------- # Vertical Bounds Search #--------------------------------------------- elif QueryLanguage.query_is_vertical_bounds_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id = source_id, field = query['field'], limit = limit, id_only = id_only ) if get_safe(query,'vertical_bounds.from') is not None: kwargs['from_value'] = query['vertical_bounds']['from'] if get_safe(query,'vertical_bounds.to') is not None: kwargs['to_value'] = query['vertical_bounds']['to'] if query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_vertical_bounds(**kwargs) #--------------------------------------------- # Collection Search #--------------------------------------------- elif QueryLanguage.query_is_collection_search(query): return self.query_collection( collection_id = query['collection'], id_only = id_only ) #--------------------------------------------- # Geo 
Distance Search #--------------------------------------------- elif QueryLanguage.query_is_geo_distance_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id = source_id, field = query['field'], origin = [query['lon'], query['lat']], distance = query['dist'], units = query['units'], id_only = id_only ) if query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_geo_distance(**kwargs) #--------------------------------------------- # Geo Bounding Box Search #--------------------------------------------- elif QueryLanguage.query_is_geo_bbox_search(query): source_id = self._match_query_sources(query['index']) or query['index'] kwargs = dict( source_id = source_id, field = query['field'], top_left = query['top_left'], bottom_right = query['bottom_right'], limit = limit, id_only = id_only, ) if query.get('limit'): kwargs['limit'] = query['limit'] if query.get('order'): kwargs['order'] = query['order'] if query.get('offset'): kwargs['offset'] = query['offset'] return self.query_geo_bbox(**kwargs) #@todo: query for couch raise BadRequest('improper query: %s' % query) def _multi(self, cb,source, *args, **kwargs): ''' Manage the different collections of indexes for queries, views, catalogs Expand the resource into it's components and call the callback for each subcategory ''' if isinstance(source, View): catalogs = self.list_catalogs(source._id) result_queue = list() for catalog in catalogs: result_queue.extend(cb(catalog, *args, **kwargs)) if kwargs.has_key('limit') and kwargs['limit']: return result_queue[:kwargs['limit']] return result_queue if isinstance(source, Catalog): indexes = self.clients.catalog_management.list_indexes(source._id, id_only=True) result_queue = list() for index in indexes: result_queue.extend(cb(index, *args, **kwargs)) if kwargs.has_key('limit') and 
kwargs['limit']: return result_queue[:kwargs['limit']] return result_queue return None def query_term(self, source_id='', field='', value='', fuzzy=False, match=False, order=None, limit=0, offset=0, id_only=False): ''' Elasticsearch Query against an index > discovery.query_index('indexID', 'name', '*', order={'name':'asc'}, limit=20, id_only=False) ''' if not self.use_es: raise BadRequest('Can not make queries without ElasticSearch, enable system.elasticsearch to make queries.') validate_true(source_id, 'Unspecified source_id') validate_true(field, 'Unspecified field') validate_true(value, 'Unspecified value') es = ep.ElasticSearch(host=self.elasticsearch_host, port=self.elasticsearch_port) source = self.clients.resource_registry.read(source_id) #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # If source is a view, catalog or collection go through it and recursively call query_range on all the results in the indexes #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iterate = self._multi(self.query_term, source, field=field, value=value, order=order, limit=limit, offset=offset, id_only=id_only) if iterate is not None: return iterate index = source validate_is_instance(index, ElasticSearchIndex, '%s does not refer to a valid index.' 
% index) if order: validate_is_instance(order,dict, 'Order is incorrect.') es.sort(**order) if limit: es.size(limit) if offset: es.from_offset(offset) if field == '*': field = '_all' if fuzzy: query = ep.ElasticQuery.fuzzy_like_this(value, fields=[field]) elif match: match_query = ep.ElasticQuery.match(field=field,query=value) query = {"match_phrase_prefix":match_query['match']} elif '*' in value: query = ep.ElasticQuery.wildcard(field=field, value=value) else: query = ep.ElasticQuery.field(field=field, query=value) response = IndexManagementService._es_call(es.search_index_advanced,index.index_name,query) IndexManagementService._check_response(response) return self._results_from_response(response, id_only) def query_range(self, source_id='', field='', from_value=None, to_value=None, order=None, limit=0, offset=0, id_only=False): if not self.use_es: raise BadRequest('Can not make queries without ElasticSearch, enable in res/config/pyon.yml') if from_value is not None: validate_true(isinstance(from_value,int) or isinstance(from_value,float), 'from_value is not a valid number') if to_value is not None: validate_true(isinstance(to_value,int) or isinstance(to_value,float), 'to_value is not a valid number') validate_true(source_id, 'source_id not specified') es = ep.ElasticSearch(host=self.elasticsearch_host, port=self.elasticsearch_port) source = self.clients.resource_registry.read(source_id) #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # If source is a view, catalog or collection go through it and recursively call query_range on all the results in the indexes #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iterate = self._multi(self.query_range, source, field=field, from_value=from_value, to_value=to_value, order=order, limit=limit, offset=offset, id_only=id_only) if iterate is not None: return iterate index = 
source validate_is_instance(index,ElasticSearchIndex,'%s does not refer to a valid index.' % source_id) if order: validate_is_instance(order,dict,'Order is incorrect.') es.sort(**order) if limit: es.size(limit) if field == '*': field = '_all' query = ep.ElasticQuery.range( field = field, from_value = from_value, to_value = to_value ) response = IndexManagementService._es_call(es.search_index_advanced,index.index_name,query) IndexManagementService._check_response(response) return self._results_from_response(response, id_only) def query_time(self, source_id='', field='', from_value=None, to_value=None, order=None, limit=0, offset=0, id_only=False): if not self.use_es: raise BadRequest('Can not make queries without ElasticSearch, enable in res/config/pyon.yml') if from_value is not None: validate_is_instance(from_value,basestring,'"From" is not a valid string (%s)' % from_value) if to_value is not None: validate_is_instance(to_value,basestring,'"To" is not a valid string') es = ep.ElasticSearch(host=self.elasticsearch_host, port=self.elasticsearch_port) source = self.clients.resource_registry.read(source_id) #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # If source is a view, catalog or collection go through it and recursively call query_time on all the results in the indexes #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iterate = self._multi(self.query_time, source, field=field, from_value=from_value, to_value=to_value, order=order, limit=limit, offset=offset, id_only=id_only) if iterate is not None: return iterate index = source validate_is_instance(index,ElasticSearchIndex,'%s does not refer to a valid index.' 
% source_id) if order: validate_is_instance(order,dict,'Order is incorrect.') es.sort(**order) if limit: es.size(limit) if field == '*': field = '_all' if from_value is not None: from_value = calendar.timegm(dateutil.parser.parse(from_value).timetuple()) * 1000 if to_value is not None: to_value = calendar.timegm(dateutil.parser.parse(to_value).timetuple()) * 1000 query = ep.ElasticQuery.range( field = field, from_value = from_value, to_value = to_value ) response = IndexManagementService._es_call(es.search_index_advanced,index.index_name,query) IndexManagementService._check_response(response) return self._results_from_response(response, id_only) def query_time_bounds(self, source_id='', field='', from_value=None, to_value=None, order=None, limit=0, offset=0, id_only=False): if from_value is not None: validate_is_instance(from_value,basestring,'"From" is not a valid string (%s)' % from_value) if to_value is not None: validate_is_instance(to_value,basestring,'"To" is not a valid string') es = ep.ElasticSearch(host=self.elasticsearch_host, port=self.elasticsearch_port) source = self.clients.resource_registry.read(source_id) #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # If source is a view, catalog or collection go through it and recursively call query_time on all the results in the indexes #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iterate = self._multi(self.query_time, source, field=field, from_value=from_value, to_value=to_value, order=order, limit=limit, offset=offset, id_only=id_only) if iterate is not None: return iterate index = source validate_is_instance(index,ElasticSearchIndex,'%s does not refer to a valid index.' 
% source_id) if order: validate_is_instance(order,dict,'Order is incorrect.') es.sort(**order) if field == '*': field = '_all' start_time = 'start_datetime' end_time = 'end_datetime' else: start_time = '%s.start_datetime' % field end_time = '%s.end_datetime' % field if from_value is not None: from_value = calendar.timegm(dateutil.parser.parse(from_value).timetuple()) * 1000 if to_value is not None: to_value = calendar.timegm(dateutil.parser.parse(to_value).timetuple()) * 1000 query = { "query": { "match_all": {} }, "filter": { "and": [ { "or": [ { "range": { start_time: { "gte": from_value } } }, { "range": { end_time: { "gte": from_value } } } ] }, { "or": [ { "range": { start_time: { "lte": to_value } } }, { "range": { end_time: { "lte": to_value } } } ] } ] } } if limit: query['size'] = limit if offset: query['from'] = offset response = IndexManagementService._es_call(es.raw_query,'%s/_search' % index.index_name,method='POST', data=query, host=self.elasticsearch_host, port=self.elasticsearch_port) IndexManagementService._check_response(response) return self._results_from_response(response, id_only) def query_vertical_bounds(self, source_id='', field='', from_value=None, to_value=None, order=None, limit=0, offset=0, id_only=False): if from_value is not None: validate_is_instance(from_value,float,'"From" is not a valid float (%s)' % from_value) if to_value is not None: validate_is_instance(to_value,float,'"To" is not a valid float') es = ep.ElasticSearch(host=self.elasticsearch_host, port=self.elasticsearch_port) source = self.clients.resource_registry.read(source_id) #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # If source is a view, catalog or collection go through it and recursively call query_time on all the results in the indexes #- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - iterate = 
self._multi(self.query_time, source, field=field, from_value=from_value, to_value=to_value, order=order, limit=limit, offset=offset, id_only=id_only) if iterate is not None: return iterate index = source validate_is_instance(index,ElasticSearchIndex,'%s does not refer to a valid index.' % source_id) if order: validate_is_instance(order,dict,'Order is incorrect.') es.sort(**order) if field == '*': field = '_all' vertical_min = 'geospatial_vertical_min' vertical_max = 'geospatial_vertical_max' else: vertical_min = '%s.geospatial_vertical_min' % field vertical_max = '%s.geospatial_vertical_max' % field query = { "query": { "match_all": {} }, "filter": { "and": [ { "or": [ { "range": { vertical_min: { "gte": from_value } } }, { "range": { vertical_max: { "gte": from_value } } } ] }, { "or": [ { "range": { vertical_min: { "lte": to_value } } }, { "range": { vertical_max: { "lte": to_value } } } ] } ] } } if limit: query['size'] = limit if offset: query['from'] = offset response = IndexManagementService._es_call(es.raw_query,'%s/_search' % index.index_name,method='POST', data=query, host=self.elasticsearch_host, port=self.elasticsearch_port) IndexManagementService._check_response(response) retval= self._results_from_response(response, id_only) return retval def query_association(self,resource_id='', depth=0, id_only=False): validate_true(resource_id, 'Unspecified resource') if depth: resource_ids = self.iterative_traverse(resource_id, depth-1) else: resource_ids = self.traverse(resource_id) if id_only: return resource_ids if not isinstance(resource_ids, list): resource_ids = list(resource_ids) resources = self.clients.resource_registry.read_mult(resource_ids) return resources def query_owner(self, resource_id='', depth=0, id_only=False): validate_true(resource_id, 'Unspecified resource') if depth: resource_ids = self.iterative_traverse(resource_id, depth-1) else: resource_ids = self.reverse_traverse(resource_id) if id_only: return resource_ids if not 
isinstance(resource_ids, list): resource_ids = list(resource_ids) resources = self.clients.resource_registry.read_mult(resource_ids) return resources def query_collection(self,collection_id='', id_only=False): validate_true(collection_id, 'Unspecified collection id') resource_ids = self.clients.index_management.list_collection_resources(collection_id, id_only=True) if id_only: return resource_ids resources = map(self.clients.resource_registry.read,resource_ids) return resources def query_geo_distance(self, source_id='', field='', origin=None, distance='', units='mi',order=None, limit=0, offset=0, id_only=False): validate_true(isinstance(origin,(tuple,list)) , 'Origin is not a list or tuple.') validate_true(len(origin)==2, 'Origin is not of the right size: (2)') if not self.use_es: raise BadRequest('Can not make queries without ElasticSearch, enable in res/config/pyon.yml') es = ep.ElasticSearch(host=self.elasticsearch_host, port=self.elasticsearch_port) source = self.clients.resource_registry.read(source_id) iterate = self._multi(self.query_geo_distance, source=source, field=field, origin=origin, distance=distance) if iterate is not None: return iterate index = source validate_is_instance(index,ElasticSearchIndex, '%s does not refer to a valid index.' 
% index) sorts = ep.ElasticSort() if order is not None and isinstance(order,dict): sort_field = order.keys()[0] value = order[sort_field] sorts.sort(sort_field,value) es.sorted(sorts) if limit: es.size(limit) if offset: es.from_offset(offset) if field == '*': field = '_all' sorts.geo_distance(field, origin, units) es.sorted(sorts) filter = ep.ElasticFilter.geo_distance(field,origin, '%s%s' %(distance,units)) es.filtered(filter) query = ep.ElasticQuery.match_all() response = IndexManagementService._es_call(es.search_index_advanced,index.index_name,query) IndexManagementService._check_response(response) return self._results_from_response(response,id_only) def query_geo_bbox(self, source_id='', field='', top_left=None, bottom_right=None, order=None, limit=0, offset=0, id_only=False): validate_true(isinstance(top_left, (list,tuple)), 'Top Left is not a list or a tuple') validate_true(len(top_left)==2, 'Top Left is not of the right size: (2)') validate_true(isinstance(bottom_right, (list,tuple)), 'Bottom Right is not a list or a tuple') validate_true(len(bottom_right)==2, 'Bottom Right is not of the right size: (2)') if not self.use_es: raise BadRequest('Can not make queries without ElasticSearch, enable in res/config/pyon.yml') es = ep.ElasticSearch(host=self.elasticsearch_host, port=self.elasticsearch_port) source = self.clients.resource_registry.read(source_id) iterate = self._multi(self.query_geo_bbox, source=source, field=field, top_left=top_left, bottom_right=bottom_right, order=order, limit=limit, offset=offset, id_only=id_only) if iterate is not None: return iterate index = source validate_is_instance(index,ElasticSearchIndex, '%s does not refer to a valid index.' 
% index) sorts = ep.ElasticSort() if order is not None and isinstance(order,dict): sort_field = order.keys()[0] value = order[sort_field] sorts.sort(sort_field,value) es.sorted(sorts) if limit: es.size(limit) if offset: es.from_offset(offset) if field == '*': field = '_all' filter = ep.ElasticFilter.geo_bounding_box(field, top_left, bottom_right) es.filtered(filter) query = ep.ElasticQuery.match_all() response = IndexManagementService._es_call(es.search_index_advanced,index.index_name,query) IndexManagementService._check_response(response) return self._results_from_response(response,id_only) def es_complex_query(self,query,and_queries=None,or_queries=None): pass def es_map_query(self, query): ''' Maps an query request to an ElasticSearch query ''' if not self.use_es: raise BadRequest('Can not make queries without ElasticSearch, enable in res/config/pyon.yml') if QueryLanguage.query_is_term_search(query): return ep.ElasticQuery.wildcard(field=query['field'],value=query['value']) if QueryLanguage.query_is_range_search(query): return ep.ElasticQuery.range( field = query['field'], from_value = query['range']['from'], to_value = query['range']['to'] ) def request(self, query=None, id_only=True): if not query: raise BadRequest('No request query provided') if "QUERYEXP" in query and self.ds_discovery: # Support for datastore queries pass else: if not query.has_key('query'): raise BadRequest('Unsuported request. %s') #============================== # Check the form of the query #============================== #@todo: convert to IonObject if not (query.has_key('query')): raise BadRequest('Improper query request: %s' % query) # Inject RR query execution e.g. 
for postgres if self.ds_discovery: res = self.ds_discovery.execute_query(query, id_only=id_only) return res # --------------------------- # Number of results to return # --------------------------- limit = int(query.get('limit',self.MAX_SEARCH_RESULTS)) query_queue = list() query = DotDict(query) # -- former tier-1 (no and/or) search, returns an elasticsearch object if (len(query.get('and',[])) + len(query.get('or',[])) == 0 ): return self.query_request(query.query,limit=self.SEARCH_BUFFER_SIZE)[:limit] query_queue.append(self.query_request(query.query,limit=self.SEARCH_BUFFER_SIZE, id_only=True)) #================== # Intersection #================== for q in query.get('and',[]): query_queue.append(self.query_request(q, limit=self.SEARCH_BUFFER_SIZE, id_only=True)) while len(query_queue) > 1: tmp = self.intersect(query_queue.pop(), query_queue.pop()) query_queue.append(tmp) #================== # Union #================== for q in query.get('or',[]): query_queue.append(self.query_request(q, limit=self.SEARCH_BUFFER_SIZE, id_only=True)) while len(query_queue) > 1: tmp = self.union(query_queue.pop(), query_queue.pop()) query_queue.append(tmp) if id_only: return query_queue[0][:limit] objects = self.clients.resource_registry.read_mult(query_queue[0][:limit]) return objects def raise_search_buffer_exceeded(self): self.ep.publish_event(origin='Discovery Service', description='Search buffer was exceeded, results may not contain all the possible results.') def _results_from_response(self, response, id_only): deserializer = IonObjectDeserializer(obj_registry=get_obj_registry()) if not (response.has_key('hits') and response['hits'].has_key('hits')): return [] hits = response['hits']['hits'] if len(hits) > 0: if len(hits) >= self.SEARCH_BUFFER_SIZE: log.warning("Query results exceeded search buffer limitations") self.raise_search_buffer_exceeded() if id_only: return [str(i['_id']) for i in hits] results = map(deserializer.deserialize,hits) return results else: return []
class DiscoveryService(BaseDiscoveryService):
    """Discovery service backed by a datastore query engine.

    Accepts queries as DSL strings, structured dicts or stored View resources and
    executes them through DatastoreDiscovery. Also provides management operations
    for View resources.
    """

    # Read from configuration; 250 if not configured
    MAX_SEARCH_RESULTS = CFG.get_safe('service.discovery.max_search_results', 250)

    def on_start(self):
        """Verify the configured datastore and create the datastore discovery helper.

        @throws Exception if the default datastore server is not "postgresql"
        """
        super(DiscoveryService, self).on_start()

        cfg_datastore = CFG.get_safe('container.datastore.default_server')
        if cfg_datastore != "postgresql":
            raise Exception("Discovery service does not support datastores other than postgresql")

        self.ds_discovery = DatastoreDiscovery(self)

    #===================================================================
    # Query Methods
    #===================================================================

    def parse(self, search_request='', id_only=True, search_args=None):
        """Parses a given string request and assembles the query, processes the query and returns the results of the query.
        See the query language definition: https://confluence.oceanobservatories.org/display/CIDev/Discovery+Service+Query+Format

        @param search_request  str
        @param id_only         bool
        @param search_args     dict, also used as the parameter values of the query
        @retval results        list
        """
        log.info("Search DSL: %s", search_request)

        query_request = self._parse_query_string(search_request)
        return self._discovery_request(query_request, id_only=id_only,
                                       search_args=search_args, query_params=search_args)

    def _parse_query_string(self, query_string):
        """Given a query string in Discovery service DSL, parse and return query structure"""
        parser = QueryLanguage()
        query_request = parser.parse(query_string)
        return query_request

    def query(self, query=None, id_only=True, search_args=None):
        """Issue a query provided in structured dict format or internal datastore query format.
        Returns a list of resource or event objects or their IDs only.
        Search_args may contain parameterized values.
        See the query format definition: https://confluence.oceanobservatories.org/display/CIDev/Discovery+Service+Query+Format

        @param query        dict
        @param id_only      bool
        @param search_args  dict, also used as the parameter values of the query
        @retval results     list
        """
        validate_true(query, 'Invalid query')

        return self._discovery_request(query, id_only, search_args=search_args, query_params=search_args)

    def query_view(self, view_id='', view_name='', ext_query=None, id_only=True, search_args=None):
        """Execute an existing query as defined within a View resource, providing additional arguments for
        parameterized values.
        If ext_query is provided, it will be combined with the query defined by the View.
        Search_args may contain parameterized values.
        Returns a list of resource or event objects or their IDs only.

        @param view_id      ID of the View resource (exclusive with view_name)
        @param view_name    Name of a builtin view or of a View resource (exclusive with view_id)
        @param ext_query    Optional datastore query dict with "where", "order_by" and "query_args"
        @param id_only      bool
        @param search_args  dict of parameter values; overrides the View's defaults and values
        @retval results     list
        @throws BadRequest  on invalid arguments, a non-View resource or an unknown query format
        @throws NotFound    if no view with the given name exists
        """
        # Function-scope import keeps this block self-contained.
        import copy

        if not view_id and not view_name:
            raise BadRequest("Must provide argument view_id or view_name")
        if view_id and view_name:
            raise BadRequest("Cannot provide both arguments view_id and view_name")

        if view_id:
            view_obj = self.clients.resource_registry.read(view_id)
        else:
            # Builtin views take precedence over View resources of the same name
            view_obj = self.ds_discovery.get_builtin_view(view_name)
            if not view_obj:
                view_objs, _ = self.clients.resource_registry.find_resources(restype=RT.View, name=view_name)
                if not view_objs:
                    raise NotFound("View with name '%s' not found" % view_name)
                view_obj = view_objs[0]

        if view_obj.type_ != RT.View:
            raise BadRequest("Argument view_id is not a View resource")

        # Work on a copy: the query is modified below (and by _discovery_request), and
        # those modifications must not accumulate in the View's stored definition.
        view_query = copy.deepcopy(view_obj.view_definition)
        if not QUERY_EXP_KEY in view_query:
            raise BadRequest("Unknown View query format")

        # Get default query params and override them with provided args
        param_defaults = {param.name: param.default for param in view_obj.view_parameters}
        query_params = param_defaults
        if view_obj.param_values:
            query_params.update(view_obj.param_values)
        if search_args:
            query_params.update(search_args)

        # Merge ext_query into query
        if ext_query:
            if ext_query["where"] and view_query["where"]:
                view_query["where"] = [DQ.EXP_AND, [view_query["where"], ext_query["where"]]]
            else:
                view_query["where"] = view_query["where"] or ext_query["where"]
            if ext_query["order_by"]:
                # Override ordering if present (this used to overwrite the "where" clause)
                view_query["order_by"] = ext_query["order_by"]

            # Other query settings
            view_qargs = view_query["query_args"]
            ext_qargs = ext_query["query_args"]
            view_qargs["id_only"] = ext_qargs.get("id_only", view_qargs["id_only"])
            view_qargs["limit"] = ext_qargs.get("limit", view_qargs["limit"])
            view_qargs["skip"] = ext_qargs.get("skip", view_qargs["skip"])

        return self._discovery_request(view_query, id_only=id_only,
                                       search_args=search_args, query_params=query_params)

    def _discovery_request(self, query=None, id_only=True, search_args=None, query_params=None):
        """Normalize a query, execute it against the datastore and post-process the results.

        @param query         Datastore query dict (contains QUERY_EXP_KEY), a dict with "QUERYDSL"
                             and "query_str", or a dict with a 'query' entry
        @param id_only       bool
        @param search_args   dict; "count" and "attribute_filter" are interpreted here, the
                             dict is also passed to the datastore as query_args
        @param query_params  dict of parameter values passed to the datastore
        @retval results list; a one-element list holding the number of matches if
                search_args["count"] is set
        @throws BadRequest if the query is empty or not in a supported format
        """
        search_args = search_args or {}
        if not query:
            raise BadRequest('No request query provided')

        if QUERY_EXP_KEY in query and self.ds_discovery:
            query.setdefault("query_args", {})["id_only"] = id_only
            # Query in datastore query format (dict)
            log.debug("Executing datastore query: %s", query)

        elif "QUERYDSL" in query:
            # Query in DSL format
            if "query_str" not in query:
                raise BadRequest('No query_str provided')
            query = self._parse_query_string(query["query_str"])

        elif 'query' not in query:
            raise BadRequest('Unsupported request. %s' % query)

        # if count requested, run id_only query without limit/skip
        count = search_args.get("count", False)
        if count:
            # Only return the count of ID only search
            # NOTE(review): only top-level limit/skip entries are removed; values under
            # query["query_args"] stay in place - confirm that is intended.
            query.pop("limit", None)
            query.pop("skip", None)
            res = self.ds_discovery.execute_query(query, id_only=True,
                                                  query_args=search_args, query_params=query_params)
            return [len(res)]

        # TODO: Not all queries are permissible by all users

        # Execute the query
        query_results = self.ds_discovery.execute_query(query, id_only=id_only,
                                                        query_args=search_args, query_params=query_params)

        # Strip out unwanted object attributes for size
        filtered_res = self._strip_query_results(query_results, id_only=id_only, search_args=search_args)

        return filtered_res

    def _strip_query_results(self, query_results, id_only, search_args):
        """Reduce result objects to the attributes named in search_args["attribute_filter"].

        Only applies to object results with a non-empty filter; "_id" and "type_" are
        always kept and each reduced result is a dict marked with __noion__=True.

        @param query_results  list of result objects or IDs
        @param id_only        bool; ID results are returned unchanged
        @param search_args    dict or None
        @retval list of reduced dicts, or query_results unchanged
        @throws BadRequest if attribute_filter is not a list or tuple
        """
        search_args = search_args or {}

        # Filter the results for smaller result size
        attr_filter = search_args.get("attribute_filter", [])
        if not isinstance(attr_filter, (list, tuple)):
            raise BadRequest("Illegal argument type: attribute_filter")

        if id_only or not attr_filter:
            return query_results

        always_kept = {"_id", "type_"}
        return [dict(__noion__=True, **{k: v for k, v in obj.__dict__.items()
                                        if k in attr_filter or k in always_kept})
                for obj in query_results]

    #===================================================================
    # View Management
    #===================================================================

    def create_view(self, view=None):
        """Create a View resource.

        @param view  View object
        @retval ID of the new resource
        @throws BadRequest if view is not a View
        """
        if view is None or not isinstance(view, View):
            raise BadRequest("Illegal argument: view")

        # Name uniqueness is currently not enforced:
        # view_objs, _ = self.clients.resource_registry.find_resources(restype=RT.View, name=view.name)
        # if view_objs:
        #     raise BadRequest("View with name '%s' already exists" % view.name)

        view_id, _ = self.clients.resource_registry.create(view)
        return view_id

    def read_view(self, view_id=''):
        """Read a View resource.

        @param view_id  ID of the resource
        @retval View object
        @throws BadRequest if the resource is not a View
        """
        view_res = self.clients.resource_registry.read(view_id)
        if not isinstance(view_res, View):
            raise BadRequest("Resource %s is not a View" % view_id)
        return view_res

    def update_view(self, view=None):
        """Update a View resource.

        @param view  View object
        @retval True
        @throws BadRequest if view is not a View
        """
        if view is None or not isinstance(view, View):
            raise BadRequest("Illegal argument: view")

        self.clients.resource_registry.update(view)
        return True

    def delete_view(self, view_id=''):
        """Delete the resource with the given ID from the resource registry.

        @param view_id  ID of the resource
        @retval True
        """
        self.clients.resource_registry.delete(view_id)
        return True

    def create_catalog_view(self, view_name='', description='', fields=None, order=None, filters=''):
        """Creates a view which has the specified search fields, the order in which the search fields are presented
        to a query and a term filter.

        An existing catalog is reused when its fields are a subset of the requested
        fields and it has fewer than 4 indexes; otherwise a new catalog is created.

        @param view_name    Name of the view
        @param description  Simple descriptive sentence
        @param fields       Search fields
        @param order        List of fields to determine order of precedence in which the results are presented
        @param filters      Simple term filter
        @retval ID of the new view
        @throws BadRequest if a resource with that name exists or an order field is not a search field
        """
        res, _ = self.clients.resource_registry.find_resources(name=view_name, id_only=True)
        if len(res) > 0:
            raise BadRequest('The view resource with name: %s, already exists.' % view_name)

        #======================
        # Arg Validations
        #======================
        validate_is_instance(fields, list, 'Specified fields must be a list.')
        validate_true(len(fields) > 0, 'Specified fields must not be empty.')
        if order is not None:
            validate_is_instance(order, list, 'Specified order must be a list of fields')
            for field in order:
                if not field in fields:
                    raise BadRequest('The specified ordering field was not part of the search fields.')

        fields = set(fields)  # Convert fields to a set for aggregation across the catalogs

        #======================
        # Catalog Matching
        #======================
        # Collect (number of indexes, catalog) for every catalog covered by the requested fields
        candidates = []
        catalogs, _ = self.clients.resource_registry.find_resources(restype=RT.Catalog, id_only=False)
        for catalog in catalogs:
            if set(catalog.catalog_fields).issubset(fields):
                index_num = len(self.clients.catalog_management.list_indexes(catalog._id))
                candidates.append((index_num, catalog))

        catalog_id = None
        if candidates:
            # Select on the index count only, so ties never compare catalog objects
            weight, catalog = min(candidates, key=lambda entry: entry[0])
            if weight < 4:
                catalog_id = catalog._id

        if catalog_id is None:
            catalog_id = self.clients.catalog_management.create_catalog('%s_catalog' % view_name, keywords=list(fields))

        view_res = View(name=view_name, description=description)
        view_res.order = order
        view_res.filters = filters
        view_id, _ = self.clients.resource_registry.create(view_res)
        self.clients.resource_registry.create_association(subject=view_id, predicate=PRED.hasCatalog, object=catalog_id)
        return view_id