def test_parse_as_tabular(self):
    # with single agg at root
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_tabular(row_as_tuple=True)

    self.assertEqual(
        index_names, ["classification_type", "global_metrics.field.name"]
    )
    self.assertEqual(
        index_values,
        [
            (
                ("multilabel", "ispracticecompatible"),
                {"avg_f1_micro": 0.72, "avg_nb_classes": 18.71, "doc_count": 128},
            ),
            (
                ("multilabel", "gpc"),
                {"avg_f1_micro": 0.95, "avg_nb_classes": 183.21, "doc_count": 119},
            ),
            (
                ("multilabel", "preservationmethods"),
                {"avg_f1_micro": 0.8, "avg_nb_classes": 9.97, "doc_count": 76},
            ),
            (
                ("multiclass", "kind"),
                {"avg_f1_micro": 0.89, "avg_nb_classes": 206.5, "doc_count": 370},
            ),
            (
                ("multiclass", "gpc"),
                {"avg_f1_micro": 0.93, "avg_nb_classes": 211.12, "doc_count": 198},
            ),
        ],
    )
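# For orientation: sample.ES_AGG_RESPONSE (not reproduced here) follows
# Elasticsearch's standard nested "terms" bucket layout. A minimal sketch of
# that shape, with illustrative values only:
ES_AGG_RESPONSE_SHAPE = {
    "classification_type": {
        "buckets": [
            {
                "key": "multilabel",
                "doc_count": 323,
                "global_metrics.field.name": {
                    "buckets": [
                        {
                            "key": "ispracticecompatible",
                            "doc_count": 128,
                            "avg_nb_classes": {"value": 18.71},
                            "avg_f1_micro": {"value": 0.72},
                        },
                        # ... one bucket per field name
                    ]
                },
            },
            # ... one bucket per classification type
        ]
    }
}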
def get_wrapper_declared_agg():
    return (
        Aggs(mapping=MAPPING)
        .groupby(["classification_type", "global_metrics.field.name"])
        .aggs(
            [
                Avg("avg_nb_classes", field="global_metrics.dataset.nb_classes"),
                Avg(
                    "avg_f1_micro",
                    field="global_metrics.performance.test.micro.f1_score",
                ),
            ]
        )
    )
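# The fluent declaration above should serialize to the same body as
# sample.EXPECTED_AGG_QUERY: groupby() nests one terms aggregation under the
# other, and aggs() attaches both metrics at the deepest level. Presumably
# something like (variable name is illustrative):
EXPECTED_SERIALIZATION_SKETCH = {
    "classification_type": {
        "terms": {"field": "classification_type"},
        "aggs": {
            "global_metrics.field.name": {
                "terms": {"field": "global_metrics.field.name"},
                "aggs": {
                    "avg_nb_classes": {
                        "avg": {"field": "global_metrics.dataset.nb_classes"}
                    },
                    "avg_f1_micro": {
                        "avg": {
                            "field": "global_metrics.performance.test.micro.f1_score"
                        }
                    },
                },
            }
        },
    }
}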
def test_parse_as_tabular_multiple_roots(self):
    # with multiple aggs at root
    my_agg = Aggs(
        {
            "classification_type": {"terms": {"field": "classification_type"}},
            "avg_f1_score": {
                "avg": {"field": "global_metrics.performance.test.micro.f1_score"}
            },
        }
    )
    raw_response = {
        "classification_type": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {"key": "multiclass", "doc_count": 439},
                {"key": "multilabel", "doc_count": 433},
            ],
        },
        "avg_f1_score": {"value": 0.815},
    }
    index_names, index_values = Aggregations(
        data=raw_response,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_tabular(row_as_tuple=True, expand_sep=" || ")

    self.assertEqual(index_names, [])
    self.assertEqual(
        index_values,
        [
            (
                (),
                {
                    "avg_f1_score": 0.815,
                    "classification_type || multiclass": 439,
                    "classification_type || multilabel": 433,
                },
            )
        ],
    )
def test_parse_as_dataframe(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    df = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_dataframe()

    self.assertIsInstance(df, pd.DataFrame)
    self.assertEqual(
        set(df.index.names), {"classification_type", "global_metrics.field.name"}
    )
    self.assertEqual(
        set(df.columns), {"avg_f1_micro", "avg_nb_classes", "doc_count"}
    )
    self.assertEqual(
        df.index.to_list(),
        [
            ("multilabel", "ispracticecompatible"),
            ("multilabel", "gpc"),
            ("multilabel", "preservationmethods"),
            ("multiclass", "kind"),
            ("multiclass", "gpc"),
        ],
    )
    # "records" is the canonical orient for one dict per row ("rows" only
    # worked through pandas' old prefix matching and is rejected nowadays)
    self.assertEqual(
        df.to_dict(orient="records"),
        [
            {"avg_f1_micro": 0.72, "avg_nb_classes": 18.71, "doc_count": 128},
            {"avg_f1_micro": 0.95, "avg_nb_classes": 183.21, "doc_count": 119},
            {"avg_f1_micro": 0.8, "avg_nb_classes": 9.97, "doc_count": 76},
            {"avg_f1_micro": 0.89, "avg_nb_classes": 206.5, "doc_count": 370},
            {"avg_f1_micro": 0.93, "avg_nb_classes": 211.12, "doc_count": 198},
        ],
    )
def test_normalize_buckets(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    response = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_normalized()
    self.assertEqual(ordered(response), ordered(sample.EXPECTED_NORMALIZED_RESPONSE))
def test_parse_as_tree(self, *_):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    response = Aggregations(
        data=sample.ES_AGG_RESPONSE,
        aggs=my_agg,
        index=None,
        client=None,
        query=None,
    ).serialize_as_tree()
    self.assertIsInstance(response, AggsResponseTree)
    self.assertEqual(response.__str__(), sample.EXPECTED_RESPONSE_TREE_REPR)
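# Taken together, the four tests above cover every serialization format this
# version of Aggregations exposes; condensed usage sketch (same constructor
# arguments as in the tests):
#
#     agg_response = Aggregations(
#         data=es_response, aggs=my_agg, index=None, client=None, query=None
#     )
#     agg_response.serialize_as_tabular(row_as_tuple=True)  # (index_names, rows)
#     agg_response.serialize_as_dataframe()                 # pandas DataFrame
#     agg_response.serialize_as_normalized()                # normalized dict
#     agg_response.serialize_as_tree()                      # AggsResponseTree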
def __init__(self, using=None, index=None, mapping=None):
    """
    Search request to elasticsearch.

    :arg using: `Elasticsearch` instance to use
    :arg index: limit the search to index
    :arg mapping: mapping used for query validation

    All the parameters supplied (or omitted) at creation time can be later
    overridden by methods (`using`, `index` and `mapping` respectively).
    """
    self._sort = []
    self._source = None
    self._highlight = {}
    self._highlight_opts = {}
    self._suggest = {}
    self._script_fields = {}
    mapping = Mapping(mapping)
    self._mapping = mapping
    self._aggs = Aggs(mapping=mapping)
    self._query = Query(mapping=mapping)
    self._post_filter = Query(mapping=mapping)
    super(Search, self).__init__(using=using, index=index)
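# Minimal end-to-end sketch of this constructor in use. Index name and client
# are placeholders; the ('type', **body) calling convention is the one shown
# in from_dict's docstring example (`s.filter('term', published=True)`):
#
#     from elasticsearch import Elasticsearch
#
#     client = Elasticsearch(hosts=["localhost:9200"])
#     s = (
#         Search(using=client, index="movies", mapping=MAPPING)
#         .filter("term", published=True)
#         .size(10)
#     )
#     response = s.execute()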
def test_grouping_agg(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    agg_response = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    )

    # none provided
    self.assertIsNone(agg_response._grouping_agg()[0])

    # fake provided
    with self.assertRaises(KeyError):
        agg_response._grouping_agg("yolo")

    # not bucket provided
    with self.assertRaises(ValueError):
        agg_response._grouping_agg("avg_f1_micro")

    # real provided
    self.assertEqual(
        agg_response._grouping_agg("global_metrics.field.name")[0],
        "global_metrics.field.name",
    )
def test_response_tree(self, uuid_mock):
    uuid_mock.side_effect = range(1000)
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    response_tree = AggsResponseTree(aggs=my_agg, index=None).parse(
        sample.ES_AGG_RESPONSE
    )

    self.assertEqual(response_tree.__str__(), sample.EXPECTED_RESPONSE_TREE_REPR)
    self.assertEqual(len(response_tree.list()), 18)

    multilabel_gpc_bucket = next(
        b
        for b in response_tree.list()
        if b.level == "global_metrics.field.name" and b.key == "gpc"
    )

    # bucket properties will give parents levels and keys
    self.assertEqual(
        response_tree.bucket_properties(multilabel_gpc_bucket),
        OrderedDict(
            [
                ("global_metrics.field.name", "gpc"),
                ("classification_type", "multilabel"),
            ]
        ),
    )
def __init__(
    self,
    using: Optional[Elasticsearch] = None,
    index: Optional[Union[str, Tuple[str], List[str]]] = None,
    mappings: Optional[Union[MappingsDict, Mappings]] = None,
    nested_autocorrect: bool = False,
    repr_auto_execute: bool = False,
    document_class: Optional[DocumentMeta] = None,
) -> None:
    """
    Search request to elasticsearch.

    :arg using: `Elasticsearch` instance to use
    :arg index: limit the search to index
    :arg mappings: mappings used for query validation
    :arg nested_autocorrect: in case of a missing nested clause, insert it
        automatically
    :arg repr_auto_execute: execute query and display results as a dataframe,
        requires a client to be provided

    All the parameters supplied (or omitted) at creation time can be later
    overridden by methods (`using`, `index` and `mappings` respectively).
    """
    self._sort: List[Union[str, Dict[str, Any]]] = []
    self._source: Any = None
    self._highlight: Dict[str, Any] = {}
    self._highlight_opts: Dict[str, Any] = {}
    self._suggest: Dict[str, Any] = {}
    self._script_fields: Dict[str, Any] = {}
    mappings = _mappings(mappings)
    self._mappings: Optional[Mappings] = mappings
    self._aggs: Aggs = Aggs(
        mappings=mappings, nested_autocorrect=nested_autocorrect
    )
    self._query: Query = Query(
        mappings=mappings, nested_autocorrect=nested_autocorrect
    )
    self._post_filter: Query = Query(
        mappings=mappings, nested_autocorrect=nested_autocorrect
    )
    self._repr_auto_execute: bool = repr_auto_execute
    self._document_class: Optional[DocumentMeta] = document_class
    super(Search, self).__init__(using=using, index=index)
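# Sketch of the two behaviour flags added in this version (the mappings dict
# below is illustrative):
#
#     mappings = {
#         "properties": {
#             "actors": {
#                 "type": "nested",
#                 "properties": {"name": {"type": "keyword"}},
#             }
#         }
#     }
#
#     # with nested_autocorrect, a clause on "actors.name" is wrapped in the
#     # missing "nested" clause instead of raising an error
#     s = Search(mappings=mappings, nested_autocorrect=True).query(
#         "term", **{"actors.name": "Brad Pitt"}
#     )
#
#     # with repr_auto_execute (and a client), repr(s) executes the request
#     # and renders hits -- or aggregations if any -- as a pandas DataFrame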
def test_parse_as_dataframe(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    df = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_dataframe(grouped_by="global_metrics.field.name")

    self.assertIsInstance(df, pd.DataFrame)
    self.assertEqual(
        set(df.index.names), {"classification_type", "global_metrics.field.name"}
    )
    self.assertEqual(
        set(df.columns), {"avg_f1_micro", "avg_nb_classes", "doc_count"}
    )
    self.assertEqual(
        df.to_dict(orient="index"),
        {
            ("multiclass", "gpc"): {
                "avg_f1_micro": 0.93,
                "avg_nb_classes": 211.12,
                "doc_count": 198,
            },
            ("multiclass", "kind"): {
                "avg_f1_micro": 0.89,
                "avg_nb_classes": 206.5,
                "doc_count": 370,
            },
            ("multilabel", "ispracticecompatible"): {
                "avg_f1_micro": 0.72,
                "avg_nb_classes": 18.71,
                "doc_count": 128,
            },
            ("multilabel", "preservationmethods"): {
                "avg_f1_micro": 0.8,
                "avg_nb_classes": 9.97,
                "doc_count": 76,
            },
        },
    )
def test_client_bound_response(self, uuid_mock):
    uuid_mock.side_effect = range(1000)
    client_mock = Mock(spec=["search"])
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mapping=MAPPING)
    response_tree = AggsResponseTree(aggs=my_agg, index=None).parse(
        sample.ES_AGG_RESPONSE
    )
    response = IResponse(
        client=client_mock,
        tree=response_tree,
        index_name="some_index",
        depth=1,
        query={"term": {"some_field": 1}},
    )

    # ensure that navigation to attributes works with autocompletion
    # (dir is used in ipython)
    self.assertIn("classification_type_multiclass", dir(response))
    self.assertIn("classification_type_multilabel", dir(response))

    multilabel = response.classification_type_multilabel
    self.assertIsInstance(multilabel, IResponse)
    self.assertIs(multilabel._initial_tree, response._tree)

    self.assertIn("global_metrics_field_name_gpc", dir(multilabel))
    gpc = multilabel.global_metrics_field_name_gpc
    self.assertIsInstance(gpc, IResponse)
    self.assertIs(gpc._initial_tree, response._tree)

    # test filter query used to list documents belonging to bucket
    self.assertTrue(
        equal_queries(
            gpc.get_bucket_filter(),
            {
                "bool": {
                    "must": [
                        {"term": {"global_metrics.field.name": {"value": "gpc"}}},
                        {"term": {"classification_type": {"value": "multilabel"}}},
                        {"term": {"some_field": {"value": 1}}},
                    ]
                }
            },
        )
    )
def test_parse_as_tabular(self):
    # with single agg at root
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_tabular(index_orient=True, grouped_by="global_metrics.field.name")

    self.assertEqual(
        index_names, ["classification_type", "global_metrics.field.name"]
    )
    self.assertEqual(
        index_values,
        {
            ("multilabel", "ispracticecompatible"): {
                "avg_f1_micro": 0.72,
                "avg_nb_classes": 18.71,
                "doc_count": 128,
            },
            ("multilabel", "preservationmethods"): {
                "avg_f1_micro": 0.8,
                "avg_nb_classes": 9.97,
                "doc_count": 76,
            },
            ("multiclass", "kind"): {
                "avg_f1_micro": 0.89,
                "avg_nb_classes": 206.5,
                "doc_count": 370,
            },
            ("multiclass", "gpc"): {
                "avg_f1_micro": 0.93,
                "avg_nb_classes": 211.12,
                "doc_count": 198,
            },
        },
    )

    # index_orient = False
    index_names, index_values = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_tabular(index_orient=False, grouped_by="global_metrics.field.name")

    self.assertEqual(
        index_names, ["classification_type", "global_metrics.field.name"]
    )
    self.assertEqual(
        index_values,
        [
            {
                "avg_f1_micro": 0.72,
                "avg_nb_classes": 18.71,
                "classification_type": "multilabel",
                "doc_count": 128,
                "global_metrics.field.name": "ispracticecompatible",
            },
            {
                "avg_f1_micro": 0.8,
                "avg_nb_classes": 9.97,
                "classification_type": "multilabel",
                "doc_count": 76,
                "global_metrics.field.name": "preservationmethods",
            },
            {
                "avg_f1_micro": 0.89,
                "avg_nb_classes": 206.5,
                "classification_type": "multiclass",
                "doc_count": 370,
                "global_metrics.field.name": "kind",
            },
            {
                "avg_f1_micro": 0.93,
                "avg_nb_classes": 211.12,
                "classification_type": "multiclass",
                "doc_count": 198,
                "global_metrics.field.name": "gpc",
            },
        ],
    )
def test_parse_as_tabular_multiple_roots(self):
    # with multiple aggs at root
    my_agg = Aggs(
        {
            "classification_type": {"terms": {"field": "classification_type"}},
            "avg_f1_score": {
                "avg": {"field": "global_metrics.performance.test.micro.f1_score"}
            },
        }
    )
    raw_response = {
        "classification_type": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {"key": "multiclass", "doc_count": 439},
                {"key": "multilabel", "doc_count": 433},
            ],
        },
        "avg_f1_score": {"value": 0.815},
    }

    index_names, index_values = Aggregations(
        data=raw_response, _search=Search().aggs(my_agg)
    ).to_tabular(index_orient=True, expand_sep=" || ")

    self.assertEqual(index_names, [])
    self.assertEqual(
        index_values,
        {
            (): {
                "avg_f1_score": 0.815,
                "classification_type || multiclass": 439,
                "classification_type || multilabel": 433,
            }
        },
    )

    # with specified grouped_by
    index_names, index_values = Aggregations(
        data=raw_response, _search=Search().aggs(my_agg)
    ).to_tabular(grouped_by="classification_type")

    self.assertEqual(index_names, ["classification_type"])
    self.assertEqual(
        index_values,
        {("multiclass",): {"doc_count": 439}, ("multilabel",): {"doc_count": 433}},
    )
class Search(Request):

    def __init__(self, using=None, index=None, mapping=None):
        """
        Search request to elasticsearch.

        :arg using: `Elasticsearch` instance to use
        :arg index: limit the search to index
        :arg mapping: mapping used for query validation

        All the parameters supplied (or omitted) at creation time can be later
        overridden by methods (`using`, `index` and `mapping` respectively).
        """
        self._sort = []
        self._source = None
        self._highlight = {}
        self._highlight_opts = {}
        self._suggest = {}
        self._script_fields = {}
        mapping = Mapping(mapping)
        self._mapping = mapping
        self._aggs = Aggs(mapping=mapping)
        self._query = Query(mapping=mapping)
        self._post_filter = Query(mapping=mapping)
        super(Search, self).__init__(using=using, index=index)

    def query(self, *args, **kwargs):
        s = self._clone()
        s._query = s._query.query(*args, **kwargs)
        return s

    query.__doc__ = Query.query.__doc__

    def filter(self, *args, **kwargs):
        s = self._clone()
        s._query = s._query.filter(*args, **kwargs)
        return s

    filter.__doc__ = Query.filter.__doc__

    def must_not(self, *args, **kwargs):
        s = self._clone()
        s._query = s._query.must_not(*args, **kwargs)
        return s

    must_not.__doc__ = Query.must_not.__doc__

    def should(self, *args, **kwargs):
        s = self._clone()
        s._query = s._query.should(*args, **kwargs)
        return s

    should.__doc__ = Query.should.__doc__

    def must(self, *args, **kwargs):
        s = self._clone()
        s._query = s._query.must(*args, **kwargs)
        return s

    must.__doc__ = Query.must.__doc__

    def aggs(self, *args, **kwargs):
        s = self._clone()
        s._aggs = s._aggs.aggs(*args, **kwargs)
        return s

    aggs.__doc__ = Aggs.aggs.__doc__

    def groupby(self, *args, **kwargs):
        s = self._clone()
        s._aggs = s._aggs.groupby(*args, **kwargs)
        return s

    groupby.__doc__ = Aggs.groupby.__doc__

    def __iter__(self):
        """
        Iterate over the hits.
        """
        return iter(self.execute())

    def __getitem__(self, n):
        """
        Support slicing the `Search` instance for pagination.

        Slicing equates to the from/size parameters. E.g.::

            s = Search().query(...)[0:25]

        is equivalent to::

            s = Search().query(...).params(from=0, size=25)
        """
        s = self._clone()
        if isinstance(n, slice):
            # If negative slicing, abort.
            if n.start and n.start < 0 or n.stop and n.stop < 0:
                raise ValueError("Search does not support negative slicing.")
            # Elasticsearch won't get all results so we default to size: 10 if
            # stop not given.
            s._params["from"] = n.start or 0
            s._params["size"] = n.stop - (n.start or 0) if n.stop is not None else 10
            return s
        else:
            # This is an index lookup, equivalent to slicing by [n:n+1].
            # If negative index, abort.
            if n < 0:
                raise ValueError("Search does not support negative indexing.")
            s._params["from"] = n
            s._params["size"] = 1
            return s

    def size(self, size):
        """
        Equivalent to::

            s = Search().params(size=size)
        """
        s = self._clone()
        s._params["size"] = size
        return s

    @classmethod
    def from_dict(cls, d):
        """
        Construct a new `Search` instance from a raw dict containing the search
        body. Useful when migrating from raw dictionaries.

        Example::

            s = Search.from_dict({
                "query": {
                    "bool": {
                        "must": [...]
                    }
                },
                "aggs": {...}
            })
            s = s.filter('term', published=True)
        """
        s = cls()
        s.update_from_dict(d)
        return s

    def _clone(self):
        """
        Return a clone of the current search request. Performs a shallow copy
        of all the underlying objects. Used internally by most state modifying
        APIs.
""" s = self.__class__(using=self._using, index=self._index, mapping=self._mapping) s._params = self._params.copy() s._sort = self._sort[:] s._source = copy.copy( self._source) if self._source is not None else None s._highlight = self._highlight.copy() s._highlight_opts = self._highlight_opts.copy() s._suggest = self._suggest.copy() s._script_fields = self._script_fields.copy() s._aggs = self._aggs.clone() s._query = self._query.clone() s._post_filter = self._post_filter.clone() s._mapping = self._mapping.clone() return s def update_from_dict(self, d): """ Apply options from a serialized body to the current instance. Modifies the object in-place. Used mostly by ``from_dict``. """ d = d.copy() if "query" in d: self._query = Query(d.pop("query")) if "post_filter" in d: self._post_filter = Query(d.pop("post_filter")) aggs = d.pop("aggs", d.pop("aggregations", {})) if aggs: self._aggs = Aggs(aggs) if "sort" in d: self._sort = d.pop("sort") if "_source" in d: self._source = d.pop("_source") if "highlight" in d: high = d.pop("highlight").copy() self._highlight = high.pop("fields") self._highlight_opts = high if "suggest" in d: self._suggest = d.pop("suggest") if "text" in self._suggest: text = self._suggest.pop("text") for s in self._suggest.values(): s.setdefault("text", text) if "script_fields" in d: self._script_fields = d.pop("script_fields") self._params.update(d) return self def script_fields(self, **kwargs): """ Define script fields to be calculated on hits. See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-script-fields.html for more details. Example:: s = Search() s = s.script_fields(times_two="doc['field'].value * 2") s = s.script_fields( times_three={ 'script': { 'inline': "doc['field'].value * params.n", 'params': {'n': 3} } } ) """ s = self._clone() for name in kwargs: if isinstance(kwargs[name], string_types): kwargs[name] = {"script": kwargs[name]} s._script_fields.update(kwargs) return s def source(self, fields=None, **kwargs): """ Selectively control how the _source field is returned. :arg fields: wildcard string, array of wildcards, or dictionary of includes and excludes If ``fields`` is None, the entire document will be returned for each hit. If fields is a dictionary with keys of 'includes' and/or 'excludes' the fields will be either included or excluded appropriately. Calling this multiple times with the same named parameter will override the previous values with the new ones. Example:: s = Search() s = s.source(includes=['obj1.*'], excludes=["*.description"]) s = Search() s = s.source(includes=['obj1.*']).source(excludes=["*.description"]) """ s = self._clone() if fields and kwargs: raise ValueError( "You cannot specify fields and kwargs at the same time.") if fields is not None: s._source = fields return s if kwargs and not isinstance(s._source, dict): s._source = {} for key, value in kwargs.items(): if value is None: try: del s._source[key] except KeyError: pass else: s._source[key] = value return s def sort(self, *keys): """ Add sorting information to the search request. If called without arguments it will remove all sort requirements. Otherwise it will replace them. Acceptable arguments are:: 'some.field' '-some.other.field' {'different.field': {'any': 'dict'}} so for example:: s = Search().sort( 'category', '-title', {"price" : {"order" : "asc", "mode" : "avg"}} ) will sort by ``category``, ``title`` (in descending order) and ``price`` in ascending order using the ``avg`` mode. 
        The API returns a copy of the Search object and can thus be chained.
        """
        s = self._clone()
        s._sort = []
        for k in keys:
            if isinstance(k, string_types) and k.startswith("-"):
                if k[1:] == "_score":
                    raise ValueError("Sorting by `-_score` is not allowed.")
                k = {k[1:]: {"order": "desc"}}
            s._sort.append(k)
        return s

    def highlight_options(self, **kwargs):
        """
        Update the global highlighting options used for this request. For
        example::

            s = Search()
            s = s.highlight_options(order='score')
        """
        s = self._clone()
        s._highlight_opts.update(kwargs)
        return s

    def highlight(self, *fields, **kwargs):
        """
        Request highlighting of some fields. All keyword arguments passed in
        will be used as parameters for all the fields in the ``fields``
        parameter. Example::

            Search().highlight('title', 'body', fragment_size=50)

        will produce the equivalent of::

            {
                "highlight": {
                    "fields": {
                        "body": {"fragment_size": 50},
                        "title": {"fragment_size": 50}
                    }
                }
            }

        If you want to have different options for different fields you can
        call ``highlight`` twice::

            Search().highlight('title', fragment_size=50).highlight('body', fragment_size=100)

        which will produce::

            {
                "highlight": {
                    "fields": {
                        "body": {"fragment_size": 100},
                        "title": {"fragment_size": 50}
                    }
                }
            }
        """
        s = self._clone()
        for f in fields:
            s._highlight[f] = kwargs
        return s

    def suggest(self, name, text, **kwargs):
        """
        Add a suggestions request to the search.

        :arg name: name of the suggestion
        :arg text: text to suggest on

        All keyword arguments will be added to the suggestions body. For
        example::

            s = Search()
            s = s.suggest('suggestion-1', 'Elasticsearch', term={'field': 'body'})
        """
        s = self._clone()
        s._suggest[name] = {"text": text}
        s._suggest[name].update(kwargs)
        return s

    def to_dict(self, count=False, **kwargs):
        """
        Serialize the search into the dictionary that will be sent over as the
        request's body.

        :arg count: a flag to specify if we are interested in a body for count
            - no aggregations, no pagination bounds etc.

        All additional keyword arguments will be included into the dictionary.
        """
        d = {}
        if self._query:
            d["query"] = self._query.to_dict()
        # count request doesn't care for sorting and other things
        if not count:
            if self._post_filter:
                d["post_filter"] = self._post_filter.to_dict()
            if self._aggs:
                d["aggs"] = self._aggs.to_dict()
            if self._sort:
                d["sort"] = self._sort
            d.update(self._params)
            if self._source not in (None, {}):
                d["_source"] = self._source
            if self._highlight:
                d["highlight"] = {"fields": self._highlight}
                d["highlight"].update(self._highlight_opts)
            if self._suggest:
                d["suggest"] = self._suggest
            if self._script_fields:
                d["script_fields"] = self._script_fields
        d.update(kwargs)
        return d

    def count(self):
        """
        Return the number of hits matching the query and filters. Note that
        only the actual number is returned.
        """
        es = get_connection(self._using)
        d = self.to_dict(count=True)
        return es.count(index=self._index, body=d)["count"]

    def execute(self):
        """
        Execute the search and return an instance of ``Response`` wrapping all
        the data.
        """
        es = get_connection(self._using)
        return Response(es.search(index=self._index, body=self.to_dict()), search=self)

    def scan(self):
        """
        Turn the search into a scan search and return a generator that will
        iterate over all the documents matching the query.
        Use the ``params`` method to specify any additional arguments you wish
        to pass to the underlying ``scan`` helper from ``elasticsearch-py`` -
        https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
        """
        es = get_connection(self._using)
        for hit in scan(es, query=self.to_dict(), index=self._index):
            yield hit

    def delete(self):
        """
        delete() executes the query by delegating to delete_by_query()
        """
        es = get_connection(self._using)
        return es.delete_by_query(index=self._index, body=self.to_dict())

    def __repr__(self):
        return json.dumps(self.to_dict(), indent=2)
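# Since from_dict and to_dict mirror each other, a round-trip sketch makes
# the contract explicit (equality holds up to query-clause normalization):
#
#     body = {
#         "query": {"term": {"published": {"value": True}}},
#         "sort": [{"publication_date": {"order": "desc"}}],
#         "size": 5,
#     }
#     s = Search.from_dict(body)
#     assert s.to_dict() == body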
class Search(DSLMixin, Request):

    _type_name = "search"

    def __init__(
        self,
        using: Optional[Elasticsearch] = None,
        index: Optional[Union[str, Tuple[str], List[str]]] = None,
        mappings: Optional[Union[MappingsDict, Mappings]] = None,
        nested_autocorrect: bool = False,
        repr_auto_execute: bool = False,
        document_class: Optional[DocumentMeta] = None,
    ) -> None:
        """
        Search request to elasticsearch.

        :arg using: `Elasticsearch` instance to use
        :arg index: limit the search to index
        :arg mappings: mappings used for query validation
        :arg nested_autocorrect: in case of a missing nested clause, insert it
            automatically
        :arg repr_auto_execute: execute query and display results as a
            dataframe, requires a client to be provided

        All the parameters supplied (or omitted) at creation time can be later
        overridden by methods (`using`, `index` and `mappings` respectively).
        """
        self._sort: List[Union[str, Dict[str, Any]]] = []
        self._source: Any = None
        self._highlight: Dict[str, Any] = {}
        self._highlight_opts: Dict[str, Any] = {}
        self._suggest: Dict[str, Any] = {}
        self._script_fields: Dict[str, Any] = {}
        mappings = _mappings(mappings)
        self._mappings: Optional[Mappings] = mappings
        self._aggs: Aggs = Aggs(
            mappings=mappings, nested_autocorrect=nested_autocorrect
        )
        self._query: Query = Query(
            mappings=mappings, nested_autocorrect=nested_autocorrect
        )
        self._post_filter: Query = Query(
            mappings=mappings, nested_autocorrect=nested_autocorrect
        )
        self._repr_auto_execute: bool = repr_auto_execute
        self._document_class: Optional[DocumentMeta] = document_class
        super(Search, self).__init__(using=using, index=index)

    def query(
        self,
        type_or_query: TypeOrQuery,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        compound_param: str = None,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._query = s._query.query(
            type_or_query,
            insert_below=insert_below,
            on=on,
            mode=mode,
            compound_param=compound_param,
            **body,
        )
        return s

    query.__doc__ = Query.query.__doc__

    def bool(
        self,
        must: Optional[SingleOrMultipleQueryClause] = None,
        should: Optional[SingleOrMultipleQueryClause] = None,
        must_not: Optional[SingleOrMultipleQueryClause] = None,
        filter: Optional[SingleOrMultipleQueryClause] = None,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._query = s._query.bool(
            must=must,
            should=should,
            filter=filter,
            must_not=must_not,
            insert_below=insert_below,
            on=on,
            mode=mode,
            **body,
        )
        return s

    bool.__doc__ = Query.bool.__doc__

    def filter(
        self,
        type_or_query: TypeOrQuery,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        bool_body: ClauseBody = None,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._query = s._query.filter(
            type_or_query,
            insert_below=insert_below,
            on=on,
            mode=mode,
            bool_body=bool_body,
            **body,
        )
        return s

    filter.__doc__ = Query.filter.__doc__

    def must_not(
        self,
        type_or_query: TypeOrQuery,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        bool_body: ClauseBody = None,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._query = s._query.must_not(
            type_or_query,
            insert_below=insert_below,
            on=on,
            mode=mode,
            bool_body=bool_body,
            **body,
        )
        return s

    must_not.__doc__ = Query.must_not.__doc__

    def should(
        self,
        type_or_query: TypeOrQuery,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        bool_body: ClauseBody = None,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._query = s._query.should(
            type_or_query,
            insert_below=insert_below,
            on=on,
            mode=mode,
            bool_body=bool_body,
            **body,
        )
        return s

    should.__doc__ = Query.should.__doc__

    def must(
        self,
        type_or_query: TypeOrQuery,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        bool_body: ClauseBody = None,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._query = s._query.must(
            type_or_query,
            insert_below=insert_below,
            on=on,
            mode=mode,
            bool_body=bool_body,
            **body,
        )
        return s

    must.__doc__ = Query.must.__doc__

    def exclude(
        self,
        type_or_query: TypeOrQuery,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        bool_body: ClauseBody = None,
        **body: Any,
    ) -> "Search":
        """``must_not`` clause, wrapped in filter context."""
        s = self._clone()
        s._query = s._query.filter(
            Bool(must_not=Query._q(type_or_query=type_or_query, **body)),
            insert_below=insert_below,
            on=on,
            mode=mode,
            bool_body=bool_body,
        )
        return s

    def post_filter(
        self,
        type_or_query: TypeOrQuery,
        insert_below: Optional[QueryName] = None,
        on: Optional[QueryName] = None,
        mode: InsertionModes = ADD,
        compound_param: str = None,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._post_filter = s._post_filter.query(
            type_or_query=type_or_query,
            insert_below=insert_below,
            on=on,
            mode=mode,
            compound_param=compound_param,
            **body,
        )
        return s

    def agg(
        self,
        name: AggName,
        type_or_agg: Optional[TypeOrAgg] = None,
        insert_below: Optional[AggName] = None,
        at_root: bool_ = False,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._aggs = s._aggs.agg(
            name,
            type_or_agg=type_or_agg,
            insert_below=insert_below,
            at_root=at_root,
            **body,
        )
        return s

    agg.__doc__ = Aggs.agg.__doc__

    def aggs(
        self,
        aggs: Union[AggsDictOrNode, "Aggs"],
        insert_below: Optional[AggName] = None,
        at_root: bool_ = False,
    ) -> "Search":
        s = self._clone()
        s._aggs = s._aggs.aggs(aggs, insert_below=insert_below, at_root=at_root)
        return s

    aggs.__doc__ = Aggs.aggs.__doc__

    def groupby(
        self,
        name: AggName,
        type_or_agg: Optional[TypeOrAgg] = None,
        insert_below: Optional[AggName] = None,
        at_root: bool_ = False,
        **body: Any,
    ) -> "Search":
        s = self._clone()
        s._aggs = s._aggs.groupby(
            name,
            type_or_agg=type_or_agg,
            insert_below=insert_below,
            at_root=at_root,
            **body,
        )
        return s

    groupby.__doc__ = Aggs.groupby.__doc__

    def __iter__(self) -> Iterator[Hit]:
        """
        Iterate over the hits. Returns an iterable of ``pandagg.response.Hit``.
        """
        return iter(self.execute())

    def __getitem__(self, n: Union[slice, List, int]) -> "Search":
        """
        Support slicing the `Search` instance for pagination.

        Slicing equates to the from/size parameters. E.g.::

            s = Search().query(...)[0:25]

        is equivalent to::

            s = Search().query(...).params(from=0, size=25)
        """
        s = self._clone()
        if isinstance(n, slice):
            # If negative slicing, abort.
            if n.start and n.start < 0 or n.stop and n.stop < 0:
                raise ValueError("Search does not support negative slicing.")
            # Elasticsearch won't get all results so we default to size: 10 if
            # stop not given.
            s._params["from"] = n.start or 0
            s._params["size"] = n.stop - (n.start or 0) if n.stop is not None else 10
            return s
        if isinstance(n, list):
            return s.source(includes=n)
        # This is an index lookup, equivalent to slicing by [n:n+1].
        # If negative index, abort.
        if n < 0:
            raise ValueError("Search does not support negative indexing.")
        s._params["from"] = n
        s._params["size"] = 1
        return s

    def size(self, size: int) -> "Search":
        """
        Equivalent to::

            s = Search().params(size=size)
        """
        s = self._clone()
        s._params["size"] = size
        return s

    @classmethod
    def from_dict(cls, d: Dict) -> "Search":
        """
        Construct a new `Search` instance from a raw dict containing the search
        body. Useful when migrating from raw dictionaries.

        Example::

            s = Search.from_dict({
                "query": {
                    "bool": {
                        "must": [...]
                    }
                },
                "aggs": {...}
            })
            s = s.filter('term', published=True)
        """
        s = cls()
        s.update_from_dict(d)
        return s

    def _clone(self) -> "Search":
        """
        Return a clone of the current search request. Performs a shallow copy
        of all the underlying objects. Used internally by most state modifying
        APIs.
        """
        s = Search(using=self._using, index=self._index, mappings=self._mappings)
        s._params = self._params.copy()
        s._sort = self._sort[:]
        s._source = copy.copy(self._source) if self._source is not None else None
        s._highlight = self._highlight.copy()
        s._highlight_opts = self._highlight_opts.copy()
        s._suggest = self._suggest.copy()
        s._script_fields = self._script_fields.copy()
        s._aggs = self._aggs.clone()
        s._query = self._query.clone()
        s._post_filter = self._post_filter.clone()
        s._mappings = None if self._mappings is None else self._mappings.clone()
        s._repr_auto_execute = self._repr_auto_execute
        s._document_class = self._document_class
        return s

    def update_from_dict(self, d: Dict) -> "Search":
        """
        Apply options from a serialized body to the current instance. Modifies
        the object in-place. Used mostly by ``from_dict``.
        """
        d = d.copy()
        if "query" in d:
            self._query = Query(d.pop("query"))
        if "post_filter" in d:
            self._post_filter = Query(d.pop("post_filter"))
        aggs = d.pop("aggs", d.pop("aggregations", {}))
        if aggs:
            self._aggs = Aggs(aggs)
        if "sort" in d:
            self._sort = d.pop("sort")
        if "_source" in d:
            self._source = d.pop("_source")
        if "highlight" in d:
            high = d.pop("highlight").copy()
            self._highlight = high.pop("fields")
            self._highlight_opts = high
        if "suggest" in d:
            self._suggest = d.pop("suggest")
            if "text" in self._suggest:
                text = self._suggest.pop("text")
                for s in self._suggest.values():
                    s.setdefault("text", text)
        if "script_fields" in d:
            self._script_fields = d.pop("script_fields")
        self._params.update(d)
        return self

    def script_fields(self, **kwargs: Any) -> "Search":
        """
        Define script fields to be calculated on hits. See
        https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-script-fields.html
        for more details.

        Example::

            s = Search()
            s = s.script_fields(times_two="doc['field'].value * 2")
            s = s.script_fields(
                times_three={
                    'script': {
                        'inline': "doc['field'].value * params.n",
                        'params': {'n': 3}
                    }
                }
            )
        """
        s = self._clone()
        for name in kwargs:
            if isinstance(kwargs[name], str):
                kwargs[name] = {"script": kwargs[name]}
        s._script_fields.update(kwargs)
        return s

    def source(
        self,
        fields: Union[str, List[str], Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> "Search":
        """
        Selectively control how the _source field is returned.

        :arg fields: wildcard string, array of wildcards, or dictionary of
            includes and excludes

        If ``fields`` is None, the entire document will be returned for each
        hit. If fields is a dictionary with keys of 'includes' and/or
        'excludes' the fields will be either included or excluded
        appropriately.

        Calling this multiple times with the same named parameter will
        override the previous values with the new ones.
        Example::

            s = Search()
            s = s.source(includes=['obj1.*'], excludes=["*.description"])

            s = Search()
            s = s.source(includes=['obj1.*']).source(excludes=["*.description"])
        """
        s = self._clone()
        if fields and kwargs:
            raise ValueError("You cannot specify fields and kwargs at the same time.")
        if fields is not None:
            s._source = fields
            return s
        if kwargs and not isinstance(s._source, dict):
            s._source = {}
        for key, value in kwargs.items():
            if value is None:
                try:
                    del s._source[key]
                except KeyError:
                    pass
            else:
                s._source[key] = value
        return s

    def sort(self, *keys: Union[str, Dict[str, Any]]) -> "Search":
        """
        Add sorting information to the search request. If called without
        arguments it will remove all sort requirements. Otherwise it will
        replace them. Acceptable arguments are::

            'some.field'
            '-some.other.field'
            {'different.field': {'any': 'dict'}}

        so for example::

            s = Search().sort(
                'category',
                '-title',
                {"price" : {"order" : "asc", "mode" : "avg"}}
            )

        will sort by ``category``, ``title`` (in descending order) and
        ``price`` in ascending order using the ``avg`` mode.

        The API returns a copy of the Search object and can thus be chained.
        """
        s = self._clone()
        s._sort = []
        for k in keys:
            if isinstance(k, str) and k.startswith("-"):
                if k[1:] == "_score":
                    raise ValueError("Sorting by `-_score` is not allowed.")
                k = {k[1:]: {"order": "desc"}}
            s._sort.append(k)
        return s

    def highlight_options(self, **kwargs: Any) -> "Search":
        """
        Update the global highlighting options used for this request. For
        example::

            s = Search()
            s = s.highlight_options(order='score')
        """
        s = self._clone()
        s._highlight_opts.update(kwargs)
        return s

    def highlight(self, *fields: str, **kwargs: Any) -> "Search":
        """
        Request highlighting of some fields. All keyword arguments passed in
        will be used as parameters for all the fields in the ``fields``
        parameter. Example::

            Search().highlight('title', 'body', fragment_size=50)

        will produce the equivalent of::

            {
                "highlight": {
                    "fields": {
                        "body": {"fragment_size": 50},
                        "title": {"fragment_size": 50}
                    }
                }
            }

        If you want to have different options for different fields you can
        call ``highlight`` twice::

            Search().highlight('title', fragment_size=50).highlight('body', fragment_size=100)

        which will produce::

            {
                "highlight": {
                    "fields": {
                        "body": {"fragment_size": 100},
                        "title": {"fragment_size": 50}
                    }
                }
            }
        """
        s = self._clone()
        for f in fields:
            s._highlight[f] = kwargs
        return s

    def suggest(self, name: str, text: str, **kwargs: Any) -> "Search":
        """
        Add a suggestions request to the search.

        :arg name: name of the suggestion
        :arg text: text to suggest on

        All keyword arguments will be added to the suggestions body. For
        example::

            s = Search()
            s = s.suggest('suggestion-1', 'Elasticsearch', term={'field': 'body'})
        """
        s = self._clone()
        s._suggest[name] = {"text": text}
        s._suggest[name].update(kwargs)
        return s

    def to_dict(self, count: bool_ = False, **kwargs: Any) -> SearchDict:
        """
        Serialize the search into the dictionary that will be sent over as the
        request's body.

        :arg count: a flag to specify if we are interested in a body for count
            - no aggregations, no pagination bounds etc.

        All additional keyword arguments will be included into the dictionary.
""" d: SearchDict = {} if self._query: dq = self._query.to_dict() if dq: d["query"] = dq # count request doesn't care for sorting and other things if not count: if self._post_filter: pfd = self._post_filter.to_dict() if pfd: d["post_filter"] = pfd if self._aggs: d["aggs"] = self._aggs.to_dict() if self._sort: d["sort"] = self._sort # query params are not typed in search dict d.update(self._params) # type: ignore if self._source not in (None, {}): d["_source"] = self._source if self._highlight: highlights: Dict[str, Any] = {"fields": self._highlight} highlights.update(self._highlight_opts) d["highlight"] = highlights if self._suggest: d["suggest"] = self._suggest if self._script_fields: d["script_fields"] = self._script_fields # TODO: check if those kwargs are really useful d.update(kwargs) # type: ignore return d def count(self) -> int: """ Return the number of hits matching the query and filters. Note that only the actual number is returned. """ es = self._get_connection() d = self.to_dict(count=True) return es.count(index=self._index, body=d)["count"] def execute(self) -> SearchResponse: """ Execute the search and return an instance of ``Response`` wrapping all the data. """ es = self._get_connection() raw_data = es.search(index=self._index, body=self.to_dict()) return SearchResponse(data=raw_data, _search=self) # type: ignore def scan_composite_agg(self, size: int) -> Iterator[BucketDict]: """Iterate over the whole aggregation composed buckets, yields buckets.""" s: Search = self._clone().size(0) s._aggs = s._aggs.as_composite(size=size) a_name, _ = s._aggs.get_composition_supporting_agg() r: SearchResponse = s.execute() buckets: List[BucketDict] = r.aggregations.data[ a_name][ # type: ignore "buckets"] after_key: AfterKey = r.aggregations.data[a_name][ "after_key"] # type: ignore init: bool = True while init or len(buckets) == size: init = False s._aggs = s._aggs.as_composite(size=size, after=after_key) r = s.execute() agg_clause_response = r.aggregations.data[a_name] buckets = agg_clause_response["buckets"] # type: ignore for bucket in buckets: yield bucket if "after_key" in agg_clause_response: after_key = agg_clause_response["after_key"] # type: ignore else: break def scan_composite_agg_at_once(self, size: int) -> Aggregations: """Iterate over the whole aggregation composed buckets (converting Aggs into composite agg if possible), and return all buckets at once in a Aggregations instance. """ all_buckets = list(self.scan_composite_agg(size=size)) s: Search = self._clone().size(0) s._aggs = s._aggs.as_composite(size=size) agg_name: AggName agg_name, _ = s._aggs.get_composition_supporting_agg() # type: ignore # artificially merge all buckets as if they were returned in a single query return Aggregations(_search=s, data={agg_name: { "buckets": all_buckets }}) def scan(self) -> Iterator[Hit]: """ Turn the search into a scan search and return a generator that will iterate over all the documents matching the query. 
        Use the ``params`` method to specify any additional arguments you wish
        to pass to the underlying ``scan`` helper from ``elasticsearch-py`` -
        https://elasticsearch-py.readthedocs.io/en/master/helpers.html#elasticsearch.helpers.scan
        """
        es = self._get_connection()
        for hit in scan(es, query=self.to_dict(), index=self._index):
            yield Hit(hit, _document_class=self._document_class)

    def delete(self) -> DeleteByQueryResponse:
        """
        delete() executes the query by delegating to delete_by_query()
        """
        es = self._get_connection()
        return es.delete_by_query(index=self._index, body=self.to_dict())  # type: ignore

    def __eq__(self, other: Any) -> bool_:
        return (
            isinstance(other, Search)
            and other._index == self._index
            and other.to_dict() == self.to_dict()
        )

    def _auto_execution_df_result(self) -> pd.DataFrame:
        try:
            import pandas as pd  # noqa
        except ImportError:
            raise ImportError("repr_auto_execute requires pandas dependency")
        if self._aggs.to_dict():
            # hits are not necessary to display aggregation results
            r = self.size(0).execute()
            return r.aggregations.to_dataframe()
        r = self.execute()
        return r.hits.to_dataframe()

    def __repr__(self) -> str:
        # inspired by eland (https://github.com/elastic/eland/blob/master/eland/dataframe.py#L471):
        # executing the search at __repr__ time allows a more interactive experience
        if not self._repr_auto_execute:
            return json.dumps(self.to_dict(), indent=2)
        return self._auto_execution_df_result().__repr__()

    def _repr_html_(self) -> Optional[str]:
        if not self._repr_auto_execute:
            return None
        return self._auto_execution_df_result()._repr_html_()
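# Usage sketch for the composite pagination above ("client", index and field
# names are placeholders):
#
#     s = Search(using=client, index="movies").groupby(
#         "genres", "terms", field="genres"
#     )
#     for bucket in s.scan_composite_agg(size=500):
#         ...  # each bucket: {"key": {"genres": ...}, "doc_count": ...}
#
#     # or materialize all pages at once and post-process as a DataFrame:
#     df = s.scan_composite_agg_at_once(size=500).to_dataframe()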
def test_normalize_buckets(self):
    my_agg = Aggs(sample.EXPECTED_AGG_QUERY, mappings=MAPPINGS)
    response = Aggregations(
        data=sample.ES_AGG_RESPONSE, _search=Search().aggs(my_agg)
    ).to_normalized()
    self.assertEqual(ordered(response), ordered(sample.EXPECTED_NORMALIZED_RESPONSE))