def __get_query_agg_percentiles(cls, field, agg_id=None):
    """
    Create an es_dsl aggregation object for getting the percentile values
    of a field. In general this is used to get the median (0.5) percentile.

    :param field: field from which to get the percentile values
    :return: a tuple with the aggregation id and es_dsl aggregation object. Ex:
    {
      "percentiles": {
        "field": <field>
      }
    }
    """
    if not agg_id:
        agg_id = cls.AGGREGATION_ID
    query_agg = A("percentiles", field=field)
    return (agg_id, query_agg)
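# A minimal usage sketch (not part of the original code) showing how the
# percentiles aggregation built above could be attached to a search and the
# median read back. The index name "my-index" and field "response_time" are
# hypothetical; the execute lines are commented out since they need a cluster.
from elasticsearch_dsl import A, Search

s = Search(index="my-index").extra(size=0)
s.aggs.bucket("agg_1", A("percentiles", field="response_time"))
# response = s.execute()
# median = response.aggregations.agg_1.values["50.0"]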
def _get_aggregation(self, **extra):
    params = {
        'path': self.nestedfield,
        'aggs': {
            'val': {
                'terms': {
                    'field': self.field,
                    'size': 40,
                    'min_doc_count': 1
                }
            }
        }
    }
    params.update(self.kwargs)
    params.update(extra)
    return A('nested', **params)
def get_cardinality(self, field=None):
    """
    Create a cardinality aggregation object and add it to the aggregation dict

    :param field: the field present in the index that is to be aggregated
    :returns: self, which allows the method to be chainable with the other methods
    """
    if not field:
        raise AttributeError("Please provide field to apply aggregation to!")
    agg = A("cardinality", field=field,
            precision_threshold=self.precision_threshold)
    self.aggregations['cardinality_' + field] = agg
    return self
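# A hedged sketch of what the stored aggregation amounts to; the field name
# "author_uuid" and threshold 3000 are illustrative, not from the original.
from elasticsearch_dsl import A

agg = A("cardinality", field="author_uuid", precision_threshold=3000)
# A() objects serialize to the raw ES aggregation body:
assert agg.to_dict() == {
    "cardinality": {"field": "author_uuid", "precision_threshold": 3000}
}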
def organisation_pids(self):
    """Get organisations pids."""
    organisations = set()
    search = DocumentsSearch().filter(
        'term', contribution__agent__pid=self.pid)
    size = current_app.config.get('RERO_ILS_AGGREGATION_SIZE').get(
        'organisations')
    agg = A('terms', field='holdings.organisation.organisation_pid', size=size)
    search.aggs.bucket('organisation', agg)
    results = search.execute()
    for result in results.aggregations.organisation.buckets:
        if result.doc_count:
            organisations.add(result.key)
    return list(organisations)
def infoFirewallActions(self):
    s = Search(index='ossim-osdepym*')
    s = s.query('match_all')
    s = s.filter('range', log_date={
        "gte": 1554087600000,
        "lte": 1556679599999
    })
    s.aggs.bucket(
        'actions',
        A('terms', field='action.keyword', size=10, order={"_count": "desc"}))
    return s.execute().aggregations.actions.buckets
def __get_query_agg_cardinality(cls, field, agg_id=None):
    """
    Create an es_dsl aggregation object for getting the approximate count
    of distinct values of a field.

    :param field: field from which to get the count of distinct values
    :return: a tuple with the aggregation id and es_dsl aggregation object. Ex:
    {
      "cardinality": {
        "field": <field>,
        "precision_threshold": 3000
      }
    }
    """
    if not agg_id:
        agg_id = cls.AGGREGATION_ID
    query_agg = A("cardinality", field=field,
                  precision_threshold=cls.ES_PRECISION)
    return (agg_id, query_agg)
def test_get_extended_stats(self):
    """
    Test the extended statistics aggregation
    """
    field = self.field1
    # without field param
    with self.assertRaises(AttributeError):
        self.Query_test_object.get_extended_stats()
    # with field param
    self.Query_test_object.get_extended_stats(field)
    test_agg = A("extended_stats", field=field)
    agg_name, agg = self.Query_test_object.aggregations.popitem()
    self.assertEqual('extended_stats_' + field, agg_name)
    self.assertEqual(agg, test_agg)
def get_loans_by_item_pids(item_pids):
    """Get loans for the given item pid list."""
    states = current_app.config['CIRCULATION_STATES_LOAN_ACTIVE']
    loan_search = LoansSearch() \
        .filter('terms', state=states) \
        .filter('terms', item_pid__value=item_pids) \
        .source(['pid', 'item_pid.value', 'start_date', 'end_date',
                 'state', '_created'])
    agg = A('terms', field='item_pid.value', size=chunk_size)
    loan_search.aggs.bucket('loans_count', agg)
    loan_search = loan_search.extra(
        collapse={
            'field': 'item_pid.value',
            "inner_hits": {
                "name": "most_recent",
                "size": 1,
                "sort": [{"_created": "desc"}],
            }
        })
    # The default result size for the execute method is 10;
    # we need to set this to the chunk size.
    results = loan_search[0:chunk_size].execute()
    agg_buckets = {}
    for result in results.aggregations.loans_count.buckets:
        agg_buckets[result.key] = result.doc_count
    loans = {}
    for loan_hit in results:
        # get most recent loans
        loan_data = loan_hit.meta.inner_hits.most_recent[0].to_dict()
        item_pid = loan_data['item_pid']['value']
        loans[item_pid] = {
            'loans_count': agg_buckets.get(item_pid, 0),
            'last_transaction_date':
                ciso8601.parse_datetime(loan_data['_created']).date()
        }
        if loan_data.get('state') == LoanState.ITEM_ON_LOAN:
            loans[item_pid]['checkout_date'] = \
                ciso8601.parse_datetime(loan_data['start_date']).date()
            loans[item_pid]['due_date'] = \
                ciso8601.parse_datetime(loan_data['end_date']).date()
    return loans
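# A minimal sketch of the collapse-plus-aggregation pattern used above,
# under stated assumptions (a hypothetical "loans" index and a fixed chunk
# size of 100): the terms aggregation counts every loan per item, while
# `collapse` keeps only the single most recent hit for each item.
from elasticsearch_dsl import A, Search

s = Search(index="loans")
s.aggs.bucket("loans_count", A("terms", field="item_pid.value", size=100))
s = s.extra(collapse={
    "field": "item_pid.value",
    "inner_hits": {"name": "most_recent", "size": 1,
                   "sort": [{"_created": "desc"}]},
})
# results = s[0:100].execute()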
def infoFlowLogs(self):
    s = Search(index='ossim-osdepym*')
    s = s.query('match_all')
    s = s.filter('range', log_date={
        "gte": 1554087600000,
        "lte": 1556679599999
    })
    s.aggs.bucket(
        'users',
        A('date_histogram', field='log_date', interval="30m",
          time_zone="America/Argentina/Buenos_Aires", min_doc_count=1))
    return s.execute().aggregations.users.buckets
def BuildRootTree(self):
    s = Search()
    t = Q('has_parent', type='hostname', query=Q('query_string', query="*"))
    aggs = A('terms', field='AuditType.Generator', size=16)
    s.aggs.bucket('datatypes', aggs)
    query = s.query(t)
    try:
        r = requests.post(
            self.es_host + ":" + self.es_port + self.index +
            self.type_audit_type + '/_search',
            data=json.dumps(query.to_dict()),
            auth=(self.elastic_user, self.elastic_pass),
            verify=False)
    except ConnectionError as e:
        ret = {"connection_error": e.args[0]}
        return ret
    data = [{
        "id": "stackable",
        "parent": "#",
        "text": "Stackable Data",
        "type": "root"
    }]
    i = [
        'w32services', 'w32tasks', 'w32scripting-persistence', 'w32prefetch',
        'w32network-dns', 'urlhistory', 'filedownloadhistory'
    ]
    for x in r.json()['aggregations']['datatypes']['buckets']:
        if x['key'] in i:
            data.append({
                "id": x['key'],
                "parent": "stackable",
                "text": x['key'],
                "children": True,
                "type": "stack"
            })
    return data
def test_nb_sales_by_product_type_filter_product_type_1(elasticsearch_sale):
    # Number of sales, by product type, for product_type_1
    a = A('filter', term={'products.product_type': 'product_type_1'})
    a.bucket(
        'reverse_nested_root',
        'reverse_nested',
    )
    search = get_search()
    search.aggs.bucket(
        'products',
        'nested',
        path='products',
    ).bucket(
        'product_type_1',
        a,
    )
    write_output(search, 'nb_sales_by_product_type_filter_product_type_1')
def extend_elasticsearch_search_with_sub_aggregation(self, search: AccountSearch):
    """
    This template method is called if `self.sub_agg_key` is supplied, in
    order to post-process the query and inject a sub-aggregation on a
    secondary dimension (one that is subordinate to the first agg_key's
    dimension).

    Example: Subtier Agency spending rolled up to Toptier Agency spending
    """
    sub_bucket_count = 1000  # get_number_of_unique_terms_for_accounts(self.filter_query, f"{self.sub_agg_key}")
    size = sub_bucket_count
    shard_size = sub_bucket_count + 100
    if shard_size > 10000:
        raise ForbiddenException(
            "Current filters return too many unique items. Narrow filters to return results or use downloads."
        )
    # Sub-aggregation to append to primary agg
    sub_group_by_sub_agg_key_values = {"field": self.sub_agg_key, "size": size, "shard_size": shard_size}
    sub_group_by_sub_agg_key = A("terms", **sub_group_by_sub_agg_key_values)
    sub_dim_metadata = A(
        "top_hits",
        size=1,
        sort=[{"financial_accounts_by_award.update_date": {"order": "desc"}}],
        _source={"includes": self.sub_top_hits_fields},
    )
    sub_sum_covid_outlay = A(
        "sum",
        field="financial_accounts_by_award.gross_outlay_amount_by_award_cpe",
        script={"source": "doc['financial_accounts_by_award.is_final_balances_for_fy'].value ? _value : 0"},
    )
    sub_sum_covid_obligation = A("sum", field="financial_accounts_by_award.transaction_obligated_amount")
    sub_count_awards_by_dim = A("reverse_nested", **{})
    sub_award_count = A("value_count", field="financial_account_distinct_award_key")
    loan_value = A("sum", field="total_loan_value")
    sub_group_by_sub_agg_key.metric("dim_metadata", sub_dim_metadata).metric(
        "sum_transaction_obligated_amount", sub_sum_covid_obligation
    ).metric("sum_gross_outlay_amount_by_award_cpe", sub_sum_covid_outlay).bucket(
        "count_awards_by_dim", sub_count_awards_by_dim
    ).metric(
        "award_count", sub_award_count
    ).metric(
        "sum_loan_value", loan_value
    )
    # Append sub-agg to primary agg, and include the sub-agg's sum metrics too
    search.aggs[self.agg_group_name]["group_by_dim_agg"].bucket(self.sub_agg_group_name, sub_group_by_sub_agg_key)
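# A reduced, hypothetical sketch of the chaining pattern above (field names
# are illustrative): in elasticsearch_dsl, .metric() returns the parent agg,
# so metric calls can be chained, while .bucket() returns the newly created
# sub-bucket, so subsequent calls attach beneath it.
from elasticsearch_dsl import A

group_by = A("terms", field="agency", size=10)
group_by.metric("total_obligation", A("sum", field="obligated_amount")) \
        .bucket("by_award", A("reverse_nested", **{})) \
        .metric("award_count", A("value_count", field="award_key"))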
def test_scan_aggs_with_multiple_aggs(data_client):
    s = Search(index='flat-git')
    key_aggs = [
        {'files': A('terms', field='files')},
        {'months': {'date_histogram': {'field': 'committed_date', 'interval': 'month'}}},
    ]
    file_list = list(scan_aggs(s, key_aggs))
    assert len(file_list) == 47
def get_loans_count_by_library_for_patron_pid(patron_pid, filter_states=None):
    """Get loans count for patron and aggregate result on library_pid.

    :param patron_pid: The patron pid
    :param filter_states: loan states to filter on
    :return: a dict with library_pid as key, number of loans as value
    """
    filter_states = filter_states or []  # prevent mutable argument warning
    agg = A('terms', field='library_pid')
    search = search_by_patron_item_or_document(
        patron_pid=patron_pid,
        filter_states=filter_states)
    search.aggs.bucket('library', agg)
    search = search[0:0]
    results = search.execute()
    stats = {}
    for result in results.aggregations.library.buckets:
        stats[result.key] = result.doc_count
    return stats
def get_by_id(self, discussion_id, min_il=0, max_il=100):
    """Return a single discussion by discussion_id."""
    search = self._prepare_search() \
        .filter("match", discussion_id=discussion_id)
    search.aggs.bucket('discussions', A('terms', field='discussion_id')) \
        .bucket("unread", "filter", term={"is_unread": True})
    result = search.execute()
    if not result.hits or len(result.hits) < 1:
        return None
    message = self.get_last_message(discussion_id, min_il, max_il, True)
    discussion = DiscussionIndex(discussion_id)
    discussion.total_count = result.hits.total
    discussion.last_message = message
    discussion.unread_count = \
        result.aggregations.discussions.buckets[0].unread.doc_count
    return discussion
def get_percentiles(self, field=None, percents=None):
    """
    Create a percentile aggregation object and add it to the aggregation dict

    :param field: the field present in the index that is to be aggregated
    :param percents: the specific percentiles to be calculated
                     default: [1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
    :returns: self, which allows the method to be chainable with the other methods
    """
    if not field:
        raise AttributeError("Please provide field to apply aggregation to!")
    if not percents:
        percents = [1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
    agg = A("percentiles", field=field, percents=percents)
    self.aggregations['percentiles_' + field] = agg
    return self
def by_period(self, field=None, period=None, timezone=None, start=None, end=None):
    """
    Create a date histogram aggregation using the last added aggregation
    for the current object. Add this date_histogram aggregation into
    self.aggregations.

    :param field: the index field to create the histogram from
    :param period: the interval which elasticsearch supports, ex: "month", "week" and such
    :param timezone: custom timezone
    :param start: custom start date for the date histogram, default: start date under range
    :param end: custom end date for the date histogram, default: end date under range
    :returns: self, which allows the method to be chainable with the other methods
    """
    hist_period = period if period else self.interval_
    time_zone = timezone if timezone else "UTC"
    start_ = start if start else self.start_date
    end_ = end if end else self.end_date
    bounds = self.get_bounds(start_, end_)
    date_field = field if field else "grimoire_creation_date"
    agg_key = "date_histogram_" + date_field
    if agg_key in self.aggregations.keys():
        agg = self.aggregations[agg_key]
    else:
        agg = A("date_histogram", field=date_field, interval=hist_period,
                time_zone=time_zone, min_doc_count=0, **bounds)
    child_agg_counter = self.child_agg_counter_dict[agg_key]
    child_name, child_agg = self.aggregations.popitem()
    agg.metric(child_agg_counter, child_agg)
    self.aggregations[agg_key] = agg
    self.child_agg_counter_dict[agg_key] += 1
    return self
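# A hedged sketch of the kind of aggregation by_period() builds, assuming
# get_bounds() yields an extended_bounds dict (the dates here are
# hypothetical): with min_doc_count=0, extended_bounds forces empty buckets
# across the whole requested range rather than only where documents exist.
from elasticsearch_dsl import A

agg = A("date_histogram", field="grimoire_creation_date", interval="month",
        time_zone="UTC", min_doc_count=0,
        extended_bounds={"min": "2018-01-01", "max": "2018-12-31"})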
def _create_aggregate(self, catalog: CatalogName, filters: MutableFilters,
                      facet_config, agg):
    """
    Creates the aggregation to be used in ElasticSearch.

    :param catalog: The name of the catalog to create the aggregations for
    :param filters: Translated filters from 'files/' endpoint call
    :param facet_config: Configuration for the facets (i.e. facets on which
                         to construct the aggregate) in '{browser: es_key}' form
    :param agg: Current aggregate where this aggregation is occurring.
                Syntax in browser form
    :return: returns an Aggregate object to be used in a Search query
    """
    # Pop filter of current aggregate
    excluded_filter = filters.pop(facet_config[agg], None)
    # Create the appropriate filters
    filter_query = self._create_query(catalog, filters)
    # Create the filter aggregate
    aggregate = A('filter', filter_query)
    # Make an inner aggregate that will contain the terms in question
    _field = f'{facet_config[agg]}.keyword'
    service_config = self.service_config(catalog)
    if agg == 'project':
        _sub_field = service_config.translation['projectId'] + '.keyword'
        aggregate.bucket('myTerms', 'terms', field=_field,
                         size=config.terms_aggregation_size).bucket(
            'myProjectIds', 'terms', field=_sub_field,
            size=config.terms_aggregation_size)
    else:
        aggregate.bucket('myTerms', 'terms', field=_field,
                         size=config.terms_aggregation_size)
    aggregate.bucket('untagged', 'missing', field=_field)
    if agg == 'fileFormat':
        # FIXME: Use of shadow field is brittle
        #        https://github.com/DataBiosphere/azul/issues/2289
        file_size_field = service_config.translation['fileSize'] + '_'
        aggregate.aggs['myTerms'].metric('size_by_type', 'sum', field=file_size_field)
        aggregate.aggs['untagged'].metric('size_by_type', 'sum', field=file_size_field)
    # If the aggregate in question didn't have any filter on the API call,
    # skip it. Otherwise insert the popped value back in.
    if excluded_filter is not None:
        filters[facet_config[agg]] = excluded_filter
    return aggregate
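# A simplified, hypothetical sketch of the faceting pattern above (the query
# and field names are illustrative): a top-level filter aggregation scoped by
# everything except the current facet, a terms bucket for the facet values,
# and a missing bucket for untagged documents.
from elasticsearch_dsl import A, Q

aggregate = A('filter', Q('term', species='human'))
aggregate.bucket('myTerms', 'terms', field='organ.keyword', size=100)
aggregate.bucket('untagged', 'missing', field='organ.keyword')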
def clusters():
    """
    Render the AJAX version of the clustering page.
    Intended to replace display_clusters() after testing.
    """
    person = request.values.get('filter')
    print(person)
    Face._index.refresh()
    total = Face.search().count()
    named = Face.search().filter("exists", field="person").count()
    status = "{:.1%} ({} out of {}) faces are named. Clusters count: {}".format(
        named / total, named, total, Cluster.search().count())
    a = A("terms", field="person.raw", size=10000)
    ps = Search()
    ps.aggs.bucket("persons", a)
    psr = ps.execute()
    persons = [b.key for b in psr.aggs['persons']]
    if person:
        s = Cluster.search().filter("prefix", person=person).sort("-face_count")
        results = s[0:10000].execute()
    else:
        s = Cluster.search().exclude("exists", field="person")
        s.query = FunctionScore(
            query=s.query,
            functions=[
                SF('random_score', weight=100),
                SF('field_value_factor', field="face_count", weight=1)
            ],
            score_mode="avg",
            boost_mode="replace")
        results = s[0:50].execute()
    return render_template('clusters.html', clusters=results,
                           persons=persons, status=status)
def _get_search_instance(self, **kwargs):
    '''
    Instantiate and return an Elasticsearch search instance.

    Set the maximum number of aggregation buckets to 15. In practical
    terms, bucket size is the number of siblings at a given taxonomy
    level. As of 7/17/17, we have seven siblings on level_0 and an
    average of nine on level_1. Electronics currently has 14 children,
    i.e. buckets, so we'll set our bucket size to 15.
    '''
    # set the max number of buckets to retrieve
    agg_level = kwargs.get('agg_level')
    BUCKET_SIZE = 15
    s = Search(index='article')
    agg_on_field = agg_level + '.raw'
    cat_lv_dict = {'field': agg_on_field}
    a = A('terms', size=BUCKET_SIZE, **cat_lv_dict)
    s.aggs.bucket('category', a)
    return s, BUCKET_SIZE
def test_get_aggregation(self):
    expected = A({
        'nested': {
            'path': 'some_field'
        },
        'aggs': {
            'min_start': {
                'min': {
                    'field': 'some_field.start'
                }
            },
            'max_end': {
                'max': {
                    'field': 'some_field.end'
                }
            }
        }
    })
    assert self.facet.get_aggregation() == expected
def get_queryset(self):
    """Adds aggregations and sets an empty size to return only facets."""
    creator = A("nested", path="creators")
    creator_name = A("terms", field="creators.title.keyword", size=100)
    subject = A("nested", path="terms")
    subject_name = A("terms", field="terms.title.keyword", size=100)
    format = A("terms", field="formats.keyword")
    max_date = A("max", field="dates.end", format="epoch_millis")
    min_date = A("min", field="dates.begin", format="epoch_millis")
    online = A('filter', Q('terms', online=[True]))
    self.search.aggs.bucket('creator', creator).bucket("name", creator_name)
    self.search.aggs.bucket('subject', subject).bucket("name", subject_name)
    self.search.aggs.bucket('format', format)
    self.search.aggs.bucket("max_date", max_date)
    self.search.aggs.bucket("min_date", min_date)
    self.search.aggs.bucket("online", online)
    return (self.search.extra(size=0).query(self.get_structured_query())
            if self.request.GET.get(settings.REST_FRAMEWORK["SEARCH_PARAM"])
            else self.search.extra(size=0))
def _get_download_ids_generator(cls, search: Union[AwardSearch, TransactionSearch], size: int):
    """
    Takes an AwardSearch or TransactionSearch object (that specifies the
    index, filter, and source) and returns a generator that yields lists
    of IDs in chunks of size SIZE.
    """
    max_retries = 10
    total = search.handle_count(retries=max_retries)
    if total is None:
        logger.error("Error retrieving total results. Max number of attempts reached.")
        return
    max_iterations = settings.MAX_DOWNLOAD_LIMIT // size
    req_iterations = (total // size) + 1
    num_iterations = min(max(1, req_iterations), max_iterations)
    # Setting the shard_size below works in this case because we are aggregating on a unique field. Otherwise, this
    # would not work due to the number of records. Other places this is set are in the different spending_by
    # endpoints which are either routed or contain less than 10k unique values, both allowing for the shard
    # size to be manually set to 10k.
    for iteration in range(num_iterations):
        aggregation = A(
            "terms",
            field=cls._source_field,
            include={"partition": iteration, "num_partitions": num_iterations},
            size=size,
            shard_size=size,
        )
        search.aggs.bucket("results", aggregation)
        response = search.handle_execute(retries=max_retries).to_dict()
        if response is None:
            raise Exception("Breaking generator, unable to reach cluster")
        results = []
        for bucket in response["aggregations"]["results"]["buckets"]:
            results.append(bucket["key"])
        yield results
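# A minimal sketch of terms partitioning on its own (the field name and
# sizes are hypothetical): each partition returns a disjoint slice of the
# unique keys, which is what lets the loop above page through all IDs
# without hitting the terms-aggregation size ceiling.
from elasticsearch_dsl import A

partition_agg = A(
    "terms",
    field="award_id",
    include={"partition": 0, "num_partitions": 4},
    size=10000,
    shard_size=10000,
)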
def get_total_results(keyword):
    group_by_agg_key_values = {
        "filters": {
            category: {"terms": {"type": types}}
            for category, types in INDEX_ALIASES_TO_AWARD_TYPES.items()
        }
    }
    aggs = A("filters", **group_by_agg_key_values)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keyword_search": [es_minimal_sanitize(keyword)]}
    )
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("types", aggs)
    response = search.handle_execute()
    if response is not None:
        try:
            return response["aggregations"]["types"]["buckets"]
        except KeyError:
            logger.error("Unexpected Response")
    else:
        logger.error("No Response")
    return None
def get_daily_volume(self, from_date, to_date):
    s = Search(using='operations', index="deex-*")
    s = s.extra(size=0)
    s = s.query('bool', filter=[
        Q('term', operation_type=4),
        Q('range', block_data__block_time={'gte': from_date, 'lte': to_date}),
        Q('term', operation_history__op_object__fill_price__quote__asset_id__keyword=config.CORE_ASSET_ID)
    ])
    a = A('date_histogram', field='block_data.block_time', interval='1d', format='yyyy-MM-dd') \
        .metric('volume', 'sum', field='operation_history.op_object.fill_price.quote.amount')
    s.aggs.bucket('volume_over_time', a)
    response = s.execute()
    daily_volumes = []
    for daily_volume in response.aggregations.volume_over_time.buckets:
        daily_volumes.append({
            'date': daily_volume.key_as_string,
            'volume': daily_volume.volume.value
        })
    return daily_volumes
def get(self):
    """
    Answer to GET requests. Used for `visualize` tab instance search typeahead.
    Write a list of all instance names in Elasticsearch.
    """
    s = Result.search()
    a = A("terms", field="instance_name", size=0)  # set size to 0 so all results are returned
    s.aggs.bucket("unique_instances", a)
    s = s.params(search_type="count")
    res = s.execute()
    names = [x["key"] for x in res.aggregations["unique_instances"]["buckets"]]
    return self.write(json.dumps(names))
def applyAggregations(self):
    rootAgg = self.query.aggs.bucket('editions', A('nested', path='editions'))
    lastAgg = rootAgg
    for i, agg in enumerate(self.appliedAggregations):
        currentAgg = 'edition_filter_{}'.format(i)
        lastAgg = lastAgg.bucket(currentAgg, agg)
    lastAgg.bucket('lang_parent', 'nested', path='editions.languages')\
        .bucket(
            'languages', 'terms',
            **{'field': 'editions.languages.language', 'size': 200}
        )\
        .bucket('editions_per', 'reverse_nested')
    lastAgg.bucket(
        'formats', 'terms', **{'field': 'editions.formats', 'size': 10}
    )\
        .bucket('editions_per', 'reverse_nested')
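# A condensed, hypothetical sketch of the nested/reverse_nested pairing used
# above (index and field names are illustrative): the terms bucket counts
# nested edition documents, and the reverse_nested bucket re-counts the
# matching parent (work) documents instead.
from elasticsearch_dsl import A, Search

s = Search(index="works")
s.aggs.bucket("editions", A("nested", path="editions")) \
    .bucket("formats", "terms", field="editions.formats", size=10) \
    .bucket("editions_per", "reverse_nested")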
def test_by_period_without_args(self):
    """
    Test the date histogram aggregation with no parameters
    """
    test_agg = A("date_histogram", field=self.date_field1, interval=self.interval,
                 time_zone=self.timezone, min_doc_count=0, **{})
    test_agg.metric(0, "cardinality", field=self.field1,
                    precision_threshold=self.precision_threshold)
    self.Query_test_object.get_cardinality(self.field1).by_period()
    agg_name, agg = self.Query_test_object.aggregations.popitem()
    self.assertEqual(agg, test_agg, msg='\n{0}\n{1}'.format(agg, test_agg))
def test_by_organizations(self):
    """
    Test nested aggregation wrt author organizations.
    Just checking if the aggregation exists in the dict, for now,
    because there is no org field in the 'git' data source.
    """
    test_agg = A("terms", field="author_domain", missing="others", size=self.size)
    test_agg.metric(0, "cardinality", field=self.field1,
                    precision_threshold=self.precision_threshold)
    self.Query_test_object.get_cardinality(self.field1).by_organizations("author_domain")
    agg_name, agg = self.Query_test_object.aggregations.popitem()
    self.assertEqual(agg, test_agg, msg='\n{0}\n{1}'.format(agg, test_agg))