def get_all_terms(self, args):
    """ Performs a terms aggregation for each field to get every existing term.

    Connects to Elasticsearch using the connection settings in ``self.rules``
    and stores the client on ``self.es``. For every entry of ``self.fields``
    the baseline window (``terms_window_size``, default 30 days, ending at
    ``args.start`` when given, otherwise now) is queried in chunks of
    ``window_step_size`` (default 1 day). The terms found are accumulated in
    ``self.seen_values`` and de-duplicated at the end.

    A field given as a list is treated as a composite key: one nested terms
    aggregation is issued per part and results are stored under
    ``tuple(field)``.

    :param args: Optional object with a ``start`` attribute; when it is
                 truthy it is converted with ``ts_to_dt`` and used as the end
                 of the baseline window.
    :return: None. Results are written to ``self.seen_values``.
    """
    self.es = Elasticsearch(
        host=self.rules['es_host'],
        port=self.rules['es_port'],
        timeout=self.rules.get('es_conn_timeout', 50),
        send_get_body_as=self.rules.get('send_get_body_as', 'GET')
    )

    window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30}))
    step = datetime.timedelta(**self.rules.get('window_step_size', {'days': 1}))
    max_terms = 2147483647  # Integer.MAX_VALUE
    timestamp_field = self.rules['timestamp_field']

    if args and args.start:
        end = ts_to_dt(args.start)
    else:
        end = ts_now()
    start = end - window_size

    for field in self.fields:
        # Composite keys are lists; they are stored under a tuple since a tuple
        # can be hashed and used in dictionary lookups
        is_composite = isinstance(field, list)
        seen_key = tuple(field) if is_composite else field
        self.seen_values.setdefault(seen_key, [])

        # Build a fresh aggregation for every field, from the innermost level outwards.
        # Re-using one mutable template across fields would leak the nested
        # sub-aggregations of a composite key into the queries of later fields.
        sub_fields = field if is_composite else [field]
        nested_agg = None
        for sub_field in reversed(sub_fields):
            terms_agg = {'terms': {'field': add_raw_postfix(sub_field), 'size': max_terms}}
            if nested_agg is not None:
                terms_agg['aggs'] = {'values': nested_agg}
            nested_agg = terms_agg

        # time_filter is referenced by the query, so updating it in place below
        # moves the queried time range without rebuilding the query
        time_filter = {}
        query = {
            'aggs': {
                'filtered': {
                    'filter': {'bool': {'must': [{'range': time_filter}]}},
                    'aggs': {'values': nested_agg},
                }
            }
        }

        # Query the entire time range in small chunks
        tmp_start = start
        tmp_end = min(start + step, end)
        while tmp_start < end:
            time_filter[timestamp_field] = {'lt': dt_to_ts(tmp_end), 'gte': dt_to_ts(tmp_start)}

            if self.rules.get('use_strftime_index'):
                index = format_index(self.rules['index'], tmp_start, tmp_end)
            else:
                index = self.rules['index']

            res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s')
            # A response without aggregations adds nothing; the (possibly empty)
            # entry for this field already exists in seen_values
            if 'aggregations' in res:
                buckets = res['aggregations']['filtered']['values']['buckets']
                if is_composite:
                    # We need to walk down the hierarchy and obtain the value at each level
                    for bucket in buckets:
                        self.seen_values[seen_key] += self.flatten_aggregation_hierarchy(bucket)
                else:
                    self.seen_values[seen_key] += [bucket['key'] for bucket in buckets]

            # A zero-length step would never advance; stop instead of looping forever
            if tmp_start == tmp_end:
                break
            tmp_start = tmp_end
            tmp_end = min(tmp_start + step, end)

    # Iterate over a snapshot since entries are reassigned inside the loop
    for key, values in list(self.seen_values.items()):
        if not values:
            if isinstance(key, tuple):
                # If we don't have any results, it could either be because of the absence of any baseline data
                # OR it may be because the composite key contained a non-primitive type. Either way, give the
                # end-users a heads up to help them debug what might be going on.
                elastalert_logger.warning((
                    'No results were found from all sub-aggregations. This can either indicate that there is '
                    'no baseline data OR that a non-primitive field was used in a composite key.'
                ))
            else:
                elastalert_logger.info('Found no values for %s' % (key,))
            continue
        unique_values = list(set(values))
        self.seen_values[key] = unique_values
        elastalert_logger.info('Found %s unique values for %s' % (len(unique_values), key))
def get_all_terms(self, args):
    """ Performs a terms aggregation for each field to get every existing term.

    Creates the Elasticsearch client from ``self.rules`` via
    ``elasticsearch_client`` and stores it on ``self.es``. For every entry of
    ``self.fields`` the baseline window (``terms_window_size``, default 30
    days, ending at ``args.start`` when given, otherwise now) is queried in
    chunks of ``window_step_size`` (default 1 day). The terms found are
    accumulated in ``self.seen_values`` and de-duplicated at the end.

    A field given as a list is treated as a composite key: one nested terms
    aggregation is issued per part and results are stored under
    ``tuple(field)``.

    :param args: Optional object with a ``start`` attribute; when it is
                 truthy it is converted with ``ts_to_dt`` and used as the end
                 of the baseline window.
    :return: None. Results are written to ``self.seen_values``.
    """
    self.es = elasticsearch_client(self.rules)

    window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30}))
    step = datetime.timedelta(**self.rules.get('window_step_size', {'days': 1}))
    max_terms = 2147483647  # Integer.MAX_VALUE
    timestamp_field = self.rules['timestamp_field']

    if args and args.start:
        end = ts_to_dt(args.start)
    else:
        end = ts_now()
    start = end - window_size

    for field in self.fields:
        # Composite keys are lists; they are stored under a tuple since a tuple
        # can be hashed and used in dictionary lookups
        is_composite = isinstance(field, list)
        seen_key = tuple(field) if is_composite else field
        self.seen_values.setdefault(seen_key, [])

        # Build a fresh aggregation for every field, from the innermost level outwards.
        # Re-using one mutable template across fields would leak the nested
        # sub-aggregations of a composite key into the queries of later fields.
        sub_fields = field if is_composite else [field]
        nested_agg = None
        for sub_field in reversed(sub_fields):
            terms_agg = {'terms': {'field': add_raw_postfix(sub_field), 'size': max_terms}}
            if nested_agg is not None:
                terms_agg['aggs'] = {'values': nested_agg}
            nested_agg = terms_agg

        # time_filter is referenced by the query, so updating it in place below
        # moves the queried time range without rebuilding the query
        time_filter = {}
        query = {
            'aggs': {
                'filtered': {
                    'filter': {'bool': {'must': [{'range': time_filter}]}},
                    'aggs': {'values': nested_agg},
                }
            }
        }

        # Query the entire time range in small chunks
        tmp_start = start
        tmp_end = min(start + step, end)
        while tmp_start < end:
            time_filter[timestamp_field] = {'lt': dt_to_ts(tmp_end), 'gte': dt_to_ts(tmp_start)}

            if self.rules.get('use_strftime_index'):
                index = format_index(self.rules['index'], tmp_start, tmp_end)
            else:
                index = self.rules['index']

            res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s')
            # A response without aggregations adds nothing; the (possibly empty)
            # entry for this field already exists in seen_values
            if 'aggregations' in res:
                buckets = res['aggregations']['filtered']['values']['buckets']
                if is_composite:
                    # We need to walk down the hierarchy and obtain the value at each level
                    for bucket in buckets:
                        self.seen_values[seen_key] += self.flatten_aggregation_hierarchy(bucket)
                else:
                    self.seen_values[seen_key] += [bucket['key'] for bucket in buckets]

            # A zero-length step would never advance; stop instead of looping forever
            if tmp_start == tmp_end:
                break
            tmp_start = tmp_end
            tmp_end = min(tmp_start + step, end)

    # Iterate over a snapshot since entries are reassigned inside the loop
    for key, values in list(self.seen_values.items()):
        if not values:
            if isinstance(key, tuple):
                # If we don't have any results, it could either be because of the absence of any baseline data
                # OR it may be because the composite key contained a non-primitive type. Either way, give the
                # end-users a heads up to help them debug what might be going on.
                elastalert_logger.warning((
                    'No results were found from all sub-aggregations. This can either indicate that there is '
                    'no baseline data OR that a non-primitive field was used in a composite key.'
                ))
            else:
                elastalert_logger.info('Found no values for %s' % (key,))
            continue
        unique_values = list(set(values))
        self.seen_values[key] = unique_values
        elastalert_logger.info('Found %s unique values for %s' % (len(unique_values), key))