def search_fuzzy(request=None, project_id=None):
    project_id = project_id if project_id \
        else json.loads(request.session['project_id'])
    index_name = elastic_cache_key(project_id, 'ec2')
    ebs_index_name = elastic_cache_key(project_id, 'ebs')
    elb_index_name = elastic_cache_key(project_id, 'elb')
    eip_index_name = elastic_cache_key(project_id, 'eip')
    vpc_index_name = elastic_cache_key(project_id, 'vpc')
    subnet_index_name = elastic_cache_key(project_id, 'subnet')
    security_group_index_name = elastic_cache_key(project_id, 'security_group')
    st = request.GET.get('st', None)
    client = Elasticsearch(hosts=settings.ELASTIC_SEARCH_NODES)
    query = {
        "query": {
            "query_string": {
                "fields": ["title"],
                "query": "*" + st + "*",
            }
        },
    }
    indices = [index_name, ebs_index_name, elb_index_name, eip_index_name,
               vpc_index_name, subnet_index_name, security_group_index_name]
    doc_types = ["instance_id", "name_title", "prip_title", "puip_title", "ebs",
                 "eip", "elb", "vpc", "subnet", "security_group_id",
                 "security_group_name"]
    total = client.search(index=indices, doc_type=doc_types, body=query,
                          ignore_unavailable=True)['hits']['total']
    # Get Total search result and set size parameter equal to that, to get all results
    # ToDo Discuss and Optimize query
    query['size'] = total
    search_results = client.search(index=indices, doc_type=doc_types, body=query,
                                   ignore_unavailable=True)
    return search_results
def main():
    es_host = raw_input("Elasticsearch host: ")
    es_port = raw_input("Elasticsearch port: ")
    db_name = raw_input("Dashboard name: ")
    send_get_body_as = raw_input("Method for querying Elasticsearch[GET]: ") or 'GET'
    es = Elasticsearch(host=es_host, port=es_port, send_get_body_as=send_get_body_as)
    query = {'query': {'term': {'_id': db_name}}}
    res = es.search(index='kibana-int', doc_type='dashboard', body=query,
                    _source_include=['dashboard'])
    if not res['hits']['hits']:
        print("No dashboard %s found" % (db_name))
        exit()
    db = json.loads(res['hits']['hits'][0]['_source']['dashboard'])
    config_filters = filters_from_dashboard(db)
    print("\nPartial Config file")
    print("-----------\n")
    print("name: %s" % (db_name))
    print("es_host: %s" % (es_host))
    print("es_port: %s" % (es_port))
    print("filter:")
    print(yaml.safe_dump(config_filters))
class TestReindexer(unittest.TestCase):

    def setUp(self):
        self.source_index = "reindex"
        self.target_index = "reindex-a"
        self.client = Elasticsearch()
        self.reindexer = Reindexer(self.client)
        self.schema_manager = SchemaManager(self.client)
        # try:
        #     read_only_setting = {"index": {"blocks": {"read_only": False}}}
        #     self.client.indices.put_settings(index=self.source_index, body=read_only_setting)
        # except:
        #     pass
        self.client.indices.create(index=self.source_index)

    def tearDown(self):
        for index in [self.source_index, self.target_index]:
            try:
                self.client.indices.delete(index=index)
            except:
                pass

    def test_reindex(self):
        create = []
        for i in ['a', 'b', 'c', 'd', 'e']:
            doc = {
                '_op_type': 'create',
                '_index': self.source_index,
                '_type': 'document',
                'doc': {'name': i}
            }
            create.append(doc)
        bulk(self.client, create, refresh=True)
        docs = self.client.search(index=self.source_index)
        self.assertEqual(len(docs['hits']['hits']), 5)
        self.reindexer.do_reindex(self.source_index, self.target_index, 3)
        self.client.indices.refresh(','.join([self.source_index, self.target_index]))
        docs = self.client.search(index=self.source_index)
        self.assertEqual(len(docs['hits']['hits']), 5)
        docs = self.client.search(index=self.target_index)
        self.assertEqual(len(docs['hits']['hits']), 5)
def get_dashboard(self, rule, db_name):
    """ Download dashboard which matches use_kibana_dashboard from elasticsearch. """
    es = Elasticsearch(host=rule['es_host'], port=rule['es_port'])
    if not db_name:
        raise EAException("use_kibana_dashboard undefined")
    query = {'query': {'term': {'_id': db_name}}}
    try:
        res = es.search(index='kibana-int', doc_type='dashboard', body=query,
                        _source_include=['dashboard'])
    except ElasticsearchException as e:
        raise EAException("Error querying for dashboard: %s" % (e))
    if res['hits']['hits']:
        return json.loads(res['hits']['hits'][0]['_source']['dashboard'])
    else:
        raise EAException("Could not find dashboard named %s" % (db_name))
def main():
    es_host = raw_input("Elasticsearch host: ")
    es_port = raw_input("Elasticsearch port: ")
    db_name = raw_input("Dashboard name: ")
    es = Elasticsearch(host=es_host, port=es_port)
    query = {'query': {'term': {'_id': db_name}}}
    res = es.search(index='kibana-int', doc_type='dashboard', body=query,
                    _source_include=['dashboard'])
    if not res['hits']['hits']:
        print("No dashboard %s found" % (db_name))
        exit()
    db = json.loads(res['hits']['hits'][0]['_source']['dashboard'])
    config_filters = filters_from_dashboard(db)
    print("\nPartial Config file")
    print("-----------\n")
    print("name: %s" % (db_name))
    print("es_host: %s" % (es_host))
    print("es_port: %s" % (es_port))
    print("filter:")
    print(yaml.safe_dump(config_filters))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Elasticsearch host')
    parser.add_argument('--port', type=int, help='Elasticsearch port')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth')
    parser.add_argument('--ssl', action='store_true', default=None, help='Use SSL')
    parser.add_argument('--no-ssl', dest='ssl', action='store_false', help='Do not use SSL')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    parser.add_argument('--boto-profile', default=None, help='Boto profile to use for signing requests')
    parser.add_argument('--aws-region', default=None, help='AWS Region to use for signing requests')
    args = parser.parse_args()

    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
            host = args.host if args.host else data.get('es_host')
            port = args.port if args.port else data.get('es_port')
            username = data.get('es_username')
            password = data.get('es_password')
            url_prefix = args.url_prefix if args.url_prefix is not None else data.get('es_url_prefix', '')
            use_ssl = args.ssl if args.ssl is not None else data.get('use_ssl')
            aws_region = data.get('aws_region', None)
    else:
        username = None
        password = None
        aws_region = args.aws_region
        host = args.host if args.host else raw_input('Enter elasticsearch host: ')
        port = args.port if args.port else int(raw_input('Enter elasticsearch port: '))
        use_ssl = (args.ssl if args.ssl is not None
                   else raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if args.no_auth is None:
            username = raw_input('Enter optional basic-auth username: ')
            password = raw_input('Enter optional basic-auth password: ')
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input('Enter optional Elasticsearch URL prefix: '))

    auth = Auth()
    http_auth = auth(host=host, username=username, password=password,
                     aws_region=aws_region, boto_profile=args.boto_profile)
    es = Elasticsearch(host=host, port=port, use_ssl=use_ssl,
                       connection_class=RequestsHttpConnection,
                       http_auth=http_auth, url_prefix=url_prefix)

    silence_mapping = {'silence': {'properties': {
        'rule_name': {'index': 'not_analyzed', 'type': 'string'},
        'until': {'type': 'date', 'format': 'dateOptionalTime'},
        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    ess_mapping = {'elastalert_status': {'properties': {
        'rule_name': {'index': 'not_analyzed', 'type': 'string'},
        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    es_mapping = {'elastalert': {'properties': {
        'rule_name': {'index': 'not_analyzed', 'type': 'string'},
        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
        'alert_time': {'format': 'dateOptionalTime', 'type': 'date'},
        'match_body': {'enabled': False, 'type': 'object'},
        'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    past_mapping = {'past_elastalert': {'properties': {
        'rule_name': {'index': 'not_analyzed', 'type': 'string'},
        'match_body': {'enabled': False, 'type': 'object'},
        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
        'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    error_mapping = {'elastalert_error': {'properties': {
        'data': {'type': 'object', 'enabled': False},
        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}

    index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ')
    if not index:
        index = 'elastalert_status'
    old_index = (args.old_index if args.old_index is not None
                 else raw_input('Name of existing index to copy? (Default None) '))

    res = None
    if old_index:
        print('Downloading existing data...')
        res = es.search(index=old_index, body={}, size=500000)
        print('Got %s documents' % (len(res['hits']['hits'])))

    es_index = IndicesClient(es)
    if es_index.exists(index):
        print('Index ' + index + ' already exists. Skipping index creation.')
        return None

    es.indices.create(index)
    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
    es.indices.put_mapping(index=index, doc_type='past_elastalert', body=past_mapping)
    print('New index %s created' % index)

    if res:
        bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}),
                                      json.dumps(doc['_source']))
                        for doc in res['hits']['hits']])
        print('Uploading data...')
        es.bulk(body=bulk, index=index)
        print('Done!')
from elasticsearch.client import Elasticsearch
from elasticsearch import helpers

indexName = "morocco-99"
print("Index Name: ", indexName)
es = Elasticsearch(hosts="http://localhost:9200")
results = es.search(body={
    "_source": "html",
    "size": 100,
    "query": {
        "bool": {
            "must_not": {
                "exists": {
                    "field": "processedText"
                }
            }
        }
    }
}, index=indexName)

if len(results['hits']['hits']) > 0:
    print("Records Found: ", len(results['hits']['hits']), "Processing Now")
    import re
    from bs4 import BeautifulSoup
    for item in range(len(results['hits']['hits'])):
        print("Processing", results['hits']['hits'][item]['_id'])
        soup = BeautifulSoup(results['hits']['hits'][item]['_source']['html'], 'html.parser')
        for script in soup(["script", "style", ""]):
            script.extract()
def reindex(from_hosts, from_index, to_hosts, to_index, to_type,
            source='{"query":{"match_all":{}}}', max_docs=0, page_size=10,
            logging_per_docs=1000, es_scroll='5m', request_timeout=60):
    if from_index is None:
        logger.warn('from_index is empty.')
        return
    from_es = Elasticsearch(hosts=from_hosts)
    to_es = Elasticsearch(hosts=to_hosts)
    scroll_id = None
    counter = 0
    running = True
    bulk_data = []
    while running:
        try:
            if scroll_id is None:
                response = from_es.search(index=from_index, body=source,
                                          params={"request_timeout": request_timeout,
                                                  "scroll": es_scroll,
                                                  "size": page_size})
            else:
                response = from_es.scroll(scroll_id=scroll_id,
                                          params={"request_timeout": request_timeout,
                                                  "scroll": es_scroll})
            if len(response['hits']['hits']) == 0:
                running = False
                break
            scroll_id = response['_scroll_id']
            for hit in response['hits']['hits']:
                if '_source' in hit:
                    counter += 1
                    if counter % logging_per_docs == 0:
                        logger.info(u'Loaded {0} docs.'.format(counter))
                    if max_docs > 0 and counter >= max_docs:
                        logger.info(u'{0} docs are loaded, but it exceeded {1} docs.'.format(counter, max_docs))
                        running = False
                        break
                    op_index = to_index if to_index is not None else hit['_index']
                    op_type = to_type if to_type is not None else hit['_type']
                    bulk_data.append({"index": {"_index": op_index,
                                                "_type": op_type,
                                                "_id": hit['_id']}})
                    bulk_data.append(hit['_source'])
            if len(bulk_data) != 0:
                to_es.bulk(body=bulk_data, params={"request_timeout": request_timeout})
                bulk_data = []
        except NotFoundError:
            break
        except:
            logger.exception(u"Failed to load documents from Elasticsearch(Loaded {0} doc).".format(counter))
            break
    if len(bulk_data) != 0:
        to_es.bulk(body=bulk_data, params={"request_timeout": request_timeout})
    logger.info('Loaded {0} documents.'.format(counter))
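# A minimal usage sketch for reindex() above (assumption: the host URLs and
# index names below are placeholders, and `logger` is configured by the
# surrounding module). It copies every document from 'src-index' on one
# cluster to 'dst-index' on another, 500 documents per scroll page.
reindex(
    from_hosts='http://localhost:9200',
    from_index='src-index',
    to_hosts='http://localhost:9201',
    to_index='dst-index',
    to_type=None,          # keep each hit's original _type
    page_size=500,
)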
def get_changes(es: Elasticsearch, index_name: str, url: str) -> list:
    search_body = get_changes_query(url)
    response = es.search(index=index_name, body=search_body)
    return response['hits']['hits']
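# A minimal usage sketch for get_changes() (assumption: get_changes_query is
# defined elsewhere in the module; the body shown here is only a plausible
# shape, filtering by URL, and the index name and URL are hypothetical).
def get_changes_query(url: str) -> dict:
    return {"query": {"term": {"url": url}}}

es = Elasticsearch(hosts=["http://localhost:9200"])
hits = get_changes(es, index_name="changes", url="https://example.com/pr/1")
for hit in hits:
    print(hit["_id"], hit["_source"].get("url"))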
class NewTermsRule(RuleType):
    """ Alerts on a new value in a list of fields. """

    def __init__(self, rule, args=None):
        super(NewTermsRule, self).__init__(rule, args)
        self.seen_values = {}
        # Allow the use of query_key or fields
        if 'fields' not in self.rules:
            if 'query_key' not in self.rules:
                raise EAException("fields or query_key must be specified")
            self.fields = self.rules['query_key']
        else:
            self.fields = self.rules['fields']
        if not self.fields:
            raise EAException("fields must not be an empty list")
        if type(self.fields) != list:
            self.fields = [self.fields]
        if self.rules.get('use_terms_query') and (
                len(self.fields) != 1 or len(self.fields) == 1 and type(self.fields[0]) == list):
            raise EAException("use_terms_query can only be used with a single non-composite field")
        try:
            self.get_all_terms(args)
        except Exception as e:
            # Refuse to start if we cannot get existing terms
            raise EAException('Error searching for existing terms: %s' % (e))

    def get_all_terms(self, args):
        """ Performs a terms aggregation for each field to get every existing term. """
        self.es = Elasticsearch(
            host=self.rules['es_host'],
            port=self.rules['es_port'],
            timeout=self.rules.get('es_conn_timeout', 50),
            send_get_body_as=self.rules.get('send_get_body_as', 'GET')
        )
        window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30}))
        field_name = {"field": "", "size": 2147483647}  # Integer.MAX_VALUE
        query_template = {"aggs": {"values": {"terms": field_name}}}
        if args and args.start:
            end = ts_to_dt(args.start)
        else:
            end = ts_now()
        start = end - window_size
        if self.rules.get('use_strftime_index'):
            index = format_index(self.rules['index'], start, end)
        else:
            index = self.rules['index']
        time_filter = {self.rules['timestamp_field']: {'lte': dt_to_ts(end), 'gte': dt_to_ts(start)}}
        query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}}
        query = {'aggs': {'filtered': query_template}}

        for field in self.fields:
            # For composite keys, we will need to perform sub-aggregations
            if type(field) == list:
                level = query_template['aggs']
                # Iterate on each part of the composite key and add a sub aggs clause to the elastic search query
                for i, sub_field in enumerate(field):
                    level['values']['terms']['field'] = add_raw_postfix(sub_field)
                    if i < len(field) - 1:
                        # If we have more fields after the current one, then set up the next nested structure
                        level['values']['aggs'] = {'values': {'terms': copy.deepcopy(field_name)}}
                        level = level['values']['aggs']
            else:
                # For non-composite keys, only a single agg is needed
                field_name['field'] = add_raw_postfix(field)
            res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s')
            if 'aggregations' in res:
                buckets = res['aggregations']['filtered']['values']['buckets']
                if type(field) == list:
                    # For composite keys, make the lookup based on all fields
                    # Make it a tuple since it can be hashed and used in dictionary lookups
                    self.seen_values[tuple(field)] = []
                    for bucket in buckets:
                        # We need to walk down the hierarchy and obtain the value at each level
                        self.seen_values[tuple(field)] += self.flatten_aggregation_hierarchy(bucket)
                    # If we don't have any results, it could either be because of the absence of any baseline data
                    # OR it may be because the composite key contained a non-primitive type. Either way, give the
                    # end-users a heads up to help them debug what might be going on.
                    if not self.seen_values[tuple(field)]:
                        elastalert_logger.warning((
                            'No results were found from all sub-aggregations. This can either indicate that there is '
                            'no baseline data OR that a non-primitive field was used in a composite key.'
                        ))
                else:
                    keys = [bucket['key'] for bucket in buckets]
                    self.seen_values[field] = keys
                    elastalert_logger.info('Found %s unique values for %s' % (len(keys), field))
            else:
                self.seen_values[field] = []
                elastalert_logger.info('Found no values for %s' % (field))

    def flatten_aggregation_hierarchy(self, root, hierarchy_tuple=()):
        """ For nested aggregations, the results come back in the following format:
            {
              "aggregations": {
                "filtered": {
                  "doc_count": 37,
                  "values": {
                    "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0,
                    "buckets": [
                      {"key": "1.1.1.1",          # IP address (root)
                       "doc_count": 13,
                       "values": {
                         "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0,
                         "buckets": [
                           {"key": "80",          # Port (sub-aggregation)
                            "doc_count": 3,
                            "values": {
                              "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0,
                              "buckets": [
                                {"key": "ack", "doc_count": 3},   # Reason (sub-aggregation, leaf-node)
                                {"key": "syn", "doc_count": 1}]}},
                           {"key": "82",          # Port (sub-aggregation)
                            "doc_count": 3,
                            "values": {
                              "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0,
                              "buckets": [
                                {"key": "ack", "doc_count": 3},
                                {"key": "syn", "doc_count": 3}]}}]}},
                      {"key": "2.2.2.2",          # IP address (root)
                       "doc_count": 4,
                       "values": {
                         "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0,
                         "buckets": [
                           {"key": "443",         # Port (sub-aggregation)
                            "doc_count": 3,
                            "values": {
                              "doc_count_error_upper_bound": 0, "sum_other_doc_count": 0,
                              "buckets": [
                                {"key": "ack", "doc_count": 3},
                                {"key": "syn", "doc_count": 3}]}}]}}]}}}}

            Each level will either have more values and buckets, or it will be a leaf node.
            We'll ultimately return a flattened list with the hierarchies appended as strings,
            e.g. the above snippet would yield a list with:
            [
                ('1.1.1.1', '80', 'ack'),
                ('1.1.1.1', '80', 'syn'),
                ('1.1.1.1', '82', 'ack'),
                ('1.1.1.1', '82', 'syn'),
                ('2.2.2.2', '443', 'ack'),
                ('2.2.2.2', '443', 'syn')
            ]
            A similar formatting will be performed in the add_data method and used as the basis for comparison.
        """
        results = []
        # There are more aggregation hierarchies left. Traverse them.
        if 'values' in root:
            results += self.flatten_aggregation_hierarchy(root['values']['buckets'],
                                                          hierarchy_tuple + (root['key'],))
        else:
            # We've gotten to a sub-aggregation, which may have further sub-aggregations
            # See if we need to traverse further
            for node in root:
                if 'values' in node:
                    results += self.flatten_aggregation_hierarchy(node, hierarchy_tuple)
                else:
                    results.append(hierarchy_tuple + (node['key'],))
        return results

    def add_data(self, data):
        for document in data:
            for field in self.fields:
                value = ()
                lookup_field = field
                if type(field) == list:
                    # For composite keys, make the lookup based on all fields
                    # Make it a tuple since it can be hashed and used in dictionary lookups
                    lookup_field = tuple(field)
                    for sub_field in field:
                        lookup_result = lookup_es_key(document, sub_field)
                        if not lookup_result:
                            value = None
                            break
                        value += (lookup_result,)
                else:
                    value = lookup_es_key(document, field)
                if not value and self.rules.get('alert_on_missing_field'):
                    document['missing_field'] = lookup_field
                    self.add_match(copy.deepcopy(document))
                elif value:
                    if value not in self.seen_values[lookup_field]:
                        document['new_field'] = lookup_field
                        self.add_match(copy.deepcopy(document))
                        self.seen_values[lookup_field].append(value)

    def add_terms_data(self, terms):
        # With terms query, len(self.fields) is always 1 and the 0'th entry is always a string
        field = self.fields[0]
        for timestamp, buckets in terms.iteritems():
            for bucket in buckets:
                if bucket['doc_count']:
                    if bucket['key'] not in self.seen_values[field]:
                        match = {field: bucket['key'],
                                 self.rules['timestamp_field']: timestamp,
                                 'new_field': field}
                        self.add_match(match)
                        self.seen_values[field].append(bucket['key'])
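# A minimal rule-configuration sketch for NewTermsRule (assumption: the keys
# below only mirror what this snippet reads from self.rules; a real ElastAlert
# rule file has more required settings, and the values are placeholders).
example_rule = {
    'es_host': 'localhost',
    'es_port': 9200,
    'index': 'logstash-*',
    'timestamp_field': '@timestamp',
    'fields': ['source_ip', ['source_ip', 'dest_port']],  # plain and composite keys
    'terms_window_size': {'days': 7},                     # baseline window
}
# NewTermsRule(example_rule) would pre-load the terms seen for each field in the
# baseline window, then flag documents whose value was not previously seen.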
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Elasticsearch host')
    parser.add_argument('--port', type=int, help='Elasticsearch port')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth')
    parser.add_argument('--ssl', action='store_const', const=True, help='Use SSL')
    parser.add_argument('--no-ssl', action='store_const', const=True, help='Do not use SSL')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    args = parser.parse_args()

    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    username = None
    password = None
    use_ssl = None
    url_prefix = None
    http_auth = None

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
            host = data.get('es_host')
            port = data.get('es_port')
            username = data.get('es_username')
            password = data.get('es_password')
            url_prefix = data.get('es_url_prefix', '')
            use_ssl = data.get('use_ssl')
    else:
        host = args.host if args.host else raw_input('Enter elasticsearch host: ')
        port = args.port if args.port else int(raw_input('Enter elasticsearch port: '))
        # --no-ssl should disable SSL rather than enable it
        use_ssl = (args.ssl if args.ssl is not None
                   else (not args.no_ssl) if args.no_ssl is not None
                   else raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if args.no_auth is None:
            username = raw_input('Enter optional basic-auth username: ')
            password = raw_input('Enter optional basic-auth password: ')
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input('Enter optional Elasticsearch URL prefix: '))

    if username and password:
        http_auth = username + ':' + password

    es = Elasticsearch(host=host, port=port, use_ssl=use_ssl, http_auth=http_auth, url_prefix=url_prefix)

    silence_mapping = {'silence': {'properties': {
        'rule_name': {'index': 'not_analyzed', 'type': 'string'},
        'until': {'type': 'date', 'format': 'dateOptionalTime'}}}}
    ess_mapping = {'elastalert_status': {'properties': {
        'rule_name': {'index': 'not_analyzed', 'type': 'string'},
        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    es_mapping = {'elastalert': {'properties': {
        'rule_name': {'index': 'not_analyzed', 'type': 'string'},
        'match_body': {'enabled': False, 'type': 'object'},
        'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    error_mapping = {'elastalert_error': {'properties': {
        'data': {'type': 'object', 'enabled': False}}}}

    index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ')
    if not index:
        index = 'elastalert_status'
    old_index = (args.old_index if args.old_index is not None
                 else raw_input('Name of existing index to copy? (Default None) '))

    res = None
    if old_index:
        print('Downloading existing data...')
        res = es.search(index=old_index, body={}, size=500000)
        print('Got %s documents' % (len(res['hits']['hits'])))

    es.indices.create(index)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
    print('New index %s created' % (index))

    if res:
        bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}),
                                      json.dumps(doc['_source']))
                        for doc in res['hits']['hits']])
        print('Uploading data...')
        es.bulk(body=bulk, index=index)
        print('Done!')
for sentence in sentences:
    ### Tokenize sentence in paragraph
    sentence = underthesea.word_tokenize(sentence, format="text")
    ### Lower case
    sentence = sentence.lower()
    paragraph_tokenized = paragraph_tokenized + sentence

paragraph_tokenized = paragraph_tokenized.replace("\n", "")
content_tokenized.append({
    "type": "text",
    "content": paragraph_tokenized
})

### Convert and push the data to Elasticsearch
es_push_body = {
    "Trang": news_page,
    "Title": title_tokenized,
    "NoiDung": content_tokenized,
    "Description": des_tokenized,
    "NewspaperLink": news_link,
}
es.index(index="my-index", body=es_push_body)

### Count the total number of records currently in ES
es_check_body = {"query": {"match_all": {}}}
result_check = es.search(index="my-index", body=es_check_body)
print(result_check["hits"]["total"]["value"])
def main(in_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", help="Elasticsearch host")
    parser.add_argument("--port", type=int, help="Elasticsearch port")
    parser.add_argument("--url-prefix", help="Elasticsearch URL prefix")
    parser.add_argument("--no-auth", action="store_const", const=True, help="Suppress prompt for basic auth")
    parser.add_argument("--ssl", action="store_true", default=None, help="Use SSL")
    parser.add_argument("--no-ssl", dest="ssl", action="store_false", help="Do not use SSL")
    parser.add_argument("--index", help="Index name to create")
    parser.add_argument("--old-index", help="Old index name to copy")
    parser.add_argument("--config", help="Config file name")
    args = parser.parse_args(in_args)

    if args.config:
        filename = args.config
    elif os.path.isfile("../config.yaml"):
        filename = "../config.yaml"
    elif os.path.isfile("config.yaml"):
        filename = "config.yaml"
    else:
        filename = ""

    username = None
    password = None
    use_ssl = None
    url_prefix = None
    http_auth = None

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
            host = data.get("es_host")
            port = data.get("es_port")
            username = data.get("es_username")
            password = data.get("es_password")
            url_prefix = data.get("es_url_prefix", "")
            use_ssl = data.get("use_ssl")
    else:
        host = args.host if args.host else raw_input("Enter elasticsearch host: ")
        port = args.port if args.port else int(raw_input("Enter elasticsearch port: "))
        use_ssl = args.ssl if args.ssl is not None else raw_input("Use SSL? t/f: ").lower() in ("t", "true")
        if args.no_auth is None:
            username = raw_input("Enter optional basic-auth username: ")
            password = raw_input("Enter optional basic-auth password: ")
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input("Enter optional Elasticsearch URL prefix: "))

    if username and password:
        http_auth = username + ":" + password

    es = Elasticsearch(host=host, port=port, use_ssl=use_ssl, http_auth=http_auth, url_prefix=url_prefix)

    silence_mapping = {"silence": {"properties": {
        "rule_name": {"index": "not_analyzed", "type": "string"},
        "until": {"type": "date", "format": "dateOptionalTime"}}}}
    ess_mapping = {"elastalert_status": {"properties": {
        "rule_name": {"index": "not_analyzed", "type": "string"},
        "@timestamp": {"format": "dateOptionalTime", "type": "date"}}}}
    es_mapping = {"elastalert": {"properties": {
        "rule_name": {"index": "not_analyzed", "type": "string"},
        "match_body": {"enabled": False, "type": "object"},
        "aggregate_id": {"index": "not_analyzed", "type": "string"}}}}
    error_mapping = {"elastalert_error": {"properties": {
        "data": {"type": "object", "enabled": False}}}}

    index = args.index if args.index is not None else raw_input("New index name? (Default elastalert_status) ")
    if not index:
        index = "elastalert_status"

    res = None
    if args.old_index:
        print("Downloading existing data...")
        res = es.search(index=args.old_index, body={}, size=500000)
        print("Got %s documents" % (len(res["hits"]["hits"])))

    es.indices.create(index)
    es.indices.put_mapping(index=index, doc_type="elastalert", body=es_mapping)
    es.indices.put_mapping(index=index, doc_type="elastalert_status", body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type="silence", body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type="elastalert_error", body=error_mapping)
    print("New index %s created" % (index))

    if res:
        bulk = "".join(["%s\n%s\n" % (json.dumps({"create": {"_type": doc["_type"], "_index": index}}),
                                      json.dumps(doc["_source"]))
                        for doc in res["hits"]["hits"]])
        print("Uploading data...")
        es.bulk(body=bulk, index=index)
        print("Done!")
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [{"host": host, "port": port}]
        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl
        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs
        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn
        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout))
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())
        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix
        if not index:
            self.log.info("No index provided")
            return
        self.idents_config = idents_config or []
        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                        "committer": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                        "authored_at": {"type": "date", "format": "date_time_no_millis"},
                        "committed_at": {"type": "date", "format": "date_time_no_millis"},
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "on_author": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "committer": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "merged_by": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}},
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo is failing on some context with this error when the
        # time slice is large: Must be less than or equal to: [10000] but was [10001].
        # (This limit can be set by changing the [search.max_buckets] cluster level setting.)
        # This is an attempt to mitigate the issue
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {"regexp": {"repository_fullname": {"value": repository_fullname}}},
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:
        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:
            prev_hash = get_obj_hash(obj)
            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            else:
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = not prev_hash == get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)
        bulk_update(to_update)
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend tries to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']
        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)
        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)
        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper acts
    # as a proxy for them.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but it can be used.
    # As it makes sense to not give an index, developers are free to use these
    # as they want, as long as they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(self.indices, doc_type, body, doc_id, **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(self.indices, doc_type, body, doc_id, **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(self.indices, doc_type, doc_id, body, **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(self.indices, doc_type, body, **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(self.indices, doc_type, doc_id, body, **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(self.indices, doc_type, body, **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(self.indices, doc_type, doc_id, body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(self.indices, doc_type, doc_id, body, **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
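# A minimal usage sketch (assumption: not part of the original djangoes code).
# It shows how a concrete backend could satisfy the two entry points that
# configure_client() checks for; the constructor arguments and the `server`
# settings dict are provided by the `Base` class and are not shown here.
from elasticsearch.connection import Urllib3HttpConnection

class Urllib3ElasticsearchBackend(BaseElasticsearchBackend):
    # Provide the connection class that BaseElasticsearchBackend leaves undefined.
    connection_class = Urllib3HttpConnection

# backend = Urllib3ElasticsearchBackend(...)   # args depend on the Base class
# backend.configure_client()                   # builds self.client from HOSTS/PARAMS
# backend.ping()                               # proxied to the underlying client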
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [{"host": host, "port": port}]
        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl
        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs
        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn
        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())
        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix
        if not index:
            self.log.info("No index provided")
            return
        self.idents_config = idents_config or []
        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                        "committer": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                        "authored_at": {"type": "date", "format": "date_time_no_millis"},
                        "committed_at": {"type": "date", "format": "date_time_no_millis"},
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "on_author": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "committer": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "merged_by": {"properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}}},
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {"uid": {"type": "keyword"}, "muid": {"type": "keyword"}},
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {"type": "date", "format": "date_time_no_millis"},
                        "last_post_at": {"type": "date", "format": "date_time_no_millis"},
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo is failing on some context with this error when the
        # time slice is large: Must be less than or equal to: [10000] but was [10001].
        # (This limit can be set by changing the [search.max_buckets] cluster level setting.)
        # This is an attempt to mitigate the issue
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self, name: str, commit_date: datetime = None, push_infos: Dict = None
    ):
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get("total_changes_updated", 0)
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get("total_orphans_updated", 0)
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        assert len(change_urls) <= 50
        size = 5000  # Assuming not more than 100 TD data related to a change
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:
        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:
            prev_hash = get_obj_hash(obj)
            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = not prev_hash == get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)
        bulk_update(to_update)
class ESConnector: """ as many MS will communicate with ElasticSearch, centralize access with this library """ def __init__(self, host=None, port=9200, timeout=10, local_env=False): self.host = host self.port = port self.timeout = timeout self.local_env = local_env self.es = None def _connect(self): """ connect to a member of the ElasticSearch cluster """ try: if self.local_env: self.es = Elasticsearch([{'host': self.host, 'port': self.port}]) else: self.es = Elasticsearch([{'host': self.host, 'port': self.port}], sniff_on_start=True, sniff_on_connection_fail=True, sniffer_timeout=self.timeout) self.idx = IndicesClient(self.es) return except ConnectionError as e: return ElasticSearchError.no_host_available(self.host, self.port) except Exception as e: (type_e, value, traceback_prev) = exc_info() backtrace = extract_tb(traceback_prev) return ElasticSearchError.unknown_exception(backtrace, str(e)) def _create_index(self, index, doc_type, settings=None, mappings=None): """ create a new empty index mandatory args: index = index name doc_type = document type, ie. any valid string settings = ElasticSearch cluster configuration mappings = dict of document fields by type and indexing preference """ if not settings: settings = {'index': {'number_of_shards': '1', 'number_of_replicas': '0'}} if not mappings: mappings = {'property': {'id': {'type': 'string', 'index': 'not_analyzed'}}} try: response = self.es.create(index=index, doc_type=doc_type, body=dumps(settings)) self.idx.put_mapping(index=index, doc_type=doc_type, body=dumps(mappings)) if not 'created' in response or not response['created']: return ElasticSearchError.unable_to_create_index(index) log.info('Index: {} created'.format(index)) log.info('ES create(): response: {}'.format(response)) return except ConnectionError as e: return ElasticSearchError.no_host_available(self.host, self.port) except NotFoundError as e: return ElasticSearchError.missing_index(self.index) except RequestError as e: return ElasticSearchError.invalid_request(str(e)) except Exception as e: (type_e, value, traceback_prev) = exc_info() backtrace = extract_tb(traceback_prev) return ElasticSearchError.unknown_exception(backtrace, str(e)) def drop_index(self, index): try: if index in self.es.indices.stats()['indices'].keys(): self.es.indices.delete(index=index, ignore=[400, 404]) log.info('Index: {} deleted'.format(index)) return except ConnectionError as e: return ElasticSearchError.no_host_available(self.host, self.port) except NotFoundError as e: return ElasticSearcheError.missing_index(self.index) except RequestError as e: return ElasticSearcheError.invalid_request(str(e)) except Exception as e: (type_e, value, traceback_prev) = exc_info() backtrace = extract_tb(traceback_prev) return ElasticSearchError.unknown_exception(backtrace, str(e)) def add_document(self, index=None, doc_type=None, doc_id=0, settings={}, mappings={}, values={}): """ add a new document to an existing index mandatory args: index = index name doc_type = document type, ie. 
any valid string settings = ElasticSearch cluster configuration mappings = dict of document fields by type and indexing preference values = dictionary of fields and values """ try: err_msg = self._connect() if err_msg: return err_msg if index not in self.es.indices.stats()['indices'].keys(): err_msg = self._create_index(index, doc_type, settings, mappings) if err_msg: return err_msg response = self.es.create(index=index, doc_type=doc_type, id=doc_id, body=dumps(values)) log.info('ES create(): response: {}'.format(response)) return ElasticSearchWrite.object_created(response) except ConnectionError as e: return ElasticSearchError.no_host_available(self.host, self.port) except RequestError as e: return ElasticSearchError.invalid_request(str(e)) except NotFoundError as e: return ElasticSearchError.missing_index(index) except Exception as e: (type_e, value, traceback_prev) = exc_info() backtrace = extract_tb(traceback_prev) return ElasticSearchWriteError.unknown_exception(doc_id, values, backtrace, str(e)) def update_document(self, index, doc_type, doc_id, values): """ update an existing document in an existing index mandatory args: index = index name doc_type = document type, ie. any valid string doc_id = document_id values = dictionary of fields and values """ try: err_msg = self._connect() if err_msg: return err_msg log.info('ES body: {}'.format(values)) response = self.es.update(index=index, doc_type=doc_type, id=doc_id, body=dumps(values)) log.info('ES update(): response: {}'.format(response)) return ElasticSearchWrite.object_updated(response) except ConnectionError as e: return ElasticSearchError.no_host_available(self.host, self.port) except RequestError as e: return ElasticSearchError.invalid_request(str(e)) except NotFoundError as e: return ElasticSearchError.missing_index(index) except Exception as e: (type_e, value, traceback_prev) = exc_info() backtrace = extract_tb(traceback_prev) return ElasticSearchWriteError.unknown_exception(doc_id, values, backtrace, str(e)) def find_document(self, index, doc_type, dsl=None, fields=None): """ find an existing document in an existing index mandatory args: index = index name doc_type = document type, ie. any valid string dsl = query parameters in DSL format fields = list of fields to return """ try: err_msg = self._connect() if err_msg: return err_msg response = self.es.search(index=index, doc_type=doc_type, body=dumps(dsl), _source=fields) return ElasticSearchRead.object_found(response) except ConnectionError as e: return ElasticSearchError.no_host_available(self.host, self.port) except RequestError as e: return ElasticSearchError.invalid_request(str(e)) except NotFoundError as e: return ElasticSearchError.missing_index(index) except Exception as e: (type_e, value, traceback_prev) = exc_info() backtrace = extract_tb(traceback_prev) return ElasticSearchReadError.unknown_exception(dsl, fields, backtrace, str(e)) def search_documents(self, index, doc_type, dsl, fields=None): """ find an existing document in an existing index mandatory args: index = index name doc_type = document type, ie. 
any valid string dsl = query parameters in DSL format fields = list of fields to return """ try: err_msg = self._connect() if err_msg: return err_msg response = self.es.search(index=index, doc_type=doc_type, body=dumps(dsl), _source=fields) return ElasticSearchRead.objects_found(response) except ConnectionError as e: return ElasticSearchError.no_host_available(self.host, self.port) except RequestError as e: return ElasticSearchError.invalid_request(str(e)) except NotFoundError as e: return ElasticSearchError.missing_index(index) except Exception as e: (type_e, value, traceback_prev) = exc_info() backtrace = extract_tb(traceback_prev) return ElasticSearchReadError.unknown_exception(dsl, fields, backtrace, str(e))
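For reference, a minimal sketch of the index-creation step the wrapper above performs, using the indices API directly; the index name, doc type, and field are examples, and the mapping syntax assumes the same 1.x/2.x-era client used throughout this module.

from json import dumps
from elasticsearch import Elasticsearch

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

settings = {'index': {'number_of_shards': 1, 'number_of_replicas': 0}}
mappings = {'document': {'properties': {'id': {'type': 'string', 'index': 'not_analyzed'}}}}

# Create the index with its settings, then attach the mapping for the doc type,
# mirroring the two-step flow in _create_index() above.
es.indices.create(index='example-index', body=dumps({'settings': settings}))
es.indices.put_mapping(index='example-index', doc_type='document', body=dumps(mappings['document']))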
def search(es: Elasticsearch, index_name: str, search_body: dict) -> dict: return es.search(index=index_name, body=search_body)
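A short usage example of the thin helper above; the client construction, index name, and field are assumptions.

from elasticsearch import Elasticsearch

es = Elasticsearch(hosts=["localhost:9200"])  # hypothetical node
results = search(es, "articles", {"query": {"match": {"title": "elasticsearch"}}})
for hit in results["hits"]["hits"]:
    print(hit["_id"], hit["_source"])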
def main(): if os.path.isfile('../config.yaml'): filename = '../config.yaml' elif os.path.isfile('config.yaml'): filename = 'config.yaml' else: filename = '' username = None password = None use_ssl = None http_auth = None if filename: with open(filename) as config_file: data = yaml.load(config_file) host = data.get('es_host') port = data.get('es_port') username = data.get('es_username') password = data.get('es_password') use_ssl = data.get('use_ssl') else: host = raw_input("Enter elasticsearch host: ") port = int(raw_input("Enter elasticsearch port: ")) while use_ssl is None: resp = raw_input("Use SSL? t/f: ").lower() use_ssl = True if resp in ('t', 'true') else (False if resp in ('f', 'false') else None) username = raw_input("Enter optional basic-auth username: ") password = raw_input("Enter optional basic-auth password: ") if username and password: http_auth = username + ':' + password es = Elasticsearch(host=host, port=port, use_ssl=use_ssl, http_auth=http_auth) silence_mapping = {'silence': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'}}}} ess_mapping = {'elastalert_status': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'}, '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}} es_mapping = {'elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'}, 'match_body': {'enabled': False, 'type': 'object'}}}} error_mapping = {'elastalert_error': {'properties': {'data': {'type': 'object', 'enabled': False}}}} index = raw_input('New index name? (Default elastalert_status) ') index = index if index else 'elastalert_status' old_index = raw_input('Name of existing index to copy? (Default None) ') res = None if old_index: print("Downloading existing data...") res = es.search(index=old_index, body={}, size=500000) print("Got %s documents" % (len(res['hits']['hits']))) es.indices.create(index) es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping) es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping) es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping) es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping) print("New index %s created" % (index)) if res: bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}), json.dumps(doc['_source'])) for doc in res['hits']['hits']]) print("Uploading data...") es.bulk(body=bulk, index=index) print("Done!")
def main(): if os.path.isfile('../config.yaml'): filename = '../config.yaml' elif os.path.isfile('config.yaml'): filename = 'config.yaml' else: filename = '' if filename: with open(filename) as config_file: data = yaml.load(config_file) host = data.get('es_host') port = data.get('es_port') else: host = raw_input("Enter elasticsearch host: ") port = int(raw_input("Enter elasticsearch port: ")) es = Elasticsearch(host=host, port=port) silence_mapping = { 'silence': { 'properties': { 'rule_name': { 'index': 'not_analyzed', 'type': 'string' } } } } ess_mapping = { 'elastalert_status': { 'properties': { 'rule_name': { 'index': 'not_analyzed', 'type': 'string' }, '@timestamp': { 'format': 'dateOptionalTime', 'type': 'date' } } } } es_mapping = { 'elastalert': { 'properties': { 'rule_name': { 'index': 'not_analyzed', 'type': 'string' }, 'match_body': { 'enabled': False, 'type': 'object' } } } } error_mapping = { 'elastalert_error': { 'properties': { 'data': { 'type': 'object', 'enabled': False } } } } index = raw_input('New index name? (Default elastalert_status) ') index = index if index else 'elastalert_status' old_index = raw_input('Name of existing index to copy? (Default None) ') res = None if old_index: print("Downloading existing data...") res = es.search(index=old_index, body={}, size=500000) print("Got %s documents" % (len(res['hits']['hits']))) es.indices.create(index) es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping) es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping) es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping) es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping) print("New index %s created" % (index)) if res: bulk = ''.join([ '%s\n%s\n' % (json.dumps({'create': { '_type': doc['_type'], '_index': index }}), json.dumps(doc['_source'])) for doc in res['hits']['hits'] ]) print("Uploading data...") es.bulk(body=bulk, index=index) print("Done!")
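The copy step in the script above relies on the bulk API's newline-delimited body: one metadata line per document followed by its source, ending with a trailing newline. A small standalone sketch with illustrative documents:

import json

docs = [{"_type": "elastalert", "_source": {"rule_name": "example_rule", "alert_sent": True}}]
target_index = "elastalert_status"

lines = []
for doc in docs:
    lines.append(json.dumps({"create": {"_type": doc["_type"], "_index": target_index}}))
    lines.append(json.dumps(doc["_source"]))
bulk_body = "".join(line + "\n" for line in lines)  # the bulk API requires the trailing newline
# es.bulk(body=bulk_body, index=target_index)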
class BaseElasticsearchBackend(Base): """Base connection wrapper based on the ElasticSearch official library. It uses two entry points to configure the underlying connection: * ``transport_class``: the transport class from ``elasticsearch``. By default ``elasticsearch.transport.Transport``. * ``connection_class``: the connection class used by the transport class. It's undefined by default, as it is on the subclasses to provide one. If any of these elements is not defined, an ``ImproperlyConfigured`` error will be raised when the backend will try to configure the client. """ #: ElasticSearch transport class used by the client class to perform #: requests. transport_class = Transport #: ElasticSearch connection class used by the transport class to perform #: requests. connection_class = None def configure_client(self): """Instantiate and configure the ElasticSearch client. It simply takes the given HOSTS list and uses PARAMS as the keyword arguments of the ElasticSearch class. The client's transport_class is given by the class attribute ``transport_class``, and the connection class used by the transport class is given by the class attribute ``connection_class``. An ``ImproperlyConfigured`` exception is raised if any of these elements is undefined. """ hosts = self.server['HOSTS'] params = self.server['PARAMS'] if not self.transport_class: raise ImproperlyConfigured( 'Djangoes backend %r is not properly configured: ' 'no transport class provided' % self.__class__) if not self.connection_class: raise ImproperlyConfigured( 'Djangoes backend %r is not properly configured: ' 'no connection class provided' % self.__class__) #pylint: disable=star-args self.client = Elasticsearch(hosts, transport_class=self.transport_class, connection_class=self.connection_class, **params) # Server methods # ============== # The underlying client does not require index names to perform server # related queries, such as "ping" or "info". The connection wrapper act # for them as a proxy. def ping(self, **kwargs): return self.client.ping(**kwargs) def info(self, **kwargs): return self.client.info(**kwargs) def put_script(self, lang, script_id, body, **kwargs): return self.client.put_script(lang, script_id, body, **kwargs) def get_script(self, lang, script_id, **kwargs): return self.client.get_script(lang, script_id, **kwargs) def delete_script(self, lang, script_id, **kwargs): return self.client.delete_script(lang, script_id, **kwargs) def put_template(self, template_id, body, **kwargs): return self.client.put_template(template_id, body, **kwargs) def get_template(self, template_id, body=None, **kwargs): return self.client.get_template(template_id, body, **kwargs) def delete_template(self, template_id=None, **kwargs): return self.client.delete_template(template_id, **kwargs) # Bulk methods # ============ # The underlying client does not require index names, but it can be used. # As it makes sense to not give an index, developers are free to use these # as they want, as long as they are careful. 
def mget(self, body, index=None, doc_type=None, **kwargs): return self.client.mget(body, index, doc_type, **kwargs) def bulk(self, body, index=None, doc_type=None, **kwargs): return self.client.bulk(body, index, doc_type, **kwargs) def msearch(self, body, index=None, doc_type=None, **kwargs): return self.client.msearch(body, index, doc_type, **kwargs) def mpercolate(self, body, index=None, doc_type=None, **kwargs): return self.client.mpercolate(body, index, doc_type, **kwargs) # Scroll methods # ============== # The underlying client does not require an index to perform scroll. def scroll(self, scroll_id, **kwargs): return self.client.scroll(scroll_id, **kwargs) def clear_scroll(self, scroll_id, body=None, **kwargs): return self.client.clear_scroll(scroll_id, body, **kwargs) # Query methods # ============= # The underlying client requires index names (or alias names) to perform # queries. The connection wrapper overrides these client methods to # automatically uses the configured names (indices and/or aliases). def create(self, doc_type, body, doc_id=None, **kwargs): return self.client.create( self.indices, doc_type, body, doc_id, **kwargs) def index(self, doc_type, body, doc_id=None, **kwargs): return self.client.index( self.indices, doc_type, body, doc_id, **kwargs) def exists(self, doc_id, doc_type='_all', **kwargs): return self.client.exists(self.indices, doc_id, doc_type, **kwargs) def get(self, doc_id, doc_type='_all', **kwargs): return self.client.get(self.indices, doc_id, doc_type, **kwargs) def get_source(self, doc_id, doc_type='_all', **kwargs): return self.client.get_source(self.indices, doc_id, doc_type, **kwargs) def update(self, doc_type, doc_id, body=None, **kwargs): return self.client.update( self.indices, doc_type, doc_id, body, **kwargs) def search(self, doc_type=None, body=None, **kwargs): return self.client.search(self.indices, doc_type, body, **kwargs) def search_shards(self, doc_type=None, **kwargs): return self.client.search_shards(self.indices, doc_type, **kwargs) def search_template(self, doc_type=None, body=None, **kwargs): return self.client.search_template( self.indices, doc_type, body, **kwargs) def explain(self, doc_type, doc_id, body=None, **kwargs): return self.client.explain( self.indices, doc_type, doc_id, body, **kwargs) def delete(self, doc_type, doc_id, **kwargs): return self.client.delete(self.indices, doc_type, doc_id, **kwargs) def count(self, doc_type=None, body=None, **kwargs): return self.client.count(self.indices, doc_type, body, **kwargs) def delete_by_query(self, doc_type=None, body=None, **kwargs): return self.client.delete_by_query( self.indices, doc_type, body, **kwargs) def suggest(self, body, **kwargs): return self.client.suggest(body, self.indices, **kwargs) def percolate(self, doc_type, doc_id=None, body=None, **kwargs): return self.client.percolate( self.indices, doc_type, doc_id, body, **kwargs) def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs): return self.client.count_percolate( self.indices, doc_type, doc_id, body, **kwargs) def mlt(self, doc_type, doc_id, body=None, **kwargs): return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs) def termvector(self, doc_type, doc_id, body=None, **kwargs): return self.client.termvector( self.indices, doc_type, doc_id, body, **kwargs) def mtermvectors(self, doc_type=None, body=None, **kwargs): return self.client.mtermvectors(self.indices, doc_type, body, **kwargs) def benchmark(self, doc_type=None, body=None, **kwargs): return self.client.benchmark(self.indices, 
doc_type, body, **kwargs) def abort_benchmark(self, name=None, **kwargs): return self.client.abort_benchmark(name, **kwargs) def list_benchmarks(self, doc_type=None, **kwargs): return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
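A hedged sketch of how a concrete backend might fill in the two entry points described in the class docstring above; Urllib3HttpConnection is the stock connection class shipped with elasticsearch-py, while the subclass name is hypothetical.

from elasticsearch.connection import Urllib3HttpConnection

class Urllib3Backend(BaseElasticsearchBackend):
    # transport_class is inherited from the base class (elasticsearch.transport.Transport);
    # only the connection class needs to be supplied to satisfy configure_client().
    connection_class = Urllib3HttpConnection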
class NewTermsRule(RuleType): """ Alerts on a new value in a list of fields. """ def __init__(self, rule, args=None): super(NewTermsRule, self).__init__(rule, args) self.seen_values = {} # Allow the use of query_key or fields if 'fields' not in self.rules: if 'query_key' not in self.rules: raise EAException("fields or query_key must be specified") self.fields = self.rules['query_key'] else: self.fields = self.rules['fields'] if not self.fields: raise EAException("fields must not be an empty list") if type(self.fields) != list: self.fields = [self.fields] if self.rules.get('use_terms_query') and len(self.fields) != 1: raise EAException( "use_terms_query can only be used with one field at a time") try: self.get_all_terms(args) except Exception as e: # Refuse to start if we cannot get existing terms raise EAException('Error searching for existing terms: %s' % (e)) def get_all_terms(self, args): """ Performs a terms aggregation for each field to get every existing term. """ self.es = Elasticsearch(host=self.rules['es_host'], port=self.rules['es_port'], timeout=self.rules.get('es_conn_timeout', 50)) window_size = datetime.timedelta( **self.rules.get('terms_window_size', {'days': 30})) field_name = {"field": "", "size": 2147483647} # Integer.MAX_VALUE query_template = {"aggs": {"values": {"terms": field_name}}} if args and args.start: end = ts_to_dt(args.start) else: end = ts_now() start = end - window_size if self.rules.get('use_strftime_index'): index = format_index(self.rules['index'], start, end) else: index = self.rules['index'] time_filter = { self.rules['timestamp_field']: { 'lte': dt_to_ts(end), 'gte': dt_to_ts(start) } } query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}} query = {'aggs': {'filtered': query_template}} for field in self.fields: field_name['field'] = field res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s') if 'aggregations' in res: buckets = res['aggregations']['filtered']['values']['buckets'] keys = [bucket['key'] for bucket in buckets] self.seen_values[field] = keys elastalert_logger.info('Found %s unique values for %s' % (len(keys), field)) else: self.seen_values[field] = [] elastalert_logger.info('Found no values for %s' % (field)) def add_data(self, data): for document in data: for field in self.fields: value = document.get(field) if not value and self.rules.get('alert_on_missing_field'): document['missing_field'] = field self.add_match(document) elif value: if value not in self.seen_values[field]: document['new_field'] = field self.add_match(document) self.seen_values[field].append(value) def add_terms_data(self, terms): # With terms query, len(self.fields) is always 1 field = self.fields[0] for timestamp, buckets in terms.iteritems(): for bucket in buckets: if bucket['doc_count']: if bucket['key'] not in self.seen_values[field]: match = { field: bucket['key'], self.rules['timestamp_field']: timestamp, 'new_field': field } self.add_match(match)
class NewTermsRule(RuleType): """ Alerts on a new value in a list of fields. """ def __init__(self, *args): super(NewTermsRule, self).__init__(*args) self.seen_values = {} # Allow the use of query_key or fields if 'fields' not in self.rules: if 'query_key' not in self.rules: raise EAException("fields or query_key must be specified") self.fields = self.rules['query_key'] else: self.fields = self.rules['fields'] if not self.fields: raise EAException("fields must not be an empty list") if type(self.fields) != list: self.fields = [self.fields] if self.rules.get('use_terms_query') and len(self.fields) != 1: raise EAException("use_terms_query can only be used with one field at a time") self.get_all_terms() def get_all_terms(self): """ Performs a terms aggregation for each field to get every existing term. """ self.es = Elasticsearch(host=self.rules['es_host'], port=self.rules['es_port']) window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30})) field_name = {"field": "", "size": 2147483647} # Integer.MAX_VALUE query_template = {"aggs": {"values": {"terms": field_name}}} if self.rules.get('use_strftime_index'): end = ts_now() start = end - window_size index = format_index(self.rules['index'], start, end) else: index = self.rules['index'] for field in self.fields: field_name['field'] = field res = self.es.search(body=query_template, index=index, ignore_unavailable=True, timeout=50) buckets = res['aggregations']['values']['buckets'] keys = [bucket['key'] for bucket in buckets] self.seen_values[field] = keys def add_data(self, data): for document in data: for field in self.fields: value = document.get(field) if not value and self.rules.get('alert_on_missing_field'): document['missing_field'] = field self.add_match(document) elif value: if value not in self.seen_values[field]: document['new_field'] = field self.add_match(document) self.seen_values[field].append(value) def add_terms_data(self, terms): # With terms query, len(self.fields) is always 1 field = self.fields[0] for timestamp, buckets in terms.iteritems(): for bucket in buckets: if bucket['doc_count']: if bucket['key'] not in self.seen_values[field]: match = {field: bucket['key'], self.rules['timestamp_field']: timestamp, 'new_field': field} self.add_match(match)
class NewTermsRule(RuleType): """ Alerts on a new value in a list of fields. """ def __init__(self, rule, args=None): super(NewTermsRule, self).__init__(rule, args) self.seen_values = {} # Allow the use of query_key or fields if 'fields' not in self.rules: if 'query_key' not in self.rules: raise EAException("fields or query_key must be specified") self.fields = self.rules['query_key'] else: self.fields = self.rules['fields'] if not self.fields: raise EAException("fields must not be an empty list") if type(self.fields) != list: self.fields = [self.fields] if self.rules.get('use_terms_query') and len(self.fields) != 1: raise EAException("use_terms_query can only be used with one field at a time") try: self.get_all_terms(args) except Exception as e: # Refuse to start if we cannot get existing terms raise EAException('Error searching for existing terms: %s' % (e)) def get_all_terms(self, args): """ Performs a terms aggregation for each field to get every existing term. """ self.es = Elasticsearch(host=self.rules['es_host'], port=self.rules['es_port']) window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30})) field_name = {"field": "", "size": 2147483647} # Integer.MAX_VALUE query_template = {"aggs": {"values": {"terms": field_name}}} if args and args.start: end = ts_to_dt(args.start) else: end = ts_now() start = end - window_size if self.rules.get('use_strftime_index'): index = format_index(self.rules['index'], start, end) else: index = self.rules['index'] time_filter = {self.rules['timestamp_field']: {'lte': dt_to_ts(end), 'gte': dt_to_ts(start)}} query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}} query = {'aggs': {'filtered': query_template}} for field in self.fields: field_name['field'] = field res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout=50) if 'aggregations' in res: buckets = res['aggregations']['filtered']['values']['buckets'] keys = [bucket['key'] for bucket in buckets] self.seen_values[field] = keys elastalert_logger.info('Found %s unique values for %s' % (len(keys), field)) else: self.seen_values[field] = [] elastalert_logger.info('Found no values for %s' % (field)) def add_data(self, data): for document in data: for field in self.fields: value = document.get(field) if not value and self.rules.get('alert_on_missing_field'): document['missing_field'] = field self.add_match(document) elif value: if value not in self.seen_values[field]: document['new_field'] = field self.add_match(document) self.seen_values[field].append(value) def add_terms_data(self, terms): # With terms query, len(self.fields) is always 1 field = self.fields[0] for timestamp, buckets in terms.iteritems(): for bucket in buckets: if bucket['doc_count']: if bucket['key'] not in self.seen_values[field]: match = {field: bucket['key'], self.rules['timestamp_field']: timestamp, 'new_field': field} self.add_match(match)
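The rule variants above all build the same request shape: a terms aggregation nested under a time-range filter aggregation. Here is a standalone sketch of that query, assuming an @timestamp field; the field and index names are examples only.

query = {
    "aggs": {
        "filtered": {
            "filter": {"bool": {"must": [{"range": {"@timestamp": {"gte": "now-30d", "lte": "now"}}}]}},
            "aggs": {"values": {"terms": {"field": "username", "size": 2147483647}}},
        }
    }
}
# res = es.search(index="logstash-*", body=query, ignore_unavailable=True, timeout='50s')
# seen = [bucket["key"] for bucket in res["aggregations"]["filtered"]["values"]["buckets"]]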
def match_query(es: _es, index: str, text: str): query = { "query": { "bool": { "must": [ { "match": { "matn_p": { "query": text, # "operator": "and" } } }, # { # "intervals": { # "matn_p": { # "all_of": { # "ordered": True, # "intervals": [ # { # "match": { # "query": text, # "max_gaps": 0, # "ordered": True # } # }, # # { # # "any_of": { # # "intervals": [ # # {"match": {"query": text}}, # # {"match": {"query": text}} # # ] # # } # # } # ] # } # } # } # } ], "filter": [ { "match": { "matn_p": { "query": text, # "operator": "and" } } }, ], "should": [ # { # "match": { # "matn_p": { # "query": text, # "operator": "and" # } # } # }, # { # "intervals": { # "matn_p": { # "all_of": { # "ordered": True, # "intervals": [ # { # "match": { # "query": text, # "max_gaps": 0, # "ordered": True # } # }, # # { # # "any_of": { # # "intervals": [ # # {"match": {"query": text}}, # # {"match": {"query": text}} # # ] # # } # # } # ] # } # } # } # } ] } } } query = { "query": { "simple_query_string": { "fields": ["matn_p","isnad_p"], "query": text , "flags": "OR|AND|PREFIX" } } } return es.search(index=index, body=query)
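Example call of match_query above; the `_es` alias is assumed to be `Elasticsearch`, and the client, index name, and search text are illustrative.

from elasticsearch import Elasticsearch as _es  # assumed alias

client = _es(hosts=["localhost:9200"])
res = match_query(client, "hadith", "some search text")
for hit in res["hits"]["hits"]:
    print(hit["_score"], hit["_source"].get("matn_p", ""))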
def main(): parser = argparse.ArgumentParser() parser.add_argument('--host', help='Elasticsearch host') parser.add_argument('--port', type=int, help='Elasticsearch port') parser.add_argument('--url-prefix', help='Elasticsearch URL prefix') parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth') parser.add_argument('--ssl', action='store_true', default=None, help='Use SSL') parser.add_argument('--no-ssl', dest='ssl', action='store_false', help='Do not use SSL') parser.add_argument('--index', help='Index name to create') parser.add_argument('--old-index', help='Old index name to copy') args = parser.parse_args() if os.path.isfile('../config.yaml'): filename = '../config.yaml' elif os.path.isfile('config.yaml'): filename = 'config.yaml' else: filename = '' username = None password = None use_ssl = None url_prefix = None http_auth = None if filename: with open(filename) as config_file: data = yaml.load(config_file) host = data.get('es_host') port = data.get('es_port') username = data.get('es_username') password = data.get('es_password') url_prefix = data.get('es_url_prefix', '') use_ssl = data.get('use_ssl') else: host = args.host if args.host else raw_input('Enter elasticsearch host: ') port = args.port if args.port else int(raw_input('Enter elasticsearch port: ')) use_ssl = (args.ssl if args.ssl is not None else raw_input('Use SSL? t/f: ').lower() in ('t', 'true')) if args.no_auth is None: username = raw_input('Enter optional basic-auth username: ') password = raw_input('Enter optional basic-auth password: ') url_prefix = (args.url_prefix if args.url_prefix is not None else raw_input('Enter optional Elasticsearch URL prefix: ')) if username and password: http_auth = username + ':' + password es = Elasticsearch(host=host, port=port, use_ssl=use_ssl, http_auth=http_auth, url_prefix=url_prefix) silence_mapping = {'silence': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'}, 'until': {'type': 'date', 'format': 'dateOptionalTime'}, '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}} ess_mapping = {'elastalert_status': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'}, '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}} es_mapping = {'elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'}, '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}, 'match_body': {'enabled': False, 'type': 'object'}, 'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}} past_mapping = {'past_elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'}, 'match_body': {'enabled': False, 'type': 'object'}, '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}, 'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}} error_mapping = {'elastalert_error': {'properties': {'data': {'type': 'object', 'enabled': False}, '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}} index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ') if not index: index = 'elastalert_status' old_index = (args.old_index if args.old_index is not None else raw_input('Name of existing index to copy? (Default None) ')) res = None if old_index: print('Downloading existing data...') res = es.search(index=old_index, body={}, size=500000) print('Got %s documents' % (len(res['hits']['hits']))) es.indices.create(index) # To avoid a race condition. 
TODO: replace this with a real check time.sleep(2) es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping) es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping) es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping) es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping) es.indices.put_mapping(index=index, doc_type='past_elastalert', body=past_mapping) print('New index %s created' % (index)) if res: bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}), json.dumps(doc['_source'])) for doc in res['hits']['hits']]) print('Uploading data...') es.bulk(body=bulk, index=index) print('Done!')
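One possible way to address the race-condition TODO in the script above is to wait for the new index to report (at least) yellow health instead of sleeping for a fixed two seconds; this is only a sketch, the connection details are hypothetical, and the timeout is arbitrary.

from elasticsearch import Elasticsearch

es = Elasticsearch(host="localhost", port=9200)  # hypothetical connection
index = "elastalert_status"
es.indices.create(index)
# Block until the index is allocated rather than guessing with time.sleep(2).
es.cluster.health(index=index, wait_for_status="yellow", timeout="30s")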
def boolean_query(es: _es, index: str, text: str):
    # Minimal bool query over the matn_p field (the same field match_query uses above);
    # the original stub called es.search() with no body and returned nothing.
    query = {"query": {"bool": {"must": [{"match": {"matn_p": text}}]}}}
    return es.search(index=index, body=query)
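For reference, a fuller sketch of the bool-query structure, combining must, filter, and should clauses; the field names mirror match_query above and the values are placeholders.

bool_query = {
    "query": {
        "bool": {
            "must":   [{"match": {"matn_p": "required words"}}],        # must match and contributes to the score
            "filter": [{"match": {"isnad_p": "narrator name"}}],        # must match, does not affect the score
            "should": [{"match_phrase": {"matn_p": "optional phrase"}}] # boosts the score when it matches
        }
    }
}
# es.search(index="hadith", body=bool_query)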
class NewTermsRule(RuleType): """ Alerts on a new value in a list of fields. """ def __init__(self, rule, args=None): super(NewTermsRule, self).__init__(rule, args) self.seen_values = {} # Allow the use of query_key or fields if 'fields' not in self.rules: if 'query_key' not in self.rules: raise EAException("fields or query_key must be specified") self.fields = self.rules['query_key'] else: self.fields = self.rules['fields'] if not self.fields: raise EAException("fields must not be an empty list") if type(self.fields) != list: self.fields = [self.fields] if self.rules.get('use_terms_query') and ( len(self.fields) != 1 or len(self.fields) == 1 and type(self.fields[0]) == list ): raise EAException("use_terms_query can only be used with a single non-composite field") try: self.get_all_terms(args) except Exception as e: # Refuse to start if we cannot get existing terms raise EAException('Error searching for existing terms: %s' % (repr(e))) def get_all_terms(self, args): """ Performs a terms aggregation for each field to get every existing term. """ self.es = Elasticsearch( host=self.rules['es_host'], port=self.rules['es_port'], timeout=self.rules.get('es_conn_timeout', 50), send_get_body_as=self.rules.get('send_get_body_as', 'GET') ) window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30})) field_name = {"field": "", "size": 2147483647} # Integer.MAX_VALUE query_template = {"aggs": {"values": {"terms": field_name}}} if args and args.start: end = ts_to_dt(args.start) else: end = ts_now() start = end - window_size step = datetime.timedelta(**self.rules.get('window_step_size', {'days': 1})) for field in self.fields: tmp_start = start tmp_end = min(start + step, end) time_filter = {self.rules['timestamp_field']: {'lt': dt_to_ts(tmp_end), 'gte': dt_to_ts(tmp_start)}} query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}} query = {'aggs': {'filtered': query_template}} # For composite keys, we will need to perform sub-aggregations if type(field) == list: self.seen_values.setdefault(tuple(field), []) level = query_template['aggs'] # Iterate on each part of the composite key and add a sub aggs clause to the elastic search query for i, sub_field in enumerate(field): level['values']['terms']['field'] = add_raw_postfix(sub_field) if i < len(field) - 1: # If we have more fields after the current one, then set up the next nested structure level['values']['aggs'] = {'values': {'terms': copy.deepcopy(field_name)}} level = level['values']['aggs'] else: self.seen_values.setdefault(field, []) # For non-composite keys, only a single agg is needed field_name['field'] = add_raw_postfix(field) # Query the entire time range in small chunks while tmp_start < end: if self.rules.get('use_strftime_index'): index = format_index(self.rules['index'], tmp_start, tmp_end) else: index = self.rules['index'] res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s') if 'aggregations' in res: buckets = res['aggregations']['filtered']['values']['buckets'] if type(field) == list: # For composite keys, make the lookup based on all fields # Make it a tuple since it can be hashed and used in dictionary lookups for bucket in buckets: # We need to walk down the hierarchy and obtain the value at each level self.seen_values[tuple(field)] += self.flatten_aggregation_hierarchy(bucket) else: keys = [bucket['key'] for bucket in buckets] self.seen_values[field] += keys else: self.seen_values.setdefault(field, []) if tmp_start == tmp_end: break tmp_start = tmp_end tmp_end = 
min(tmp_start + step, end) time_filter[self.rules['timestamp_field']] = {'lt': dt_to_ts(tmp_end), 'gte': dt_to_ts(tmp_start)} for key, values in self.seen_values.iteritems(): if not values: if type(key) == tuple: # If we don't have any results, it could either be because of the absence of any baseline data # OR it may be because the composite key contained a non-primitive type. Either way, give the # end-users a heads up to help them debug what might be going on. elastalert_logger.warning(( 'No results were found from all sub-aggregations. This can either indicate that there is ' 'no baseline data OR that a non-primitive field was used in a composite key.' )) else: elastalert_logger.info('Found no values for %s' % (field)) continue self.seen_values[key] = list(set(values)) elastalert_logger.info('Found %s unique values for %s' % (len(values), key)) def flatten_aggregation_hierarchy(self, root, hierarchy_tuple=()): """ For nested aggregations, the results come back in the following format: { "aggregations" : { "filtered" : { "doc_count" : 37, "values" : { "doc_count_error_upper_bound" : 0, "sum_other_doc_count" : 0, "buckets" : [ { "key" : "1.1.1.1", # IP address (root) "doc_count" : 13, "values" : { "doc_count_error_upper_bound" : 0, "sum_other_doc_count" : 0, "buckets" : [ { "key" : "80", # Port (sub-aggregation) "doc_count" : 3, "values" : { "doc_count_error_upper_bound" : 0, "sum_other_doc_count" : 0, "buckets" : [ { "key" : "ack", # Reason (sub-aggregation, leaf-node) "doc_count" : 3 }, { "key" : "syn", # Reason (sub-aggregation, leaf-node) "doc_count" : 1 } ] } }, { "key" : "82", # Port (sub-aggregation) "doc_count" : 3, "values" : { "doc_count_error_upper_bound" : 0, "sum_other_doc_count" : 0, "buckets" : [ { "key" : "ack", # Reason (sub-aggregation, leaf-node) "doc_count" : 3 }, { "key" : "syn", # Reason (sub-aggregation, leaf-node) "doc_count" : 3 } ] } } ] } }, { "key" : "2.2.2.2", # IP address (root) "doc_count" : 4, "values" : { "doc_count_error_upper_bound" : 0, "sum_other_doc_count" : 0, "buckets" : [ { "key" : "443", # Port (sub-aggregation) "doc_count" : 3, "values" : { "doc_count_error_upper_bound" : 0, "sum_other_doc_count" : 0, "buckets" : [ { "key" : "ack", # Reason (sub-aggregation, leaf-node) "doc_count" : 3 }, { "key" : "syn", # Reason (sub-aggregation, leaf-node) "doc_count" : 3 } ] } } ] } } ] } } } } Each level will either have more values and buckets, or it will be a leaf node We'll ultimately return a flattened list with the hierarchies appended as strings, e.g the above snippet would yield a list with: [ ('1.1.1.1', '80', 'ack'), ('1.1.1.1', '80', 'syn'), ('1.1.1.1', '82', 'ack'), ('1.1.1.1', '82', 'syn'), ('2.2.2.2', '443', 'ack'), ('2.2.2.2', '443', 'syn') ] A similar formatting will be performed in the add_data method and used as the basis for comparison """ results = [] # There are more aggregation hierarchies left. Traverse them. 
if 'values' in root: results += self.flatten_aggregation_hierarchy(root['values']['buckets'], hierarchy_tuple + (root['key'],)) else: # We've gotten to a sub-aggregation, which may have further sub-aggregations # See if we need to traverse further for node in root: if 'values' in node: results += self.flatten_aggregation_hierarchy(node, hierarchy_tuple) else: results.append(hierarchy_tuple + (node['key'],)) return results def add_data(self, data): for document in data: for field in self.fields: value = () lookup_field = field if type(field) == list: # For composite keys, make the lookup based on all fields # Make it a tuple since it can be hashed and used in dictionary lookups lookup_field = tuple(field) for sub_field in field: lookup_result = lookup_es_key(document, sub_field) if not lookup_result: value = None break value += (lookup_result,) else: value = lookup_es_key(document, field) if not value and self.rules.get('alert_on_missing_field'): document['missing_field'] = lookup_field self.add_match(copy.deepcopy(document)) elif value: if value not in self.seen_values[lookup_field]: document['new_field'] = lookup_field self.add_match(copy.deepcopy(document)) self.seen_values[lookup_field].append(value) def add_terms_data(self, terms): # With terms query, len(self.fields) is always 1 and the 0'th entry is always a string field = self.fields[0] for timestamp, buckets in terms.iteritems(): for bucket in buckets: if bucket['doc_count']: if bucket['key'] not in self.seen_values[field]: match = {field: bucket['key'], self.rules['timestamp_field']: timestamp, 'new_field': field} self.add_match(match) self.seen_values[field].append(bucket['key'])
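To make the flattening behaviour above concrete, here is a self-contained rewrite of the same walk applied to a tiny two-level sample instead of a real aggregation response; the sample keys and values are invented.

def flatten(root, prefix=()):
    # A bucket dict with a nested 'values' aggregation: descend and extend the key prefix.
    if isinstance(root, dict) and 'values' in root:
        return flatten(root['values']['buckets'], prefix + (root['key'],))
    results = []
    for node in root:
        if 'values' in node:
            results += flatten(node, prefix)
        else:
            # Leaf bucket: emit the full composite key as a tuple.
            results.append(prefix + (node['key'],))
    return results

sample_buckets = [
    {'key': '1.1.1.1', 'values': {'buckets': [{'key': '80'}, {'key': '82'}]}},
    {'key': '2.2.2.2', 'values': {'buckets': [{'key': '443'}]}},
]
print(flatten(sample_buckets))
# [('1.1.1.1', '80'), ('1.1.1.1', '82'), ('2.2.2.2', '443')]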
class ElastAlerter(): """ The main Elastalert runner. This class holds all state about active rules, controls when queries are run, and passes information between rules and alerts. :param args: An argparse arguments instance. Should contain debug and start :param conf: The configuration dictionary. At the top level, this contains global options, and under 'rules', contains all state relating to rules and alerts. In each rule in conf['rules'], the RuleType and Alerter instances live under 'type' and 'alerts', respectively. The conf dictionary should not be passed directly from a configuration file, but must be populated by config.py:load_rules instead. """ def parse_args(self, args): parser = argparse.ArgumentParser() parser.add_argument('--config', action='store', dest='config', default="config.yaml", help='Global config file (default: config.yaml)') parser.add_argument( '--debug', action='store_true', dest='debug', help='Suppresses alerts and prints information instead') parser.add_argument( '--rule', dest='rule', help= 'Run only a specific rule (by filename, must still be in rules folder)' ) parser.add_argument( '--silence', dest='silence', help= 'Silence rule for a time period. Must be used with --rule. Usage: ' '--silence <units>=<number>, eg. --silence hours=2') parser.add_argument( '--start', dest='start', help= 'YYYY-MM-DDTHH:MM:SS Start querying from this timestamp. (Default: present)' ) parser.add_argument( '--end', dest='end', help= 'YYYY-MM-DDTHH:MM:SS Query to this timestamp. (Default: present)') parser.add_argument( '--verbose', action='store_true', dest='verbose', help='Increase verbosity without suppressing alerts') parser.add_argument( '--pin_rules', action='store_true', dest='pin_rules', help='Stop ElastAlert from monitoring config file changes') self.args = parser.parse_args(args) def __init__(self, args): self.parse_args(args) self.conf = load_rules(self.args.config, use_rule=self.args.rule) self.max_query_size = self.conf['max_query_size'] self.rules = self.conf['rules'] self.debug = self.args.debug self.verbose = self.args.verbose self.writeback_index = self.conf['writeback_index'] self.es_host = self.conf['es_host'] self.es_port = self.conf['es_port'] self.run_every = self.conf['run_every'] self.alert_time_limit = self.conf['alert_time_limit'] self.old_query_limit = self.conf['old_query_limit'] self.alerts_sent = 0 self.num_hits = 0 self.current_es = None self.current_es_addr = None self.buffer_time = self.conf['buffer_time'] self.silence_cache = {} self.rule_hashes = get_rule_hashes(self.conf) self.writeback_es = Elasticsearch(host=self.es_host, port=self.es_port) if self.debug: self.verbose = True if self.verbose: logging.getLogger().setLevel(logging.INFO) for rule in self.rules: rule = self.init_rule(rule) if self.args.silence: self.silence() @staticmethod def get_index(rule, starttime=None, endtime=None): """ Gets the index for a rule. If strftime is set and starttime and endtime are provided, it will return a comma seperated list of indices. If strftime is set but starttime and endtime are not provided, it will replace all format tokens with a wildcard. 
""" index = rule['index'] if rule.get('use_strftime_index'): if starttime and endtime: return format_index(index, starttime, endtime) else: # Replace the substring containing format characters with a * format_start = index.find('%') format_end = index.rfind('%') + 2 return index[:format_start] + '*' + index[format_end:] else: return index @staticmethod def get_query(filters, starttime=None, endtime=None, sort=True, timestamp_field='@timestamp'): """ Returns a query dict that will apply a list of filters, filter by start and end time, and sort results by timestamp. :param filters: A list of elasticsearch filters to use. :param starttime: A timestamp to use as the start time of the query. :param endtime: A timestamp to use as the end time of the query. :param sort: If true, sort results by timestamp. (Default True) :return: A query dictionary to pass to elasticsearch. """ starttime = dt_to_ts(starttime) endtime = dt_to_ts(endtime) filters = copy.copy(filters) query = {'filter': {'bool': {'must': filters}}} if starttime and endtime: query['filter']['bool']['must'].append({ 'range': { timestamp_field: { 'from': starttime, 'to': endtime } } }) if sort: query['sort'] = [{timestamp_field: {'order': 'asc'}}] return query def get_terms_query(self, query, size, field): """ Takes a query generated by get_query and outputs a aggregation query """ if 'sort' in query: query.pop('sort') query.update( {'aggs': { 'counts': { 'terms': { 'field': field, 'size': size } } }}) aggs_query = {'aggs': {'filtered': query}} return aggs_query def get_index_start(self, index, timestamp_field='@timestamp'): """ Query for one result sorted by timestamp to find the beginning of the index. :param index: The index of which to find the earliest event. :return: Timestamp of the earliest event. """ query = {'sort': {timestamp_field: {'order': 'asc'}}} try: res = self.current_es.search(index=index, size=1, body=query, _source_include=[timestamp_field], ignore_unavailable=True) except ElasticsearchException as e: self.handle_error("Elasticsearch query error: %s" % (e), {'index': index}) return '1969-12-30T00:00:00Z' if len(res['hits']['hits']) == 0: # Index is completely empty, return a date before the epoch return '1969-12-30T00:00:00Z' timestamp = res['hits']['hits'][0]['_source'][timestamp_field] return timestamp def get_hits(self, rule, starttime, endtime, index): """ Query elasticsearch for the given rule and return the results. :param rule: The rule configuration. :param starttime: The earliest time to query. :param endtime: The latest time to query. :return: A list of hits, bounded by self.max_query_size. """ query = self.get_query(rule['filter'], starttime, endtime, timestamp_field=rule['timestamp_field']) try: res = self.current_es.search(index=index, size=self.max_query_size, body=query, _source_include=rule['include'], ignore_unavailable=True) except ElasticsearchException as e: # Elasticsearch sometimes gives us GIGANTIC error messages # (so big that they will fill the entire terminal buffer) if len(str(e)) > 1024: e = str(e)[:1024] + '... 
(%d characters removed)' % ( len(str(e)) - 1024) self.handle_error('Error running query: %s' % (e), {'rule': rule['name']}) return None hits = res['hits']['hits'] self.num_hits += len(hits) lt = rule.get('use_local_time') logging.info("Queried rule %s from %s to %s: %s hits" % (rule['name'], pretty_ts( starttime, lt), pretty_ts(endtime, lt), len(hits))) self.replace_ts(hits, rule) # Record doc_type for use in get_top_counts if 'doc_type' not in rule and len(hits): rule['doc_type'] = hits[0]['_type'] return hits def replace_ts(self, hits, rule): for hit in hits: hit['_source'][rule['timestamp_field']] = ts_to_dt( hit['_source'][rule['timestamp_field']]) def get_hits_count(self, rule, starttime, endtime, index): """ Query elasticsearch for the count of results and returns a list of timestamps equal to the endtime. This allows the results to be passed to rules which expect an object for each hit. :param rule: The rule configuration dictionary. :param starttime: The earliest time to query. :param endtime: The latest time to query. :return: A dictionary mapping timestamps to number of hits for that time period. """ query = self.get_query(rule['filter'], starttime, endtime, timestamp_field=rule['timestamp_field'], sort=False) query = {'query': {'filtered': query}} try: res = self.current_es.count(index=index, doc_type=rule['doc_type'], body=query, ignore_unavailable=True) except ElasticsearchException as e: # Elasticsearch sometimes gives us GIGANTIC error messages # (so big that they will fill the entire terminal buffer) if len(str(e)) > 1024: e = str(e)[:1024] + '... (%d characters removed)' % ( len(str(e)) - 1024) self.handle_error('Error running count query: %s' % (e), {'rule': rule['name']}) return None self.num_hits += res['count'] lt = rule.get('use_local_time') logging.info("Queried rule %s from %s to %s: %s hits" % (rule['name'], pretty_ts( starttime, lt), pretty_ts(endtime, lt), res['count'])) return {endtime: res['count']} def get_hits_terms(self, rule, starttime, endtime, index, key, qk=None): rule_filter = copy.copy(rule['filter']) if qk: filter_key = rule['query_key'] if rule.get('raw_count_keys', True) and not rule['query_key'].endswith('.raw'): filter_key += '.raw' rule_filter.extend([{'term': {filter_key: qk}}]) base_query = self.get_query(rule_filter, starttime, endtime, timestamp_field=rule['timestamp_field'], sort=False) query = self.get_terms_query(base_query, rule.get('terms_size', 5), key) try: res = self.current_es.search(index=index, doc_type=rule['doc_type'], body=query, search_type='count', ignore_unavailable=True) except ElasticsearchException as e: # Elasticsearch sometimes gives us GIGANTIC error messages # (so big that they will fill the entire terminal buffer) if len(str(e)) > 1024: e = str(e)[:1024] + '... 
(%d characters removed)' % ( len(str(e)) - 1024) self.handle_error('Error running query: %s' % (e), {'rule': rule['name']}) return None buckets = res['aggregations']['filtered']['counts']['buckets'] self.num_hits += len(buckets) lt = rule.get('use_local_time') logging.info('Queried rule %s from %s to %s: %s buckets' % (rule['name'], pretty_ts( starttime, lt), pretty_ts(endtime, lt), len(buckets))) return {endtime: buckets} def remove_duplicate_events(self, data, rule): # Remove data we've processed already data = [ event for event in data if event['_id'] not in rule['processed_hits'] ] # Remember the new data's IDs for event in data: rule['processed_hits'][event['_id']] = event['_source'][ rule['timestamp_field']] return [event['_source'] for event in data] def remove_old_events(self, rule): # Anything older than the buffer time we can forget now = ts_now() remove = [] buffer_time = rule.get('buffer_time', self.buffer_time) for _id, timestamp in rule['processed_hits'].iteritems(): if now - timestamp > buffer_time: remove.append(_id) map(rule['processed_hits'].pop, remove) def run_query(self, rule, start=None, end=None): """ Query for the rule and pass all of the results to the RuleType instance. :param rule: The rule configuration. :param start: The earliest time to query. :param end: The latest time to query. Returns True on success and False on failure. """ if start is None: start = self.get_index_start(rule['index']) if end is None: end = ts_now() # Reset hit counter and query rule_inst = rule['type'] prev_num_hits = self.num_hits max_size = rule.get('max_query_size', self.max_query_size) index = self.get_index(rule, start, end) if rule.get('use_count_query'): data = self.get_hits_count(rule, start, end, index) elif rule.get('use_terms_query'): data = self.get_hits_terms(rule, start, end, index, rule['query_key']) else: data = self.get_hits(rule, start, end, index) if data: data = self.remove_duplicate_events(data, rule) # There was an exception while querying if data is None: return False elif data: if rule.get('use_count_query'): rule_inst.add_count_data(data) elif rule.get('use_terms_query'): rule_inst.add_terms_data(data) else: rule_inst.add_data(data) # Warn if we hit max_query_size if self.num_hits - prev_num_hits == max_size and not rule.get( 'use_count_query'): logging.warning("Hit max_query_size (%s) while querying for %s" % (max_size, rule['name'])) return True def get_starttime(self, rule): """ Query ES for the last time we ran this rule. :param rule: The rule configuration. :return: A timestamp or None. """ query = { 'filter': { 'term': { 'rule_name': '%s' % (rule['name']) } }, 'sort': { '@timestamp': { 'order': 'desc' } } } try: if self.writeback_es: res = self.writeback_es.search( index=self.writeback_index, doc_type='elastalert_status', size=1, body=query, _source_include=['endtime', 'rule_name']) if res['hits']['hits']: endtime = ts_to_dt( res['hits']['hits'][0]['_source']['endtime']) if ts_now() - endtime < self.old_query_limit: return endtime else: logging.info( "Found expired previous run for %s at %s" % (rule['name'], endtime)) return None except (ElasticsearchException, KeyError) as e: self.handle_error('Error querying for last run: %s' % (e), {'rule': rule['name']}) self.writeback_es = None return None def set_starttime(self, rule, endtime): """ Given a rule and an endtime, sets the appropriate starttime for it. 
""" # This means we are starting fresh if 'starttime' not in rule: # Try to get the last run from elasticsearch last_run_end = self.get_starttime(rule) if last_run_end: rule['starttime'] = last_run_end return # Use buffer for normal queries, or run_every increments otherwise buffer_time = rule.get('buffer_time', self.buffer_time) if not rule.get('use_count_query') and not rule.get('use_terms_query'): rule['starttime'] = endtime - buffer_time else: rule['starttime'] = endtime - self.run_every def run_rule(self, rule, endtime, starttime=None): """ Run a rule for a given time period, including querying and alerting on results. :param rule: The rule configuration. :param starttime: The earliest timestamp to query. :param endtime: The latest timestamp to query. :return: The number of matches that the rule produced. """ run_start = time.time() self.current_es = Elasticsearch(host=rule['es_host'], port=rule['es_port']) self.current_es_addr = (rule['es_host'], rule['es_port']) # If there are pending aggregate matches, try processing them for x in range(len(rule['agg_matches'])): match = rule['agg_matches'].pop() self.add_aggregated_alert(match, rule) # Start from provided time if it's given if starttime: rule['starttime'] = starttime else: self.set_starttime(rule, endtime) rule['original_starttime'] = rule['starttime'] # Don't run if starttime was set to the future if ts_now() <= rule['starttime']: logging.warning( "Attempted to use query start time in the future (%s), sleeping instead" % (starttime)) return 0 # Run the rule # If querying over a large time period, split it up into chunks self.num_hits = 0 tmp_endtime = endtime buffer_time = rule.get('buffer_time', self.buffer_time) while endtime - rule['starttime'] > buffer_time: tmp_endtime = rule['starttime'] + self.run_every if not self.run_query(rule, rule['starttime'], tmp_endtime): return 0 rule['starttime'] = tmp_endtime if not self.run_query(rule, rule['starttime'], endtime): return 0 rule['type'].garbage_collect(endtime) # Process any new matches num_matches = len(rule['type'].matches) while rule['type'].matches: match = rule['type'].matches.pop(0) # If realert is set, silence the rule for that duration # Silence is cached by query_key, if it exists # Default realert time is 0 seconds # concatenate query_key (or none) with rule_name to form silence_cache key if 'query_key' in rule: try: key = '.' + match[rule['query_key']] except KeyError: # Some matches may not have a query key key = '' else: key = '' if self.is_silenced(rule['name'] + key) or self.is_silenced( rule['name']): logging.info('Ignoring match for silenced rule %s%s' % (rule['name'], key)) continue if rule['realert']: self.set_realert( rule['name'] + key, dt_to_ts(datetime.datetime.utcnow() + rule['realert'])) # If no aggregation, alert immediately if not rule['aggregation']: self.alert([match], rule) continue # Add it as an aggregated match self.add_aggregated_alert(match, rule) time_taken = time.time() - run_start # Write to ES that we've run this rule against this time period body = { 'rule_name': rule['name'], 'endtime': endtime, 'starttime': rule['starttime'], 'matches': num_matches, 'hits': self.num_hits, '@timestamp': ts_now(), 'time_taken': time_taken } self.writeback('elastalert_status', body) return num_matches def init_rule(self, new_rule, new=True): ''' Copies some necessary non-config state from an exiting rule to a new rule. 
''' if 'download_dashboard' in new_rule['filter']: # Download filters from kibana and set the rules filters to them db_filters = self.filters_from_kibana( new_rule, new_rule['filter']['download_dashboard']) if db_filters is not None: new_rule['filter'] = db_filters else: raise EAException("Could not download filters from %s" % (new_rule['filter']['download_dashboard'])) blank_rule = { 'agg_matches': [], 'current_aggregate_id': None, 'processed_hits': {} } rule = blank_rule # Set rule to either a blank template or existing rule with same name if not new: for rule in self.rules: if rule['name'] == new_rule['name']: break else: logging.warning( "Couldn't find existing rule %s, starting from scratch" % (new_rule['name'])) rule = blank_rule copy_properties = [ 'agg_matches', 'current_aggregate_id', 'processed_hits', 'starttime' ] for prop in copy_properties: if prop == 'starttime' and 'starttime' not in rule: continue new_rule[prop] = rule[prop] return new_rule def load_rule_changes(self): ''' Using the modification times of rule config files, syncs the running rules to match the files in rules_folder by removing, adding or reloading rules. ''' rule_hashes = get_rule_hashes(self.conf) # Check each current rule for changes for rule_file, hash_value in self.rule_hashes.iteritems(): if rule_file not in rule_hashes: # Rule file was deleted logging.info( 'Rule file %s not found, stopping rule execution' % (rule_file)) self.rules = [ rule for rule in self.rules if rule['rule_file'] != rule_file ] continue if hash_value != rule_hashes[rule_file]: # Rule file was changed, reload rule try: new_rule = load_configuration( os.path.join(self.conf['rules_folder'], rule_file)) except EAException as e: self.handle_error('Could not load rule %s: %s' % (rule_file, e)) continue logging.info("Reloading configuration for rule %s" % (rule_file)) # Initialize the rule that matches rule_file self.rules = [ rule if rule['rule_file'] != rule_file else self.init_rule( new_rule, False) for rule in self.rules ] # Load new rules if not self.args.rule: for rule_file in set(rule_hashes.keys()) - set( self.rule_hashes.keys()): try: new_rule = load_configuration( os.path.join(self.conf['rules_folder'], rule_file)) except EAException as e: self.handle_error('Could not load rule %s: %s' % (rule_file, e)) continue logging.info('Loaded new rule %s' % (rule_file)) self.rules.append(self.init_rule(new_rule)) self.rule_hashes = rule_hashes def start(self): """ Periodically go through each rule and run it """ starttime = self.args.start if starttime: try: starttime = ts_to_dt(starttime) except (TypeError, ValueError): self.handle_error( "%s is not a valid ISO 8601 timestamp (YYYY-MM-DDTHH:MM:SS+XX:00)" % (starttime)) exit(1) while True: # If writeback_es errored, it's disabled until the next query cycle if not self.writeback_es: self.writeback_es = Elasticsearch(host=self.es_host, port=self.es_port) self.send_pending_alerts() next_run = datetime.datetime.utcnow() + self.run_every for rule in self.rules: # Set endtime based on the rule's delay delay = rule.get('query_delay') if hasattr(self.args, 'end') and self.args.end: endtime = ts_to_dt(self.args.end) elif delay: endtime = ts_now() - delay else: endtime = ts_now() try: num_matches = self.run_rule(rule, endtime, starttime) except EAException as e: self.handle_error( "Error running rule %s: %s" % (rule['name'], e), {'rule': rule['name']}) else: old_starttime = pretty_ts(rule.get('original_starttime'), rule.get('use_local_time')) logging.info( "Ran %s from %s to %s: %s query hits, %s 
matches," " %s alerts sent" % (rule['name'], old_starttime, pretty_ts(endtime, rule.get('use_local_time')), self.num_hits, num_matches, self.alerts_sent)) self.alerts_sent = 0 self.remove_old_events(rule) if next_run < datetime.datetime.utcnow(): # We were processing for longer than our refresh interval # This can happen if --start was specified with a large time period # or if we are running too slow to process events in real time. logging.warning("Querying from %s to %s took longer than %s!" % (old_starttime, endtime, self.run_every)) continue # Only force starttime once starttime = None if not self.args.pin_rules: self.load_rule_changes() # Wait before querying again sleep_for = (next_run - datetime.datetime.utcnow()).seconds logging.info("Sleeping for %s seconds" % (sleep_for)) time.sleep(sleep_for) def generate_kibana_db(self, rule, match): ''' Uses a template dashboard to upload a temp dashboard showing the match. Returns the url to the dashboard. ''' db = copy.deepcopy(kibana.dashboard_temp) # Set filters for filter in rule['filter']: if filter: kibana.add_filter(db, filter) kibana.set_included_fields(db, rule['include']) # Set index index = self.get_index(rule) kibana.set_index_name(db, index) return self.upload_dashboard(db, rule, match) def upload_dashboard(self, db, rule, match): ''' Uploads a dashboard schema to the kibana-int elasticsearch index associated with rule. Returns the url to the dashboard. ''' # Set time range start = ts_add(match[rule['timestamp_field']], -rule.get('timeframe', datetime.timedelta(minutes=10))) end = ts_add(match[rule['timestamp_field']], datetime.timedelta(minutes=10)) kibana.set_time(db, start, end) # Set dashboard name db_name = 'ElastAlert - %s - %s' % (rule['name'], end) kibana.set_name(db, db_name) # Add filter for query_key value if 'query_key' in rule: if rule['query_key'] in match: term = {'term': {rule['query_key']: match[rule['query_key']]}} kibana.add_filter(db, term) # Convert to json db_js = json.dumps(db) db_body = { 'user': '******', 'group': 'guest', 'title': db_name, 'dashboard': db_js } # Upload es = Elasticsearch(host=rule['es_host'], port=rule['es_port']) res = es.create(index='kibana-int', doc_type='temp', body=db_body) # Return dashboard URL kibana_url = rule.get('kibana_dashboard') if not kibana_url: kibana_url = 'http://%s:%s/_plugin/kibana/' % (rule['es_host'], rule['es_port']) return kibana_url + '#/dashboard/temp/%s' % (res['_id']) def get_dashboard(self, rule, db_name): """ Download dashboard which matches use_kibana_dashboard from elasticsearch. """ es = Elasticsearch(host=rule['es_host'], port=rule['es_port']) if not db_name: raise EAException("use_kibana_dashboard undefined") query = {'query': {'term': {'_id': db_name}}} try: res = es.search(index='kibana-int', doc_type='dashboard', body=query, _source_include=['dashboard']) except ElasticsearchException as e: raise EAException("Error querying for dashboard: %s" % (e)) if res['hits']['hits']: return json.loads(res['hits']['hits'][0]['_source']['dashboard']) else: raise EAException("Could not find dashboard named %s" % (db_name)) def use_kibana_link(self, rule, match): """ Uploads an existing dashboard as a temp dashboard modified for match time. Returns the url to the dashboard. 
""" # Download or get cached dashboard dashboard = rule.get('dashboard_schema') if not dashboard: db_name = rule.get('use_kibana_dashboard') dashboard = self.get_dashboard(rule, db_name) if dashboard: rule['dashboard_schema'] = dashboard else: return None dashboard = copy.deepcopy(dashboard) return self.upload_dashboard(dashboard, rule, match) def filters_from_kibana(self, rule, db_name): """ Downloads a dashboard from kibana and returns corresponding filters, None on error. """ try: db = rule.get('dashboard_schema') if not db: db = self.get_dashboard(rule, db_name) filters = kibana.filters_from_dashboard(db) except EAException: return None return filters def alert(self, matches, rule, alert_time=None): """ Send out an alert. :param matches: A list of matches. :param rule: A rule configuration. """ if alert_time is None: alert_time = ts_now() # Compute top count keys if rule.get('top_count_keys'): for match in matches: if 'query_key' in rule and rule['query_key'] in match: qk = match[rule['query_key']] else: qk = None start = ts_to_dt(match[rule['timestamp_field']]) - rule.get( 'timeframe', datetime.timedelta(minutes=10)) end = ts_to_dt( match[rule['timestamp_field']]) + datetime.timedelta( minutes=10) keys = rule.get('top_count_keys') counts = self.get_top_counts(rule, start, end, keys, rule.get('top_count_number'), qk) match.update(counts) # Generate a kibana dashboard for the first match if rule.get('generate_kibana_link') or rule.get( 'use_kibana_dashboard'): try: if rule.get('generate_kibana_link'): kb_link = self.generate_kibana_db(rule, matches[0]) else: kb_link = self.use_kibana_link(rule, matches[0]) except EAException as e: self.handle_error( "Could not generate kibana dash for %s match: %s" % (rule['name'], e)) else: if kb_link: matches[0]['kibana_link'] = kb_link for enhancement in rule['match_enhancements']: for match in matches: try: enhancement.process(match) except EAException as e: self.handle_error( "Error running match enhancement: %s" % (e), {'rule': rule['name']}) # Don't send real alerts in debug mode if self.debug: alerter = DebugAlerter(rule) alerter.alert(matches) return # Run the alerts alert_sent = False alert_exception = None for alert in rule['alert']: try: alert.alert(matches) except EAException as e: self.handle_error( 'Error while running alert %s: %s' % (alert.get_info()['type'], e), {'rule': rule['name']}) alert_exception = str(e) else: self.alerts_sent += 1 alert_sent = True # Write the alert(s) to ES agg_id = None for match in matches: alert_body = self.get_alert_body(match, rule, alert_sent, alert_time, alert_exception) # Set all matches to aggregate together if agg_id: alert_body['aggregate_id'] = agg_id res = self.writeback('elastalert', alert_body) if res and not agg_id: agg_id = res['_id'] def get_alert_body(self, match, rule, alert_sent, alert_time, alert_exception=None): body = {'match_body': match} body['rule_name'] = rule['name'] # TODO record info about multiple alerts body['alert_info'] = rule['alert'][0].get_info() body['alert_sent'] = alert_sent body['alert_time'] = alert_time # If the alert failed to send, record the exception if not alert_sent: body['alert_exception'] = alert_exception return body def writeback(self, doc_type, body): # Convert any datetime objects to timestamps for key in body.keys(): if isinstance(body[key], datetime.datetime): body[key] = dt_to_ts(body[key]) if self.debug: logging.info("Skipping writing to ES: %s" % (body)) return None if '@timestamp' not in body: body['@timestamp'] = dt_to_ts(ts_now()) if 
    def writeback(self, doc_type, body):
        # Convert any datetime objects to timestamps
        for key in body.keys():
            if isinstance(body[key], datetime.datetime):
                body[key] = dt_to_ts(body[key])
        if self.debug:
            logging.info("Skipping writing to ES: %s" % (body))
            return None
        if '@timestamp' not in body:
            body['@timestamp'] = dt_to_ts(ts_now())
        if self.writeback_es:
            try:
                res = self.writeback_es.create(index=self.writeback_index, doc_type=doc_type, body=body)
                return res
            except ElasticsearchException as e:
                logging.exception("Error writing alert info to elasticsearch: %s" % (e))
                self.writeback_es = None
        return None

    def find_recent_pending_alerts(self, time_limit):
        """ Queries writeback_es to find alerts that did not send and are newer than time_limit """
        query = {'query': {'query_string': {'query': 'alert_sent:false'}},
                 'filter': {'range': {'alert_time': {'from': dt_to_ts(ts_now() - time_limit),
                                                     'to': dt_to_ts(ts_now())}}}}
        if self.writeback_es:
            try:
                res = self.writeback_es.search(index=self.writeback_index, doc_type='elastalert',
                                               body=query, size=1000)
                if res['hits']['hits']:
                    return res['hits']['hits']
            except:
                pass
        return []

    def send_pending_alerts(self):
        pending_alerts = self.find_recent_pending_alerts(self.alert_time_limit)
        for alert in pending_alerts:
            _id = alert['_id']
            alert = alert['_source']
            try:
                rule_name = alert.pop('rule_name')
                alert_time = alert.pop('alert_time')
                match_body = alert.pop('match_body')
            except KeyError:
                # Malformed alert, drop it
                continue

            agg_id = alert.get('aggregate_id', None)
            if agg_id:
                # Aggregated alerts will be taken care of by get_aggregated_matches
                continue

            # Find original rule
            for rule in self.rules:
                if rule['name'] == rule_name:
                    break
            else:
                # Original rule is missing, drop alert
                continue

            # Retry the alert unless it's a future alert
            if ts_now() > ts_to_dt(alert_time):
                aggregated_matches = self.get_aggregated_matches(_id)
                if aggregated_matches:
                    matches = [match_body] + [agg_match['match_body'] for agg_match in aggregated_matches]
                    self.alert(matches, rule, alert_time=alert_time)
                    rule['current_aggregate_id'] = None
                else:
                    self.alert([match_body], rule, alert_time=alert_time)

                # Delete it from the index
                try:
                    self.writeback_es.delete(index=self.writeback_index, doc_type='elastalert', id=_id)
                except:
                    self.handle_error("Failed to delete alert %s at %s" % (_id, alert_time))

        # Send in memory aggregated alerts
        for rule in self.rules:
            if rule['agg_matches']:
                if ts_now() > rule['aggregate_alert_time']:
                    self.alert(rule['agg_matches'], rule)
                    rule['agg_matches'] = []

    def get_aggregated_matches(self, _id):
        """ Removes and returns all matches from writeback_es that have aggregate_id == _id """
        query = {'query': {'query_string': {'query': 'aggregate_id:%s' % (_id)}}}
        matches = []
        if self.writeback_es:
            try:
                res = self.writeback_es.search(index=self.writeback_index, doc_type='elastalert', body=query)
                for match in res['hits']['hits']:
                    matches.append(match['_source'])
                    self.writeback_es.delete(index=self.writeback_index, doc_type='elastalert', id=match['_id'])
            except (KeyError, ElasticsearchException) as e:
                self.handle_error("Error fetching aggregated matches: %s" % (e), {'id': _id})
        return matches
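
    # Illustrative sketch (not part of the original source): the same "unsent alerts"
    # lookup that find_recent_pending_alerts() performs, written as a standalone
    # elasticsearch-py call. The writeback index name ('elastalert_status') and the
    # 24 hour window are assumptions; the real value comes from self.writeback_index.
    #
    #     from elasticsearch import Elasticsearch
    #     es = Elasticsearch(host='localhost', port=9200)
    #     pending = es.search(index='elastalert_status', doc_type='elastalert',
    #                         body={'query': {'query_string': {'query': 'alert_sent:false'}},
    #                               'filter': {'range': {'alert_time': {'from': 'now-24h',
    #                                                                   'to': 'now'}}}},
    #                         size=1000)['hits']['hits']
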
""" if not rule['current_aggregate_id'] or rule[ 'aggregate_alert_time'] < ts_to_dt( match[rule['timestamp_field']]): # First match, set alert_time match_time = ts_to_dt(match[rule['timestamp_field']]) alert_time = match_time + rule['aggregation'] rule['aggregate_alert_time'] = alert_time agg_id = None else: # Already pending aggregation, use existing alert_time alert_time = rule['aggregate_alert_time'] agg_id = rule['current_aggregate_id'] logging.info( 'Adding alert for %s to aggregation, next alert at %s' % (rule['name'], alert_time)) alert_body = self.get_alert_body(match, rule, False, alert_time) if agg_id: alert_body['aggregate_id'] = agg_id res = self.writeback('elastalert', alert_body) # If new aggregation, save _id if res and not agg_id: rule['current_aggregate_id'] = res['_id'] # Couldn't write the match to ES, save it in memory for now if not res: rule['agg_matches'].append(match) return res def silence(self): """ Silence an alert for a period of time. --silence and --rule must be passed as args. """ if not self.args.rule: logging.error('--silence must be used with --rule') exit(1) # With --rule, self.rules will only contain that specific rule rule_name = self.rules[0]['name'] try: unit, num = self.args.silence.split('=') silence_time = datetime.timedelta(**{unit: int(num)}) silence_ts = dt_to_ts(silence_time + datetime.datetime.utcnow()) except (ValueError, TypeError): logging.error('%s is not a valid time period' % (self.args.silence)) exit(1) if not self.set_realert(rule_name, silence_ts): logging.error('Failed to save silence command to elasticsearch') exit(1) logging.info('Success. %s will be silenced until %s' % (rule_name, silence_ts)) def set_realert(self, rule_name, timestamp): """ Write a silence to elasticsearch for rule_name until timestamp. """ body = { 'rule_name': rule_name, '@timestamp': ts_now(), 'until': timestamp } self.silence_cache[rule_name] = timestamp return self.writeback('silence', body) def is_silenced(self, rule_name): """ Checks if rule_name is currently silenced. Returns false on exception. """ if rule_name in self.silence_cache: if ts_now() < ts_to_dt(self.silence_cache[rule_name]): return True else: self.silence_cache.pop(rule_name) return False query = { 'filter': { 'term': { 'rule_name': rule_name } }, 'sort': { 'until': { 'order': 'desc' } } } if self.writeback_es: try: res = self.writeback_es.search(index=self.writeback_index, doc_type='silence', size=1, body=query, _source_include=['until']) except ElasticsearchException as e: self.handle_error( "Error while querying for alert silence status: %s" % (e), {'rule': rule_name}) return False if res['hits']['hits']: until_ts = res['hits']['hits'][0]['_source']['until'] if ts_now() < ts_to_dt(until_ts): self.silence_cache[rule_name] = until_ts return True return False def handle_error(self, message, data=None): ''' Logs message at error level and writes message, data and traceback to Elasticsearch. ''' if not self.writeback_es: self.writeback_es = Elasticsearch(host=self.es_host, port=self.es_port) logging.error(message) body = {'message': message} tb = traceback.format_exc() body['traceback'] = tb.strip().split('\n') if data: body['data'] = data self.writeback('elastalert_error', body) def get_top_counts(self, rule, starttime, endtime, keys, number=5, qk=None): """ Counts the number of events for each unique value for each key field. Returns a dictionary with top_events_<key> mapped to the top 5 counts for each key. 
""" all_counts = {} for key in keys: index = self.get_index(rule, starttime, endtime) buckets = self.get_hits_terms(rule, starttime, endtime, index, key, qk).values()[0] # get_hits_terms adds to num_hits, but we don't want to count these self.num_hits -= len(buckets) terms = {} for bucket in buckets: terms[bucket['key']] = bucket['doc_count'] counts = terms.items() counts.sort(key=lambda x: x[1], reverse=True) # Save a dict with the top 5 events by key all_counts['top_events_%s' % (key)] = dict(counts[:number]) return all_counts