Example #1
def search_fuzzy(request=None, project_id=None):
    project_id = project_id if project_id \
        else json.loads(request.session['project_id'])

    index_name = elastic_cache_key(project_id, 'ec2')
    ebs_index_name = elastic_cache_key(project_id, 'ebs')
    elb_index_name = elastic_cache_key(project_id, 'elb')
    eip_index_name = elastic_cache_key(project_id, 'eip')
    vpc_index_name = elastic_cache_key(project_id, 'vpc')
    subnet_index_name = elastic_cache_key(project_id, 'subnet')
    security_group_index_name = elastic_cache_key(project_id, 'security_group')

    st = request.GET.get('st', None)
    client = Elasticsearch(hosts=settings.ELASTIC_SEARCH_NODES)

    query = {
        "query": {
            "query_string": {
                "fields": ["title"],
                "query": "*" + st + "*",
            }
        },
    }

    total = client.search(index=[
        index_name, ebs_index_name, elb_index_name, eip_index_name,
        vpc_index_name, subnet_index_name, security_group_index_name
    ],
                          doc_type=[
                              "instance_id", "name_title", "prip_title",
                              "puip_title", "ebs", "eip", "elb", "vpc",
                              "subnet", "security_group_id",
                              "security_group_name"
                          ],
                          body=query,
                          ignore_unavailable=True)['hits']['total']

    # Get the total hit count first, then set the query's size parameter to that
    # value so the second search returns every result.
    # TODO: discuss and optimize (a scroll-based alternative is sketched after this function)
    query['size'] = total

    search_results = client.search(index=[
        index_name, ebs_index_name, elb_index_name, eip_index_name,
        vpc_index_name, subnet_index_name, security_group_index_name
    ],
                                   doc_type=[
                                       "instance_id", "name_title",
                                       "prip_title", "puip_title", "ebs",
                                       "eip", "elb", "vpc", "subnet",
                                       "security_group_id",
                                       "security_group_name"
                                   ],
                                   body=query,
                                   ignore_unavailable=True)
    return search_results
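One way to avoid running the query twice just to learn the hit count, as the TODO above asks, is to stream every hit with the scan helper. A minimal sketch, assuming the elasticsearch-py helpers module and the same index list and query body assembled in search_fuzzy:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

def search_fuzzy_scan(client, index_names, query):
    # Sketch only: scan() pages through all hits with the scroll API, so no
    # "count first, then resize" pass is needed. `index_names` and `query`
    # stand in for the values built inside search_fuzzy above.
    return list(scan(client, query=query, index=index_names,
                     ignore_unavailable=True))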
def main():
    es_host = raw_input("Elasticsearch host: ")
    es_port = raw_input("Elasticsearch port: ")
    db_name = raw_input("Dashboard name: ")
    send_get_body_as = raw_input(
        "Method for querying Elasticsearch[GET]: ") or 'GET'
    es = Elasticsearch(host=es_host,
                       port=es_port,
                       send_get_body_as=send_get_body_as)
    query = {'query': {'term': {'_id': db_name}}}
    res = es.search(index='kibana-int',
                    doc_type='dashboard',
                    body=query,
                    _source_include=['dashboard'])
    if not res['hits']['hits']:
        print("No dashboard %s found" % (db_name))
        exit()

    db = json.loads(res['hits']['hits'][0]['_source']['dashboard'])
    config_filters = filters_from_dashboard(db)

    print("\nPartial Config file")
    print("-----------\n")
    print("name: %s" % (db_name))
    print("es_host: %s" % (es_host))
    print("es_port: %s" % (es_port))
    print("filter:")
    print(yaml.safe_dump(config_filters))
Example #3
class TestReindexer(unittest.TestCase):
    def setUp(self):
        self.source_index = "reindex"
        self.target_index = "reindex-a"
        self.client = Elasticsearch()
        self.reindexer = Reindexer(self.client)
        self.schema_manager = SchemaManager(self.client)

        # try:
        #     read_only_setting = {"index": {"blocks": {"read_only": False}}}
        #     self.client.indices.put_settings(index=self.source_index, body=read_only_setting)
        # except:
        #     pass

        self.client.indices.create(index=self.source_index)

    def tearDown(self):
        for index in [self.source_index, self.target_index]:
            try:
                self.client.indices.delete(index=index)
            except:
                pass

    def test_reindex(self):
        create = []
        for i in ['a', 'b', 'c', 'd', 'e']:
            doc = {
                '_op_type': 'create',
                '_index': self.source_index,
                '_type': 'document',
                'doc': {'name': i}
            }
            create.append(doc)
        bulk(self.client, create, refresh=True)
        docs = self.client.search(index=self.source_index)
        self.assertEqual(len(docs['hits']['hits']), 5)

        self.reindexer.do_reindex(self.source_index, self.target_index, 3)

        self.client.indices.refresh(','.join([self.source_index, self.target_index]))
        docs = self.client.search(index=self.source_index)
        self.assertEqual(len(docs['hits']['hits']), 5)
        docs = self.client.search(index=self.target_index)
        self.assertEqual(len(docs['hits']['hits']), 5)
Example #4
    def get_dashboard(self, rule, db_name):
        """ Download dashboard which matches use_kibana_dashboard from elasticsearch. """
        es = Elasticsearch(host=rule['es_host'], port=rule['es_port'])
        if not db_name:
            raise EAException("use_kibana_dashboard undefined")
        query = {'query': {'term': {'_id': db_name}}}
        try:
            res = es.search(index='kibana-int',
                            doc_type='dashboard',
                            body=query,
                            _source_include=['dashboard'])
        except ElasticsearchException as e:
            raise EAException("Error querying for dashboard: %s" % (e))

        if res['hits']['hits']:
            return json.loads(res['hits']['hits'][0]['_source']['dashboard'])
        else:
            raise EAException("Could not find dashboard named %s" % (db_name))
def main():
    es_host = raw_input("Elasticsearch host: ")
    es_port = raw_input("Elasticsearch port: ")
    db_name = raw_input("Dashboard name: ")
    es = Elasticsearch(host=es_host, port=es_port)
    query = {'query': {'term': {'_id': db_name}}}
    res = es.search(index='kibana-int', doc_type='dashboard', body=query, _source_include=['dashboard'])
    if not res['hits']['hits']:
        print("No dashboard %s found" % (db_name))
        exit()

    db = json.loads(res['hits']['hits'][0]['_source']['dashboard'])
    config_filters = filters_from_dashboard(db)

    print("\nPartial Config file")
    print("-----------\n")
    print("name: %s" % (db_name))
    print("es_host: %s" % (es_host))
    print("es_port: %s" % (es_port))
    print("filter:")
    print(yaml.safe_dump(config_filters))
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Elasticsearch host')
    parser.add_argument('--port', type=int, help='Elasticsearch port')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth',
                        action='store_const',
                        const=True,
                        help='Suppress prompt for basic auth')
    parser.add_argument('--ssl',
                        action='store_true',
                        default=None,
                        help='Use SSL')
    parser.add_argument('--no-ssl',
                        dest='ssl',
                        action='store_false',
                        help='Do not use SSL')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    parser.add_argument('--boto-profile',
                        default=None,
                        help='Boto profile to use for signing requests')
    parser.add_argument('--aws-region',
                        default=None,
                        help='AWS Region to use for signing requests')
    args = parser.parse_args()

    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = args.host if args.host else data.get('es_host')
        port = args.port if args.port else data.get('es_port')
        username = data.get('es_username')
        password = data.get('es_password')
        url_prefix = args.url_prefix if args.url_prefix is not None else data.get(
            'es_url_prefix', '')
        use_ssl = args.ssl if args.ssl is not None else data.get('use_ssl')
        aws_region = data.get('aws_region', None)
    else:
        username = None
        password = None
        aws_region = args.aws_region
        host = args.host if args.host else raw_input(
            'Enter elasticsearch host: ')
        port = args.port if args.port else int(
            raw_input('Enter elasticsearch port: '))
        use_ssl = (args.ssl if args.ssl is not None else
                   raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if args.no_auth is None:
            username = raw_input('Enter optional basic-auth username: ')
            password = raw_input('Enter optional basic-auth password: ')
        url_prefix = (args.url_prefix if args.url_prefix is not None else
                      raw_input('Enter optional Elasticsearch URL prefix: '))

    auth = Auth()
    http_auth = auth(host=host,
                     username=username,
                     password=password,
                     aws_region=aws_region,
                     boto_profile=args.boto_profile)

    es = Elasticsearch(host=host,
                       port=port,
                       use_ssl=use_ssl,
                       connection_class=RequestsHttpConnection,
                       http_auth=http_auth,
                       url_prefix=url_prefix)

    silence_mapping = {
        'silence': {
            'properties': {
                'rule_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'until': {
                    'type': 'date',
                    'format': 'dateOptionalTime'
                },
                '@timestamp': {
                    'format': 'dateOptionalTime',
                    'type': 'date'
                }
            }
        }
    }
    ess_mapping = {
        'elastalert_status': {
            'properties': {
                'rule_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                '@timestamp': {
                    'format': 'dateOptionalTime',
                    'type': 'date'
                }
            }
        }
    }
    es_mapping = {
        'elastalert': {
            'properties': {
                'rule_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                '@timestamp': {
                    'format': 'dateOptionalTime',
                    'type': 'date'
                },
                'alert_time': {
                    'format': 'dateOptionalTime',
                    'type': 'date'
                },
                'match_body': {
                    'enabled': False,
                    'type': 'object'
                },
                'aggregate_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                }
            }
        }
    }
    past_mapping = {
        'past_elastalert': {
            'properties': {
                'rule_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'match_body': {
                    'enabled': False,
                    'type': 'object'
                },
                '@timestamp': {
                    'format': 'dateOptionalTime',
                    'type': 'date'
                },
                'aggregate_id': {
                    'index': 'not_analyzed',
                    'type': 'string'
                }
            }
        }
    }
    error_mapping = {
        'elastalert_error': {
            'properties': {
                'data': {
                    'type': 'object',
                    'enabled': False
                },
                '@timestamp': {
                    'format': 'dateOptionalTime',
                    'type': 'date'
                }
            }
        }
    }

    index = args.index if args.index is not None else raw_input(
        'New index name? (Default elastalert_status) ')
    if not index:
        index = 'elastalert_status'

    old_index = (args.old_index if args.old_index is not None else
                 raw_input('Name of existing index to copy? (Default None) '))

    res = None
    if old_index:
        print('Downloading existing data...')
        res = es.search(index=old_index, body={}, size=500000)
        print('Got %s documents' % (len(res['hits']['hits'])))

    es_index = IndicesClient(es)
    if es_index.exists(index):
        print('Index ' + index + ' already exists. Skipping index creation.')
        return None

    es.indices.create(index)
    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index,
                           doc_type='elastalert_status',
                           body=ess_mapping)
    es.indices.put_mapping(index=index,
                           doc_type='silence',
                           body=silence_mapping)
    es.indices.put_mapping(index=index,
                           doc_type='elastalert_error',
                           body=error_mapping)
    es.indices.put_mapping(index=index,
                           doc_type='past_elastalert',
                           body=past_mapping)
    print('New index %s created' % index)

    if res:
        bulk = ''.join([
            '%s\n%s\n' %
            (json.dumps({'create': {
                '_type': doc['_type'],
                '_index': index
            }}), json.dumps(doc['_source'])) for doc in res['hits']['hits']
        ])
        print('Uploading data...')
        es.bulk(body=bulk, index=index)

    print('Done!')
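The copy step above assembles the bulk request body by hand as newline-delimited JSON. A hedged alternative, assuming the same es, index, and res objects from main(), is to let the bulk helper build and chunk the request:

from elasticsearch.helpers import bulk as bulk_index

# Sketch only: one action dict per document taken from the old index.
actions = [
    {
        '_op_type': 'create',
        '_index': index,
        '_type': doc['_type'],
        '_source': doc['_source'],
    }
    for doc in res['hits']['hits']
]
bulk_index(es, actions)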
Example #7
from elasticsearch.client import Elasticsearch
from elasticsearch import helpers

indexName = "morocco-99"
print("Index Name: ", indexName)

es = Elasticsearch(hosts="http://localhost:9200")

results = es.search(body={
        "_source": "html",
        "size": 100,        
        "query": {
            "bool": {
                "must_not": {
                    "exists": {
                        "field": "processedText"
                    }            
                }
            }
        }
    }, index=indexName)

if len(results['hits']['hits']) > 0:
    print("Records Found: ", len(results['hits']['hits']) , "Processing Now")
    import re
    from bs4 import BeautifulSoup
    for item in range(len(results['hits']['hits'])):
        print("Processing", results['hits']['hits'][item]['_id'])
        soup = BeautifulSoup(results['hits']['hits'][item]['_source']['html'], 'html.parser')
        for script in soup(["script", "style",""]):
            script.extract()
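The excerpt ends after stripping script and style tags. A hedged sketch of a plausible next step, writing the visible text back into the processedText field that the must_not/exists query filters on (the update call is an assumption; older clients also require a doc_type argument):

        # Collapse the remaining visible text into a whitespace-normalized string.
        processed_text = re.sub(r'\s+', ' ', soup.get_text(separator=' ')).strip()
        # Store it so this document no longer matches the must_not/exists query above.
        es.update(index=indexName,
                  id=results['hits']['hits'][item]['_id'],
                  body={"doc": {"processedText": processed_text}})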
Example #8
def reindex(from_hosts,
            from_index,
            to_hosts,
            to_index,
            to_type,
            source='{"query":{"match_all":{}}}',
            max_docs=0,
            page_size=10,
            logging_per_docs=1000,
            es_scroll='5m',
            request_timeout=60):

    if from_index is None:
        logger.warn('from_index is empty.')
        return

    from_es = Elasticsearch(hosts=from_hosts)
    to_es = Elasticsearch(hosts=to_hosts)

    scroll_id = None
    counter = 0
    running = True
    bulk_data = []
    while(running):
        try:
            if scroll_id is None:
                response = from_es.search(index=from_index,
                                          body=source,
                                          params={"request_timeout": request_timeout,
                                                  "scroll": es_scroll,
                                                  "size": page_size})
            else:
                response = from_es.scroll(scroll_id=scroll_id,
                                          params={"request_timeout": request_timeout,
                                                  "scroll": es_scroll})
            if len(response['hits']['hits']) == 0:
                running = False
                break
            scroll_id = response['_scroll_id']
            for hit in response['hits']['hits']:
                if '_source' in hit:
                    counter += 1
                    if counter % logging_per_docs == 0:
                        logger.info(u'Loaded {0} docs.'.format(counter))
                    if max_docs > 0 and counter >= max_docs:
                        logger.info(u'{0} docs are loaded, but it exceeded {1} docs.'.format(counter, max_docs))
                        running = False
                        break
                    op_index = to_index if to_index is not None else hit['_index']
                    op_type = to_type if to_type is not None else hit['_type']
                    bulk_data.append({"index": {"_index": op_index,
                                                "_type": op_type,
                                                "_id": hit['_id']}
                                      })
                    bulk_data.append(hit['_source'])
            if len(bulk_data) != 0:
                to_es.bulk(body=bulk_data, params={"request_timeout": request_timeout})
                bulk_data = []
        except NotFoundError:
            break
        except:
            logger.exception(u"Failed to load documents from Elasticsearch(Loaded {0} doc).".format(counter))
            break

    if len(bulk_data) != 0:
        to_es.bulk(body=bulk_data, params={"request_timeout": request_timeout})

    logger.info('Loaded {0} documents.'.format(counter))
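The scroll-and-bulk loop above is close to what the bundled reindex helper does internally. A hedged sketch of the same copy using elasticsearch.helpers.reindex, with parameter names mirroring the function above:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import reindex as reindex_helper

def reindex_with_helper(from_hosts, from_index, to_hosts, to_index):
    # Sketch only: the helper scans the source index and bulk-indexes the
    # hits into the target client, much like the manual loop in reindex().
    from_es = Elasticsearch(hosts=from_hosts)
    to_es = Elasticsearch(hosts=to_hosts)
    return reindex_helper(from_es, from_index, to_index,
                          query={"query": {"match_all": {}}},
                          target_client=to_es, chunk_size=500, scroll='5m')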
Example #9
def get_changes(es: Elasticsearch, index_name: str, url: str) -> list:
    search_body = get_changes_query(url)
    response = es.search(index=index_name, body=search_body)
    return response['hits']['hits']
class NewTermsRule(RuleType):
    """ Alerts on a new value in a list of fields. """

    def __init__(self, rule, args=None):
        super(NewTermsRule, self).__init__(rule, args)
        self.seen_values = {}
        # Allow the use of query_key or fields
        if 'fields' not in self.rules:
            if 'query_key' not in self.rules:
                raise EAException("fields or query_key must be specified")
            self.fields = self.rules['query_key']
        else:
            self.fields = self.rules['fields']
        if not self.fields:
            raise EAException("fields must not be an empty list")
        if type(self.fields) != list:
            self.fields = [self.fields]
        if self.rules.get('use_terms_query') and (
            len(self.fields) != 1 or len(self.fields) == 1 and type(self.fields[0]) == list
        ):
            raise EAException("use_terms_query can only be used with a single non-composite field")
        try:
            self.get_all_terms(args)
        except Exception as e:
            # Refuse to start if we cannot get existing terms
            raise EAException('Error searching for existing terms: %s' % (e))

    def get_all_terms(self, args):
        """ Performs a terms aggregation for each field to get every existing term. """
        self.es = Elasticsearch(
            host=self.rules['es_host'],
            port=self.rules['es_port'],
            timeout=self.rules.get('es_conn_timeout', 50),
            send_get_body_as=self.rules.get('send_get_body_as', 'GET')
        )
        window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30}))
        field_name = {"field": "", "size": 2147483647}  # Integer.MAX_VALUE
        query_template = {"aggs": {"values": {"terms": field_name}}}
        if args and args.start:
            end = ts_to_dt(args.start)
        else:
            end = ts_now()
        start = end - window_size
        if self.rules.get('use_strftime_index'):
            index = format_index(self.rules['index'], start, end)
        else:
            index = self.rules['index']
        time_filter = {self.rules['timestamp_field']: {'lte': dt_to_ts(end), 'gte': dt_to_ts(start)}}
        query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}}
        query = {'aggs': {'filtered': query_template}}

        for field in self.fields:
            # For composite keys, we will need to perform sub-aggregations
            if type(field) == list:
                level = query_template['aggs']
                # Iterate on each part of the composite key and add a sub aggs clause to the elastic search query
                for i, sub_field in enumerate(field):
                    level['values']['terms']['field'] = add_raw_postfix(sub_field)
                    if i < len(field) - 1:
                        # If we have more fields after the current one, then set up the next nested structure
                        level['values']['aggs'] = {'values': {'terms': copy.deepcopy(field_name)}}
                        level = level['values']['aggs']
            else:
                # For non-composite keys, only a single agg is needed
                field_name['field'] = add_raw_postfix(field)
            res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s')
            if 'aggregations' in res:
                buckets = res['aggregations']['filtered']['values']['buckets']
                if type(field) == list:
                    # For composite keys, make the lookup based on all fields
                    # Make it a tuple since it can be hashed and used in dictionary lookups
                    self.seen_values[tuple(field)] = []
                    for bucket in buckets:
                        # We need to walk down the hierarchy and obtain the value at each level
                        self.seen_values[tuple(field)] += self.flatten_aggregation_hierarchy(bucket)
                    # If we don't have any results, it could either be because of the absence of any baseline data
                    # OR it may be because the composite key contained a non-primitive type.  Either way, give the
                    # end-users a heads up to help them debug what might be going on.
                    if not self.seen_values[tuple(field)]:
                        elastalert_logger.warning((
                            'No results were found from all sub-aggregations.  This can either indicate that there is '
                            'no baseline data OR that a non-primitive field was used in a composite key.'
                        ))
                else:
                    keys = [bucket['key'] for bucket in buckets]
                    self.seen_values[field] = keys
                    elastalert_logger.info('Found %s unique values for %s' % (len(keys), field))
            else:
                self.seen_values[field] = []
                elastalert_logger.info('Found no values for %s' % (field))

    def flatten_aggregation_hierarchy(self, root, hierarchy_tuple=()):
        """ For nested aggregations, the results come back in the following format:
            {
            "aggregations" : {
                "filtered" : {
                  "doc_count" : 37,
                  "values" : {
                    "doc_count_error_upper_bound" : 0,
                    "sum_other_doc_count" : 0,
                    "buckets" : [ {
                      "key" : "1.1.1.1", # IP address (root)
                      "doc_count" : 13,
                      "values" : {
                        "doc_count_error_upper_bound" : 0,
                        "sum_other_doc_count" : 0,
                        "buckets" : [ {
                          "key" : "80",    # Port (sub-aggregation)
                          "doc_count" : 3,
                          "values" : {
                            "doc_count_error_upper_bound" : 0,
                            "sum_other_doc_count" : 0,
                            "buckets" : [ {
                              "key" : "ack",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            }, {
                              "key" : "syn",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 1
                            } ]
                          }
                        }, {
                          "key" : "82",    # Port (sub-aggregation)
                          "doc_count" : 3,
                          "values" : {
                            "doc_count_error_upper_bound" : 0,
                            "sum_other_doc_count" : 0,
                            "buckets" : [ {
                              "key" : "ack",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            }, {
                              "key" : "syn",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            } ]
                          }
                        } ]
                      }
                    }, {
                      "key" : "2.2.2.2", # IP address (root)
                      "doc_count" : 4,
                      "values" : {
                        "doc_count_error_upper_bound" : 0,
                        "sum_other_doc_count" : 0,
                        "buckets" : [ {
                          "key" : "443",    # Port (sub-aggregation)
                          "doc_count" : 3,
                          "values" : {
                            "doc_count_error_upper_bound" : 0,
                            "sum_other_doc_count" : 0,
                            "buckets" : [ {
                              "key" : "ack",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            }, {
                              "key" : "syn",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            } ]
                          }
                        } ]
                      }
                    } ]
                  }
                }
              }
            }

            Each level will either have more values and buckets, or it will be a leaf node.
            We'll ultimately return a flattened list with the hierarchies appended as strings,
            e.g. the above snippet would yield a list with:

            [
             ('1.1.1.1', '80', 'ack'),
             ('1.1.1.1', '80', 'syn'),
             ('1.1.1.1', '82', 'ack'),
             ('1.1.1.1', '82', 'syn'),
             ('2.2.2.2', '443', 'ack'),
             ('2.2.2.2', '443', 'syn')
            ]

            A similar formatting will be performed in the add_data method and used as the basis for comparison

        """
        results = []
        # There are more aggregation hierarchies left.  Traverse them.
        if 'values' in root:
            results += self.flatten_aggregation_hierarchy(root['values']['buckets'], hierarchy_tuple + (root['key'],))
        else:
            # We've gotten to a sub-aggregation, which may have further sub-aggregations
            # See if we need to traverse further
            for node in root:
                if 'values' in node:
                    results += self.flatten_aggregation_hierarchy(node, hierarchy_tuple)
                else:
                    results.append(hierarchy_tuple + (node['key'],))
        return results

    def add_data(self, data):
        for document in data:
            for field in self.fields:
                value = ()
                lookup_field = field
                if type(field) == list:
                    # For composite keys, make the lookup based on all fields
                    # Make it a tuple since it can be hashed and used in dictionary lookups
                    lookup_field = tuple(field)
                    for sub_field in field:
                        lookup_result = lookup_es_key(document, sub_field)
                        if not lookup_result:
                            value = None
                            break
                        value += (lookup_result,)
                else:
                    value = lookup_es_key(document, field)
                if not value and self.rules.get('alert_on_missing_field'):
                    document['missing_field'] = lookup_field
                    self.add_match(copy.deepcopy(document))
                elif value:
                    if value not in self.seen_values[lookup_field]:
                        document['new_field'] = lookup_field
                        self.add_match(copy.deepcopy(document))
                        self.seen_values[lookup_field].append(value)

    def add_terms_data(self, terms):
        # With terms query, len(self.fields) is always 1 and the 0'th entry is always a string
        field = self.fields[0]
        for timestamp, buckets in terms.iteritems():
            for bucket in buckets:
                if bucket['doc_count']:
                    if bucket['key'] not in self.seen_values[field]:
                        match = {field: bucket['key'],
                                 self.rules['timestamp_field']: timestamp,
                                 'new_field': field}
                        self.add_match(match)
                        self.seen_values[field].append(bucket['key'])
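For reference, with a single non-composite field such as 'username', get_all_terms above ends up sending a body shaped roughly like the sketch below ('@timestamp' stands in for rules['timestamp_field'], the timestamps are elided, and the '.raw' suffix assumes add_raw_postfix simply appends '.raw'):

terms_query = {
    'aggs': {
        'filtered': {
            # Time-bounded filter attached to query_template in get_all_terms.
            'filter': {'bool': {'must': [{'range': {
                '@timestamp': {'gte': '<start>', 'lte': '<end>'}}}]}},
            # One terms bucket per existing value of the field.
            'aggs': {'values': {'terms': {'field': 'username.raw',
                                          'size': 2147483647}}},
        }
    }
}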
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Elasticsearch host')
    parser.add_argument('--port', type=int, help='Elasticsearch port')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth')
    parser.add_argument('--ssl', action='store_const', const=True, help='Use SSL')
    parser.add_argument('--no-ssl', action='store_const', const=True, help='Do not use SSL')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    args = parser.parse_args()

    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    username = None
    password = None
    use_ssl = None
    url_prefix = None
    http_auth = None

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = data.get('es_host')
        port = data.get('es_port')
        username = data.get('es_username')
        password = data.get('es_password')
        url_prefix = data.get('es_url_prefix', '')
        use_ssl = data.get('use_ssl')
    else:
        host = args.host if args.host else raw_input('Enter elasticsearch host: ')
        port = args.port if args.port else int(raw_input('Enter elasticsearch port: '))
        use_ssl = (args.ssl if args.ssl is not None
                   else args.no_ssl if args.no_ssl is not None
                   else raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if args.no_auth is None:
            username = raw_input('Enter optional basic-auth username: ')
            password = raw_input('Enter optional basic-auth password: ')
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input('Enter optional Elasticsearch URL prefix: '))

    if username and password:
        http_auth = username + ':' + password

    es = Elasticsearch(host=host, port=port, use_ssl=use_ssl, http_auth=http_auth, url_prefix=url_prefix)

    silence_mapping = {'silence': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                  'until': {'type': 'date', 'format': 'dateOptionalTime'}}}}
    ess_mapping = {'elastalert_status': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    es_mapping = {'elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                'match_body': {'enabled': False, 'type': 'object'},
                                                'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    error_mapping = {'elastalert_error': {'properties': {'data': {'type': 'object', 'enabled': False}}}}

    index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ')
    if not index:
        index = 'elastalert_status'

    old_index = (args.old_index if args.old_index is not None
                 else raw_input('Name of existing index to copy? (Default None) '))

    res = None
    if old_index:
        print('Downloading existing data...')
        res = es.search(index=old_index, body={}, size=500000)
        print('Got %s documents' % (len(res['hits']['hits'])))

    es.indices.create(index)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
    print('New index %s created' % (index))

    if res:
        bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}),
                                      json.dumps(doc['_source'])) for doc in res['hits']['hits']])
        print('Uploading data...')
        es.bulk(body=bulk, index=index)

    print('Done!')
            for sentence in sentences:
                ### Tokenize sentence in paragraph
                sentence = underthesea.word_tokenize(sentence, format="text")
                ### Lower case
                sentence = sentence.lower()

                paragraph_tokenized = paragraph_tokenized + sentence

        paragraph_tokenized = paragraph_tokenized.replace("\n", "")

        content_tokenized.append({
            "type": "text",
            "content": paragraph_tokenized
        })

    ### Convert and push the data to Elasticsearch
    es_push_body = {
        "Trang": news_page,
        "Title": title_tokenized,
        "NoiDung": content_tokenized,
        "Description": des_tokenized,
        "NewspaperLink": news_link,
    }
    es.index(index="my-index", body=es_push_body)

### Count the total number of records currently in ES
es_check_body = {"query": {"match_all": {}}}

result_check = es.search(index="my-index", body=es_check_body)
print(result_check["hits"]["total"]["value"])
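If only the total record count is needed, a lighter hedged alternative to the match_all search above is the count API (same es client and "my-index" index assumed):

# Returns just the number of matching documents, without any hits payload.
total_docs = es.count(index="my-index", body={"query": {"match_all": {}}})
print(total_docs["count"])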
Example #13
def main(in_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", help="Elasticsearch host")
    parser.add_argument("--port", type=int, help="Elasticsearch port")
    parser.add_argument("--url-prefix", help="Elasticsearch URL prefix")
    parser.add_argument("--no-auth", action="store_const", const=True, help="Suppress prompt for basic auth")
    parser.add_argument("--ssl", action="store_true", default=None, help="Use SSL")
    parser.add_argument("--no-ssl", dest="ssl", action="store_false", help="Do not use SSL")
    parser.add_argument("--index", help="Index name to create")
    parser.add_argument("--old-index", help="Old index name to copy")
    parser.add_argument("--config", help="Config file name")

    args = parser.parse_args(in_args)

    if args.config:
        filename = args.config
    elif os.path.isfile("../config.yaml"):
        filename = "../config.yaml"
    elif os.path.isfile("config.yaml"):
        filename = "config.yaml"
    else:
        filename = ""

    username = None
    password = None
    use_ssl = None
    url_prefix = None
    http_auth = None

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = data.get("es_host")
        port = data.get("es_port")
        username = data.get("es_username")
        password = data.get("es_password")
        url_prefix = data.get("es_url_prefix", "")
        use_ssl = data.get("use_ssl")
    else:
        host = args.host if args.host else raw_input("Enter elasticsearch host: ")
        port = args.port if args.port else int(raw_input("Enter elasticsearch port: "))
        use_ssl = args.ssl if args.ssl is not None else raw_input("Use SSL? t/f: ").lower() in ("t", "true")
        if args.no_auth is None:
            username = raw_input("Enter optional basic-auth username: "******"Enter optional basic-auth password: "******"Enter optional Elasticsearch URL prefix: ")
        )

    if username and password:
        http_auth = username + ":" + password

    es = Elasticsearch(host=host, port=port, use_ssl=use_ssl, http_auth=http_auth, url_prefix=url_prefix)

    silence_mapping = {
        "silence": {
            "properties": {
                "rule_name": {"index": "not_analyzed", "type": "string"},
                "until": {"type": "date", "format": "dateOptionalTime"},
            }
        }
    }
    ess_mapping = {
        "elastalert_status": {
            "properties": {
                "rule_name": {"index": "not_analyzed", "type": "string"},
                "@timestamp": {"format": "dateOptionalTime", "type": "date"},
            }
        }
    }
    es_mapping = {
        "elastalert": {
            "properties": {
                "rule_name": {"index": "not_analyzed", "type": "string"},
                "match_body": {"enabled": False, "type": "object"},
                "aggregate_id": {"index": "not_analyzed", "type": "string"},
            }
        }
    }
    error_mapping = {"elastalert_error": {"properties": {"data": {"type": "object", "enabled": False}}}}

    index = args.index if args.index is not None else raw_input("New index name? (Default elastalert_status) ")
    if not index:
        index = "elastalert_status"

    res = None
    if args.old_index:
        print("Downloading existing data...")
        res = es.search(index=args.old_index, body={}, size=500000)
        print("Got %s documents" % (len(res["hits"]["hits"])))

    es.indices.create(index)
    es.indices.put_mapping(index=index, doc_type="elastalert", body=es_mapping)
    es.indices.put_mapping(index=index, doc_type="elastalert_status", body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type="silence", body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type="elastalert_error", body=error_mapping)
    print("New index %s created" % (index))

    if res:
        bulk = "".join(
            [
                "%s\n%s\n"
                % (json.dumps({"create": {"_type": doc["_type"], "_index": index}}), json.dumps(doc["_source"]))
                for doc in res["hits"]["hits"]
            ]
        )
        print("Uploading data...")
        es.bulk(body=bulk, index=index)

    print("Done!")
Example #14
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [{
            "host": host,
            "port": port,
        }]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds." %
                    (elastic_conn, excpt, timeout))
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())

        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {
                    "type": "keyword"
                },
                "type": {
                    "type": "keyword"
                },
                "number": {
                    "type": "keyword"
                },
                "change_id": {
                    "type": "keyword"
                },
                "title": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 8191
                        }
                    },
                },
                "text": {
                    "type": "text",
                    "fields": {
                        "keyword": {
                            "type": "keyword",
                            "ignore_above": 8191
                        }
                    },
                },
                "url": {
                    "type": "keyword"
                },
                "commit_count": {
                    "type": "integer"
                },
                "additions": {
                    "type": "integer"
                },
                "deletions": {
                    "type": "integer"
                },
                "changed_files_count": {
                    "type": "integer"
                },
                "changed_files": {
                    "properties": {
                        "additions": {
                            "type": "integer"
                        },
                        "deletions": {
                            "type": "integer"
                        },
                        "path": {
                            "type": "keyword"
                        },
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {
                            "type": "keyword"
                        },
                        "author": {
                            "properties": {
                                "uid": {
                                    "type": "keyword"
                                },
                                "muid": {
                                    "type": "keyword"
                                },
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {
                                    "type": "keyword"
                                },
                                "muid": {
                                    "type": "keyword"
                                },
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {
                            "type": "integer"
                        },
                        "deletions": {
                            "type": "integer"
                        },
                        "title": {
                            "type": "text"
                        },
                    }
                },
                "repository_prefix": {
                    "type": "keyword"
                },
                "repository_fullname": {
                    "type": "keyword"
                },
                "repository_shortname": {
                    "type": "keyword"
                },
                "author": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    }
                },
                "branch": {
                    "type": "keyword"
                },
                "target_branch": {
                    "type": "keyword"
                },
                "created_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "on_created_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "merged_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "updated_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "closed_at": {
                    "type": "date",
                    "format": "date_time_no_millis"
                },
                "state": {
                    "type": "keyword"
                },
                "duration": {
                    "type": "integer"
                },
                "mergeable": {
                    "type": "keyword"
                },
                "labels": {
                    "type": "keyword"
                },
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {
                            "type": "keyword"
                        },
                        "muid": {
                            "type": "keyword"
                        },
                    },
                },
                "approval": {
                    "type": "keyword"
                },
                "draft": {
                    "type": "boolean"
                },
                "self_merged": {
                    "type": "boolean"
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo aggregation fails in some contexts when the time
        # slice is large: "Must be less than or equal to: [10000] but was
        # [10001]. This limit can be set by changing the [search.max_buckets]
        # cluster level setting." Raising the limit below is an attempt to
        # mitigate the issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {
                                "value": repository_fullname
                            }
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{
                "updated_at": {
                    "order": "desc"
                }
            }],
            "query": {
                "bool": {
                    "filter": [
                        {
                            "term": {
                                "type": "Change"
                            }
                        },
                        {
                            "regexp": {
                                "repository_fullname": {
                                    "value": repository_fullname
                                }
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:

        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"],
                                             self.idents_config)
            return dict_ident

        def _update_idents(
                obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:

            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident,
                                                obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            else:
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = not prev_hash == get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
Example #15
class BaseElasticsearchBackend(Base):
    """Base connection wrapper based on the ElasticSearch official library.

    It uses two entry points to configure the underlying connection:

    * ``transport_class``: the transport class from ``elasticsearch``. By
      default ``elasticsearch.transport.Transport``.
    * ``connection_class``: the connection class used by the transport class.
      It's undefined by default, as it is on the subclasses to provide one.

    If any of these elements is not defined, an ``ImproperlyConfigured`` error
    will be raised when the backend will try to configure the client.
    """
    #: ElasticSearch transport class used by the client class to perform
    #: requests.
    transport_class = Transport
    #: ElasticSearch connection class used by the transport class to perform
    #: requests.
    connection_class = None

    def configure_client(self):
        """Instantiate and configure the ElasticSearch client.

        It simply takes the given HOSTS list and uses PARAMS as the keyword
        arguments of the ElasticSearch class.

        The client's transport_class is given by the class attribute
        ``transport_class``, and the connection class used by the transport
        class is given by the class attribute ``connection_class``.

        An ``ImproperlyConfigured`` exception is raised if any of these
        elements is undefined.
        """
        hosts = self.server['HOSTS']
        params = self.server['PARAMS']

        if not self.transport_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no transport class provided' % self.__class__)

        if not self.connection_class:
            raise ImproperlyConfigured(
                'Djangoes backend %r is not properly configured: '
                'no connection class provided' % self.__class__)

        #pylint: disable=star-args
        self.client = Elasticsearch(hosts,
                                    transport_class=self.transport_class,
                                    connection_class=self.connection_class,
                                    **params)

    # Server methods
    # ==============
    # The underlying client does not require index names to perform server
    # related queries, such as "ping" or "info". The connection wrapper act
    # for them as a proxy.

    def ping(self, **kwargs):
        return self.client.ping(**kwargs)

    def info(self, **kwargs):
        return self.client.info(**kwargs)

    def put_script(self, lang, script_id, body, **kwargs):
        return self.client.put_script(lang, script_id, body, **kwargs)

    def get_script(self, lang, script_id, **kwargs):
        return self.client.get_script(lang, script_id, **kwargs)

    def delete_script(self, lang, script_id, **kwargs):
        return self.client.delete_script(lang, script_id, **kwargs)

    def put_template(self, template_id, body, **kwargs):
        return self.client.put_template(template_id, body, **kwargs)

    def get_template(self, template_id, body=None, **kwargs):
        return self.client.get_template(template_id, body, **kwargs)

    def delete_template(self, template_id=None, **kwargs):
        return self.client.delete_template(template_id, **kwargs)

    # Bulk methods
    # ============
    # The underlying client does not require index names, but they can be
    # given. Since it can make sense not to target a specific index,
    # developers are free to use these methods as they see fit, as long as
    # they are careful.

    def mget(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mget(body, index, doc_type, **kwargs)

    def bulk(self, body, index=None, doc_type=None, **kwargs):
        return self.client.bulk(body, index, doc_type, **kwargs)

    def msearch(self, body, index=None, doc_type=None, **kwargs):
        return self.client.msearch(body, index, doc_type, **kwargs)

    def mpercolate(self, body, index=None, doc_type=None, **kwargs):
        return self.client.mpercolate(body, index, doc_type, **kwargs)

    # Scroll methods
    # ==============
    # The underlying client does not require an index to perform scroll.

    def scroll(self, scroll_id, **kwargs):
        return self.client.scroll(scroll_id, **kwargs)

    def clear_scroll(self, scroll_id, body=None, **kwargs):
        return self.client.clear_scroll(scroll_id, body, **kwargs)

    # Query methods
    # =============
    # The underlying client requires index names (or alias names) to perform
    # queries. The connection wrapper overrides these client methods to
    # automatically use the configured names (indices and/or aliases).

    def create(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.create(self.indices, doc_type, body, doc_id,
                                  **kwargs)

    def index(self, doc_type, body, doc_id=None, **kwargs):
        return self.client.index(self.indices, doc_type, body, doc_id,
                                 **kwargs)

    def exists(self, doc_id, doc_type='_all', **kwargs):
        return self.client.exists(self.indices, doc_id, doc_type, **kwargs)

    def get(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get(self.indices, doc_id, doc_type, **kwargs)

    def get_source(self, doc_id, doc_type='_all', **kwargs):
        return self.client.get_source(self.indices, doc_id, doc_type, **kwargs)

    def update(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.update(self.indices, doc_type, doc_id, body,
                                  **kwargs)

    def search(self, doc_type=None, body=None, **kwargs):
        return self.client.search(self.indices, doc_type, body, **kwargs)

    def search_shards(self, doc_type=None, **kwargs):
        return self.client.search_shards(self.indices, doc_type, **kwargs)

    def search_template(self, doc_type=None, body=None, **kwargs):
        return self.client.search_template(self.indices, doc_type, body,
                                           **kwargs)

    def explain(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.explain(self.indices, doc_type, doc_id, body,
                                   **kwargs)

    def delete(self, doc_type, doc_id, **kwargs):
        return self.client.delete(self.indices, doc_type, doc_id, **kwargs)

    def count(self, doc_type=None, body=None, **kwargs):
        return self.client.count(self.indices, doc_type, body, **kwargs)

    def delete_by_query(self, doc_type=None, body=None, **kwargs):
        return self.client.delete_by_query(self.indices, doc_type, body,
                                           **kwargs)

    def suggest(self, body, **kwargs):
        return self.client.suggest(body, self.indices, **kwargs)

    def percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.percolate(self.indices, doc_type, doc_id, body,
                                     **kwargs)

    def count_percolate(self, doc_type, doc_id=None, body=None, **kwargs):
        return self.client.count_percolate(self.indices, doc_type, doc_id,
                                           body, **kwargs)

    def mlt(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.mlt(self.indices, doc_type, doc_id, body, **kwargs)

    def termvector(self, doc_type, doc_id, body=None, **kwargs):
        return self.client.termvector(self.indices, doc_type, doc_id, body,
                                      **kwargs)

    def mtermvectors(self, doc_type=None, body=None, **kwargs):
        return self.client.mtermvectors(self.indices, doc_type, body, **kwargs)

    def benchmark(self, doc_type=None, body=None, **kwargs):
        return self.client.benchmark(self.indices, doc_type, body, **kwargs)

    def abort_benchmark(self, name=None, **kwargs):
        return self.client.abort_benchmark(name, **kwargs)

    def list_benchmarks(self, doc_type=None, **kwargs):
        return self.client.list_benchmarks(self.indices, doc_type, **kwargs)
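

# A hypothetical subclass sketch (not part of the original backend code): it
# illustrates the one step the docstring above calls for -- supplying a
# connection_class so that configure_client() can build the client instead of
# raising ImproperlyConfigured. RequestsHttpConnection ships with
# elasticsearch-py (it needs the requests package at runtime).
from elasticsearch.connection import RequestsHttpConnection


class RequestsElasticsearchBackend(BaseElasticsearchBackend):
    # Keep the default Transport; only the HTTP connection class is supplied.
    connection_class = RequestsHttpConnection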
Exemple #16
0
class ELmonocleDB:

    log = logging.getLogger("monocle.ELmonocleDB")

    def __init__(
        self,
        elastic_conn="localhost:9200",
        index=None,
        timeout=10,
        prefix=CHANGE_PREFIX,
        create=True,
        previous_schema=False,
        idents_config: Optional[IdentsConfig] = None,
        user=None,
        password=None,
        use_ssl=None,
        verify_certs=None,
        ssl_show_warn=None,
    ) -> None:
        host, port = elastic_conn.split(":")
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = socket.gethostbyname(host)
        self.log.info("ES IP is %s" % ip)
        self.log.info("ES prefix is %s" % prefix)

        elastic_conn = [
            {
                "host": host,
                "port": port,
            }
        ]

        if use_ssl:
            elastic_conn[0]["use_ssl"] = use_ssl

        if not verify_certs:
            elastic_conn[0]["verify_certs"] = verify_certs

        if not ssl_show_warn:
            elastic_conn[0]["ssl_show_warn"] = ssl_show_warn

        if user and password:
            elastic_conn[0]["http_auth"] = "%s:%s" % (user, password)

        while True:
            try:
                s.connect((ip, int(port)))
                s.shutdown(2)
                s.close()
                break
            except Exception as excpt:
                self.log.info(
                    "Unable to connect to %s: %s. Sleeping for %ds."
                    % (elastic_conn, excpt, timeout)
                )
                time.sleep(timeout)

        self.log.info("Connecting to ES server at %s" % elastic_conn)
        self.es = Elasticsearch(elastic_conn)
        self.log.info(self.es.info())

        if previous_schema:
            self.prefix = PREV_CHANGE_PREFIX
        else:
            self.prefix = prefix

        if not index:
            self.log.info("No index provided")
            return

        self.idents_config = idents_config or []

        self.index = "{}{}".format(self.prefix, index)
        self.log.info("Using ES index %s" % self.index)
        self.mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "type": {"type": "keyword"},
                "number": {"type": "keyword"},
                "change_id": {"type": "keyword"},
                "title": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "text": {
                    "type": "text",
                    "fields": {"keyword": {"type": "keyword", "ignore_above": 8191}},
                },
                "url": {"type": "keyword"},
                "commit_count": {"type": "integer"},
                "additions": {"type": "integer"},
                "deletions": {"type": "integer"},
                "changed_files_count": {"type": "integer"},
                "changed_files": {
                    "properties": {
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "path": {"type": "keyword"},
                    }
                },
                "commits": {
                    "properties": {
                        "sha": {"type": "keyword"},
                        "author": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "committer": {
                            "properties": {
                                "uid": {"type": "keyword"},
                                "muid": {"type": "keyword"},
                            }
                        },
                        "authored_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "committed_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "additions": {"type": "integer"},
                        "deletions": {"type": "integer"},
                        "title": {"type": "text"},
                    }
                },
                "repository_prefix": {"type": "keyword"},
                "repository_fullname": {"type": "keyword"},
                "repository_shortname": {"type": "keyword"},
                "author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "on_author": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "committer": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "merged_by": {
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    }
                },
                "branch": {"type": "keyword"},
                "target_branch": {"type": "keyword"},
                "created_at": {"type": "date", "format": "date_time_no_millis"},
                "on_created_at": {"type": "date", "format": "date_time_no_millis"},
                "merged_at": {"type": "date", "format": "date_time_no_millis"},
                "updated_at": {"type": "date", "format": "date_time_no_millis"},
                "closed_at": {"type": "date", "format": "date_time_no_millis"},
                "state": {"type": "keyword"},
                "duration": {"type": "integer"},
                "mergeable": {"type": "keyword"},
                "labels": {"type": "keyword"},
                "assignees": {
                    "type": "nested",
                    "properties": {
                        "uid": {"type": "keyword"},
                        "muid": {"type": "keyword"},
                    },
                },
                "approval": {"type": "keyword"},
                "draft": {"type": "boolean"},
                "self_merged": {"type": "boolean"},
                "crawler_metadata": {
                    "properties": {
                        "last_commit_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "last_post_at": {
                            "type": "date",
                            "format": "date_time_no_millis",
                        },
                        "total_docs_posted": {"type": "integer"},
                        "total_changes_updated": {"type": "integer"},
                        "total_orphans_updated": {"type": "integer"},
                    }
                },
                "tasks_data": {
                    "properties": {
                        "tid": {"type": "keyword"},
                        "ttype": {"type": "keyword"},
                        "crawler_name": {"type": "keyword"},
                        "updated_at": {"type": "date", "format": "date_time_no_millis"},
                        "change_url": {"type": "keyword"},
                        "severity": {"type": "keyword"},
                        "priority": {"type": "keyword"},
                        "score": {"type": "integer"},
                        "url": {"type": "keyword"},
                        "title": {
                            "type": "text",
                            "fields": {
                                "keyword": {"type": "keyword", "ignore_above": 8191}
                            },
                        },
                        "_adopted": {"type": "boolean"},
                    }
                },
            }
        }
        settings = {"mappings": self.mapping}
        self.ic = self.es.indices
        if create:
            self.ic.create(index=self.index, ignore=400, body=settings)
        # The authors_histo aggregation fails in some contexts when the time
        # slice is large, with the error: "Must be less than or equal to:
        # [10000] but was [10001]. This limit can be set by changing the
        # [search.max_buckets] cluster level setting."
        # Raising the limit below is an attempt to mitigate the issue.
        cluster_settings = {"transient": {"search.max_buckets": 100000}}
        self.es.cluster.put_settings(body=cluster_settings)
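
    # Usage sketch (illustrative values only; a real caller would pass actual
    # Change/Event objects and an idents_config from the monocle configuration):
    #   db = ELmonocleDB(elastic_conn="localhost:9200", index="demo")
    #   db.update(changes_or_events)             # bulk upsert Change/Event objects
    #   last_change = db.get_last_updated("org/repo")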

    def update(self, source_it: List[Union[Change, Event]]) -> None:
        def gen(it):
            for _source in it:
                source = change_or_event_to_dict(_source)
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = source["id"]
                d["doc"] = source
                d["doc_as_upsert"] = True
                yield d

        bulk(self.es, gen(source_it))
        self.es.indices.refresh(index=self.index)

    def update_task_data(
        self,
        source_it: Union[
            List[TaskDataForEL],
            List[OrphanTaskDataForEL],
            List[AdoptedTaskDataForEL],
        ],
    ) -> Optional[BulkIndexError]:
        def gen(it):
            for _source in it:
                d = {}
                d["_index"] = self.index
                d["_op_type"] = "update"
                d["_id"] = _source._id
                d["doc"] = {}
                d["doc"].update({"id": _source._id})
                if isinstance(_source, TaskDataForEL):
                    d["doc"].update(
                        {"tasks_data": [asdict(td) for td in _source.tasks_data]}
                    )
                if isinstance(_source, OrphanTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                    d["doc"]["type"] = "OrphanTaskData"
                if isinstance(_source, AdoptedTaskDataForEL):
                    d["doc"].update({"tasks_data": asdict(_source.task_data)})
                d["doc_as_upsert"] = True
                yield d

        ret = None
        try:
            bulk(self.es, gen(source_it))
        except BulkIndexError as err:
            ret = err
        self.es.indices.refresh(index=self.index)
        return ret

    def compute_crawler_id_by_name(self, name, _type):
        return "crawler/%s/%s" % (_type, name)

    def get_task_crawler_metadata(self, name: str) -> Dict:
        try:
            ret = self.es.get(
                self.index, self.compute_crawler_id_by_name(name, "tasks_crawler")
            )
            return ret["_source"]["crawler_metadata"]
        except Exception:
            return {}

    def set_task_crawler_metadata(
        self, name: str, commit_date: datetime = None, push_infos: Dict = None
    ):
        metadata = {}
        if commit_date:
            metadata.update({"last_commit_at": commit_date})
        if push_infos:
            prev_metadata = self.get_task_crawler_metadata(name)
            metadata.update(
                {
                    "last_post_at": push_infos["last_post_at"],
                    "total_docs_posted": prev_metadata.get("total_docs_posted", 0)
                    + push_infos["total_docs_posted"],
                    "total_changes_updated": prev_metadata.get(
                        "total_changes_updated", 0
                    )
                    + push_infos["total_changes_updated"],
                    "total_orphans_updated": prev_metadata.get(
                        "total_orphans_updated", 0
                    )
                    + push_infos["total_orphans_updated"],
                }
            )
        body = {
            "doc": {"type": "TaskCrawlerDataCommit", "crawler_metadata": metadata},
            "doc_as_upsert": True,
        }
        ret = None
        try:
            self.es.update(
                self.index,
                self.compute_crawler_id_by_name(name, "tasks_crawler"),
                body=body,
            )
            self.es.indices.refresh(index=self.index)
        except Exception as err:
            ret = err
        return ret

    def delete_index(self):
        self.log.info("Deleting index: %s" % self.index)
        self.ic.delete(index=self.index)

    def delete_repository(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "query": {
                "bool": {
                    "filter": {
                        "regexp": {
                            "repository_fullname": {"value": repository_fullname}
                        }
                    }
                }
            }
        }
        params["body"] = body
        self.es.delete_by_query(**params)
        self.es.indices.refresh(index=self.index)

    def get_last_updated(self, repository_fullname):
        params = {"index": self.index}
        body = {
            "sort": [{"updated_at": {"order": "desc"}}],
            "query": {
                "bool": {
                    "filter": [
                        {"term": {"type": "Change"}},
                        {
                            "regexp": {
                                "repository_fullname": {"value": repository_fullname}
                            }
                        },
                    ]
                }
            },
        }
        params["body"] = body
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        ret = [r["_source"] for r in res["hits"]["hits"]]
        if not ret:
            return []
        return ret[0]

    def get_changes_by_url(self, change_urls, size):
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "filter": [
                            {"term": {"type": "Change"}},
                            {"terms": {"url": change_urls}},
                        ]
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_by_change_urls(self, change_urls):
        assert len(change_urls) <= 50
        size = 5000  # Assuming no more than 100 TD entries related to a change
        params = {
            "index": self.index,
            "body": {
                "size": size,
                "query": {
                    "bool": {
                        "must_not": {"exists": {"field": "tasks_data._adopted"}},
                        "filter": [
                            {"term": {"type": "OrphanTaskData"}},
                            {"terms": {"tasks_data.change_url": change_urls}},
                        ],
                    }
                },
            },
        }
        try:
            res = self.es.search(**params)
        except Exception:
            return []
        return [r["_source"] for r in res["hits"]["hits"]]

    def get_orphan_tds_and_declare_adpotion(self, changes_url):
        assert len(changes_url) <= 50
        tds = self.get_orphan_tds_by_change_urls(changes_url)
        if tds:
            adopted_tds = [
                AdoptedTaskDataForEL(
                    _id=td["id"],
                    task_data=AdoptedTaskData(_adopted=True),
                )
                for td in tds
            ]
            self.update_task_data(adopted_tds)
        return tds

    def update_changes_with_orphan_tds(self, mapping: Dict[str, str]):
        change_urls = list(mapping.keys())
        while change_urls:
            change_urls_to_process = change_urls[:50]
            change_urls = change_urls[50:]
            tds = self.get_orphan_tds_and_declare_adpotion(change_urls_to_process)
            # Group tds in buckets by change_url
            _map: Dict[str, List] = dict()
            for td in tds:
                _map.setdefault(td["tasks_data"]["change_url"], []).append(
                    td["tasks_data"]
                )
            # Create update docs to attach tds to matching changes
            to_update = []
            for change_url, tds in _map.items():
                to_update.append(
                    TaskDataForEL(
                        _id=mapping[change_url],
                        tasks_data=createELTaskData(tds),
                    )
                )
            self.update_task_data(to_update)

    def run_named_query(self, name, *args, **kwargs):
        if name not in queries.public_queries:
            raise UnknownQueryException("Unknown query: %s" % name)
        return getattr(queries, name)(self.es, self.index, *args, **kwargs)

    def get_indices(self):
        return [
            ind.replace(self.prefix, "")
            for ind in self.es.indices.get(self.prefix + "*")
        ]

    def iter_index(self):
        body = {"query": {"match_all": {}}}
        return scan(self.es, query=body, index=self.index, size=5000)

    def update_idents(self) -> None:

        import json

        bulk_size = 7500

        def get_obj_hash(obj: Dict) -> int:
            obj_json = json.dumps(obj, sort_keys=True)
            return hash(obj_json)

        def update_ident(dict_ident: Dict) -> Dict:
            dict_ident["muid"] = create_muid(dict_ident["uid"], self.idents_config)
            return dict_ident

        def _update_idents(obj: Dict) -> Tuple[Optional[Union[Change, Event]], bool]:

            prev_hash = get_obj_hash(obj)

            if obj["type"] == "Change":
                obj["author"] = update_ident(obj["author"])
                if "committer" in obj:
                    obj["committer"] = update_ident(obj["committer"])
                if "merged_by" in obj:
                    obj["merged_by"] = update_ident(obj["merged_by"])
                if "assignees" in obj:
                    obj["assignees"] = list(map(update_ident, obj["assignees"]))
                if "commits" in obj:
                    for commit in obj["commits"]:
                        commit["author"] = update_ident(commit["author"])
                        commit["committer"] = update_ident(commit["committer"])
            if obj["type"] in get_events_list():
                if "author" in obj:
                    obj["author"] = update_ident(obj["author"])
                if "on_author" in obj:
                    obj["on_author"] = update_ident(obj["on_author"])
            updated = prev_hash != get_obj_hash(obj)
            if updated:
                return dict_to_change_or_event(obj), True
            else:
                return None, False

        def bulk_update(to_update: List) -> List:
            print("Updating %s objects ..." % len(to_update))
            self.update(to_update)
            return []

        to_update = []
        total_read = 0
        for _obj in self.iter_index():
            total_read += 1
            if total_read % bulk_size == 0:
                print("%s objects read from the database" % total_read)
            obj = _obj["_source"]
            obj, updated = _update_idents(obj)
            if updated:
                to_update.append(obj)
            if len(to_update) == bulk_size:
                to_update = bulk_update(to_update)

        bulk_update(to_update)
Exemple #17
0
class ESConnector:
    """
    as many MS will communicate with ElasticSearch, centralize access
    with this library
    """

    def __init__(self,
                 host=None,
                 port=9200,
                 timeout=10,
                 local_env=False):
        self.host = host
        self.port = port
        self.timeout = timeout
        self.local_env = local_env
        self.es = None

    def _connect(self):
        """
        connect to a member of the ElasticSearch cluster
        """
        try:
            if self.local_env:
                self.es = Elasticsearch([{'host': self.host,
                                          'port': self.port}])
            else:
                self.es = Elasticsearch([{'host': self.host,
                                          'port': self.port}],
                                        sniff_on_start=True,
                                        sniff_on_connection_fail=True,
                                        sniffer_timeout=self.timeout)
            self.idx = IndicesClient(self.es)
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host, self.port)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace, str(e))

    def _create_index(self, index, doc_type, settings=None, mappings=None):
        """
        create a new empty index
        args:
            index = index name
            doc_type = document type, i.e. any valid string
            settings = (optional) ElasticSearch index settings
            mappings = (optional) dict of document fields by type and indexing preference
        """
        if not settings:
            settings = {'index': {'number_of_shards': '1',
                                  'number_of_replicas': '0'}}
        if not mappings:
            mappings = {'properties': {'id': {'type': 'string',
                                              'index': 'not_analyzed'}}}
        try:
            response = self.idx.create(index=index,
                                       body=dumps({'settings': settings}))
            self.idx.put_mapping(index=index,
                                 doc_type=doc_type,
                                 body=dumps(mappings))
            if not response.get('acknowledged'):
                return ElasticSearchError.unable_to_create_index(index)
            log.info('Index: {} created'.format(index))
            log.info('ES indices.create(): response: {}'.format(response))
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace,
                                                        str(e))

    def drop_index(self, index):
        try:
            if index in self.es.indices.stats()['indices'].keys():
                self.es.indices.delete(index=index, ignore=[400, 404])
            log.info('Index: {} deleted'.format(index))
            return
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchError.unknown_exception(backtrace,
                                                        str(e))

    def add_document(self,
                     index=None,
                     doc_type=None,
                     doc_id=0,
                     settings={},
                     mappings={},
                     values={}):
        """
        add a new document to an index (the index is created first if it does
        not exist yet)
        args:
            index = index name
            doc_type = document type, i.e. any valid string
            doc_id = document id
            settings = (optional) ElasticSearch index settings
            mappings = (optional) dict of document fields by type and indexing preference
            values = dictionary of fields and values
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            if index not in self.es.indices.stats()['indices'].keys():
                err_msg = self._create_index(index,
                                             doc_type,
                                             settings,
                                             mappings)
                if err_msg:
                    return err_msg
            response = self.es.create(index=index,
                                      doc_type=doc_type,
                                      id=doc_id,
                                      body=dumps(values))
            log.info('ES create(): response: {}'.format(response))
            return ElasticSearchWrite.object_created(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchWriteError.unknown_exception(doc_id,
                                                             values,
                                                             backtrace,
                                                             str(e))

    def update_document(self,
                        index,
                        doc_type,
                        doc_id,
                        values):
        """
        update an existing document in an existing index
        mandatory args:
            index = index name
            doc_type = document type, i.e. any valid string
            doc_id = document id
            values = dictionary of fields and values
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            log.info('ES body: {}'.format(values))
            response = self.es.update(index=index,
                                      doc_type=doc_type,
                                      id=doc_id,
                                      body=dumps(values))
            log.info('ES update(): response: {}'.format(response))
            return ElasticSearchWrite.object_updated(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchWriteError.unknown_exception(doc_id,
                                                             values,
                                                             backtrace,
                                                             str(e))

    def find_document(self,
                      index,
                      doc_type,
                      dsl=None,
                      fields=None):
        """
        find an existing document in an existing index
        args:
            index = index name
            doc_type = document type, i.e. any valid string
            dsl = (optional) query parameters in DSL format
            fields = (optional) list of fields to return
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            response = self.es.search(index=index,
                                      doc_type=doc_type,
                                      body=dumps(dsl),
                                      _source=fields)
            return ElasticSearchRead.object_found(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchReadError.unknown_exception(dsl,
                                                            fields,
                                                            backtrace,
                                                            str(e))

    def search_documents(self,
                         index,
                         doc_type,
                         dsl,
                         fields=None):
        """
        find existing documents in an existing index
        mandatory args:
            index = index name
            doc_type = document type, i.e. any valid string
            dsl = query parameters in DSL format
            fields = (optional) list of fields to return
        """
        try:
            err_msg = self._connect()
            if err_msg:
                return err_msg
            response = self.es.search(index=index,
                                      doc_type=doc_type,
                                      body=dumps(dsl),
                                      _source=fields)
            return ElasticSearchRead.objects_found(response)
        except ConnectionError as e:
            return ElasticSearchError.no_host_available(self.host,
                                                        self.port)
        except RequestError as e:
            return ElasticSearchError.invalid_request(str(e))
        except NotFoundError as e:
            return ElasticSearchError.missing_index(index)
        except Exception as e:
            (type_e, value, traceback_prev) = exc_info()
            backtrace = extract_tb(traceback_prev)
            return ElasticSearchReadError.unknown_exception(dsl,
                                                            fields,
                                                            backtrace,
                                                            str(e))
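

# Illustrative usage of ESConnector above (hypothetical host, index and values;
# the ElasticSearchWrite/ElasticSearchRead result wrappers are whatever the
# original project defines, so only the call pattern is shown here):
connector = ESConnector(host='localhost', port=9200, local_env=True)
create_result = connector.add_document(index='people',
                                       doc_type='person',
                                       doc_id=1,
                                       values={'id': '1', 'name': 'Ada Lovelace'})
search_result = connector.find_document(index='people',
                                        doc_type='person',
                                        dsl={'query': {'match': {'name': 'Ada'}}},
                                        fields=['name'])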
Exemple #18
0
def search(es: Elasticsearch, index_name: str, search_body: dict) -> dict:
    return es.search(index=index_name, body=search_body)
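
# Illustrative usage of the search() helper above; the host and the "changes"
# index name are placeholders, not values taken from the original code.
es = Elasticsearch(hosts=["localhost:9200"])
results = search(es, "changes", {"query": {"match_all": {}}, "size": 10})
print(results["hits"]["total"])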
Exemple #19
0
def main():
    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    username = None
    password = None
    use_ssl = None
    http_auth = None

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = data.get('es_host')
        port = data.get('es_port')
        username = data.get('es_username')
        password = data.get('es_password')
        use_ssl = data.get('use_ssl')
    else:
        host = raw_input("Enter elasticsearch host: ")
        port = int(raw_input("Enter elasticsearch port: "))
        while use_ssl is None:
            resp = raw_input("Use SSL? t/f: ").lower()
            use_ssl = True if resp in ('t', 'true') else (False if resp in ('f', 'false') else None)
        username = raw_input("Enter optional basic-auth username: "******"Enter optional basic-auth password: "******"Downloading existing data...")
        res = es.search(index=old_index, body={}, size=500000)
        print("Got %s documents" % (len(res['hits']['hits'])))

    es.indices.create(index)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
    print("New index %s created" % (index))

    if res:
        bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}),
                                      json.dumps(doc['_source'])) for doc in res['hits']['hits']])
        print("Uploading data...")
        es.bulk(body=bulk, index=index)

    print("Done!")
Exemple #20
0
def main():
    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = data.get('es_host')
        port = data.get('es_port')
    else:
        host = raw_input("Enter elasticsearch host: ")
        port = int(raw_input("Enter elasticsearch port: "))

    es = Elasticsearch(host=host, port=port)

    silence_mapping = {
        'silence': {
            'properties': {
                'rule_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                }
            }
        }
    }
    ess_mapping = {
        'elastalert_status': {
            'properties': {
                'rule_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                '@timestamp': {
                    'format': 'dateOptionalTime',
                    'type': 'date'
                }
            }
        }
    }
    es_mapping = {
        'elastalert': {
            'properties': {
                'rule_name': {
                    'index': 'not_analyzed',
                    'type': 'string'
                },
                'match_body': {
                    'enabled': False,
                    'type': 'object'
                }
            }
        }
    }
    error_mapping = {
        'elastalert_error': {
            'properties': {
                'data': {
                    'type': 'object',
                    'enabled': False
                }
            }
        }
    }

    index = raw_input('New index name? (Default elastalert_status) ')
    index = index if index else 'elastalert_status'
    old_index = raw_input('Name of existing index to copy? (Default None) ')

    res = None
    if old_index:
        print("Downloading existing data...")
        res = es.search(index=old_index, body={}, size=500000)
        print("Got %s documents" % (len(res['hits']['hits'])))

    es.indices.create(index)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index,
                           doc_type='elastalert_status',
                           body=ess_mapping)
    es.indices.put_mapping(index=index,
                           doc_type='silence',
                           body=silence_mapping)
    es.indices.put_mapping(index=index,
                           doc_type='elastalert_error',
                           body=error_mapping)
    print("New index %s created" % (index))

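    # The bulk body assembled below is newline-delimited JSON: one action line
    # ({"create": {...}}) followed by one source line per document, e.g.
    # (illustrative values):
    #   {"create": {"_type": "elastalert_status", "_index": "elastalert_status"}}
    #   {"rule_name": "example_rule", "@timestamp": "2015-01-01T00:00:00Z"}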
    if res:
        bulk = ''.join([
            '%s\n%s\n' %
            (json.dumps({'create': {
                '_type': doc['_type'],
                '_index': index
            }}), json.dumps(doc['_source'])) for doc in res['hits']['hits']
        ])
        print("Uploading data...")
        es.bulk(body=bulk, index=index)

    print("Done!")
Exemple #22
0
class NewTermsRule(RuleType):
    """ Alerts on a new value in a list of fields. """
    def __init__(self, rule, args=None):
        super(NewTermsRule, self).__init__(rule, args)
        self.seen_values = {}
        # Allow the use of query_key or fields
        if 'fields' not in self.rules:
            if 'query_key' not in self.rules:
                raise EAException("fields or query_key must be specified")
            self.fields = self.rules['query_key']
        else:
            self.fields = self.rules['fields']
        if not self.fields:
            raise EAException("fields must not be an empty list")
        if type(self.fields) != list:
            self.fields = [self.fields]
        if self.rules.get('use_terms_query') and len(self.fields) != 1:
            raise EAException(
                "use_terms_query can only be used with one field at a time")
        try:
            self.get_all_terms(args)
        except Exception as e:
            # Refuse to start if we cannot get existing terms
            raise EAException('Error searching for existing terms: %s' % (e))

    def get_all_terms(self, args):
        """ Performs a terms aggregation for each field to get every existing term. """
        self.es = Elasticsearch(host=self.rules['es_host'],
                                port=self.rules['es_port'],
                                timeout=self.rules.get('es_conn_timeout', 50))
        window_size = datetime.timedelta(
            **self.rules.get('terms_window_size', {'days': 30}))
        field_name = {"field": "", "size": 2147483647}  # Integer.MAX_VALUE
        query_template = {"aggs": {"values": {"terms": field_name}}}
        if args and args.start:
            end = ts_to_dt(args.start)
        else:
            end = ts_now()
        start = end - window_size
        if self.rules.get('use_strftime_index'):
            index = format_index(self.rules['index'], start, end)
        else:
            index = self.rules['index']
        time_filter = {
            self.rules['timestamp_field']: {
                'lte': dt_to_ts(end),
                'gte': dt_to_ts(start)
            }
        }
        query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}}
        query = {'aggs': {'filtered': query_template}}
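        # Resulting query shape (illustrative): the "filtered" aggregation wraps
        # the terms aggregation built above, e.g.
        #   {"aggs": {"filtered": {
        #       "filter": {"bool": {"must": [{"range": {<timestamp_field>: {...}}}]}},
        #       "aggs": {"values": {"terms": {"field": <field>, "size": 2147483647}}}}}}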

        for field in self.fields:
            field_name['field'] = field
            res = self.es.search(body=query,
                                 index=index,
                                 ignore_unavailable=True,
                                 timeout='50s')
            if 'aggregations' in res:
                buckets = res['aggregations']['filtered']['values']['buckets']
                keys = [bucket['key'] for bucket in buckets]
                self.seen_values[field] = keys
                elastalert_logger.info('Found %s unique values for %s' %
                                       (len(keys), field))
            else:
                self.seen_values[field] = []
                elastalert_logger.info('Found no values for %s' % (field))

    def add_data(self, data):
        for document in data:
            for field in self.fields:
                value = document.get(field)
                if not value and self.rules.get('alert_on_missing_field'):
                    document['missing_field'] = field
                    self.add_match(document)
                elif value:
                    if value not in self.seen_values[field]:
                        document['new_field'] = field
                        self.add_match(document)
                        self.seen_values[field].append(value)

    def add_terms_data(self, terms):
        # With terms query, len(self.fields) is always 1
        field = self.fields[0]
        for timestamp, buckets in terms.iteritems():
            for bucket in buckets:
                if bucket['doc_count']:
                    if bucket['key'] not in self.seen_values[field]:
                        match = {
                            field: bucket['key'],
                            self.rules['timestamp_field']: timestamp,
                            'new_field': field
                        }
                        self.add_match(match)
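

# Illustrative rule configuration for the NewTermsRule variant above (a sketch,
# not a complete ElastAlert rule file): only keys that the class actually reads
# are shown, and all values are made up.
example_new_term_rule = {
    'es_host': 'localhost',
    'es_port': 9200,
    'es_conn_timeout': 50,
    'index': 'logstash-*',
    'timestamp_field': '@timestamp',
    'fields': ['username', 'src_ip'],     # alternatively set 'query_key'
    'terms_window_size': {'days': 7},     # how far back to collect known terms
    'use_strftime_index': False,
}
# rule = NewTermsRule(example_new_term_rule)  # queries ES for existing terms on init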
Exemple #23
0
class NewTermsRule(RuleType):
    """ Alerts on a new value in a list of fields. """

    def __init__(self, *args):
        super(NewTermsRule, self).__init__(*args)
        self.seen_values = {}
        # Allow the use of query_key or fields
        if 'fields' not in self.rules:
            if 'query_key' not in self.rules:
                raise EAException("fields or query_key must be specified")
            self.fields = self.rules['query_key']
        else:
            self.fields = self.rules['fields']
        if not self.fields:
            raise EAException("fields must not be an empty list")
        if type(self.fields) != list:
            self.fields = [self.fields]
        if self.rules.get('use_terms_query') and len(self.fields) != 1:
            raise EAException("use_terms_query can only be used with one field at a time")
        self.get_all_terms()

    def get_all_terms(self):
        """ Performs a terms aggregation for each field to get every existing term. """
        self.es = Elasticsearch(host=self.rules['es_host'], port=self.rules['es_port'])
        window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30}))

        field_name = {"field": "", "size": 2147483647}  # Integer.MAX_VALUE
        query_template = {"aggs": {"values": {"terms": field_name}}}
        if self.rules.get('use_strftime_index'):
            end = ts_now()
            start = end - window_size
            index = format_index(self.rules['index'], start, end)
        else:
            index = self.rules['index']

        for field in self.fields:
            field_name['field'] = field
            res = self.es.search(body=query_template, index=index, ignore_unavailable=True, timeout=50)
            buckets = res['aggregations']['values']['buckets']
            keys = [bucket['key'] for bucket in buckets]
            self.seen_values[field] = keys

    def add_data(self, data):
        for document in data:
            for field in self.fields:
                value = document.get(field)
                if not value and self.rules.get('alert_on_missing_field'):
                    document['missing_field'] = field
                    self.add_match(document)
                elif value:
                    if value not in self.seen_values[field]:
                        document['new_field'] = field
                        self.add_match(document)
                        self.seen_values[field].append(value)

    def add_terms_data(self, terms):
        # With terms query, len(self.fields) is always 1
        field = self.fields[0]
        for timestamp, buckets in terms.iteritems():
            for bucket in buckets:
                if bucket['doc_count']:
                    if bucket['key'] not in self.seen_values[field]:
                        match = {field: bucket['key'],
                                 self.rules['timestamp_field']: timestamp,
                                 'new_field': field}
                        self.add_match(match)
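
A minimal standalone sketch of the terms aggregation that get_all_terms issues above, assuming a local cluster and hypothetical index and field names:

from elasticsearch import Elasticsearch

es = Elasticsearch(host='localhost', port=9200)
body = {'aggs': {'values': {'terms': {'field': 'username', 'size': 2147483647}}}}
res = es.search(index='logstash-*', body=body, ignore_unavailable=True)
# One list entry per distinct value of the field seen so far
seen_values = [bucket['key'] for bucket in res['aggregations']['values']['buckets']]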
Exemple #24
0
class NewTermsRule(RuleType):
    """ Alerts on a new value in a list of fields. """

    def __init__(self, rule, args=None):
        super(NewTermsRule, self).__init__(rule, args)
        self.seen_values = {}
        # Allow the use of query_key or fields
        if 'fields' not in self.rules:
            if 'query_key' not in self.rules:
                raise EAException("fields or query_key must be specified")
            self.fields = self.rules['query_key']
        else:
            self.fields = self.rules['fields']
        if not self.fields:
            raise EAException("fields must not be an empty list")
        if type(self.fields) != list:
            self.fields = [self.fields]
        if self.rules.get('use_terms_query') and len(self.fields) != 1:
            raise EAException("use_terms_query can only be used with one field at a time")
        try:
            self.get_all_terms(args)
        except Exception as e:
            # Refuse to start if we cannot get existing terms
            raise EAException('Error searching for existing terms: %s' % (e))

    def get_all_terms(self, args):
        """ Performs a terms aggregation for each field to get every existing term. """
        self.es = Elasticsearch(host=self.rules['es_host'], port=self.rules['es_port'])
        window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30}))
        field_name = {"field": "", "size": 2147483647}  # Integer.MAX_VALUE
        query_template = {"aggs": {"values": {"terms": field_name}}}
        if args and args.start:
            end = ts_to_dt(args.start)
        else:
            end = ts_now()
        start = end - window_size
        if self.rules.get('use_strftime_index'):
            index = format_index(self.rules['index'], start, end)
        else:
            index = self.rules['index']
        time_filter = {self.rules['timestamp_field']: {'lte': dt_to_ts(end), 'gte': dt_to_ts(start)}}
        query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}}
        query = {'aggs': {'filtered': query_template}}

        for field in self.fields:
            field_name['field'] = field
            res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s')
            if 'aggregations' in res:
                buckets = res['aggregations']['filtered']['values']['buckets']
                keys = [bucket['key'] for bucket in buckets]
                self.seen_values[field] = keys
                elastalert_logger.info('Found %s unique values for %s' % (len(keys), field))
            else:
                self.seen_values[field] = []
                elastalert_logger.info('Found no values for %s' % (field))

    def add_data(self, data):
        for document in data:
            for field in self.fields:
                value = document.get(field)
                if not value and self.rules.get('alert_on_missing_field'):
                    document['missing_field'] = field
                    self.add_match(document)
                elif value:
                    if value not in self.seen_values[field]:
                        document['new_field'] = field
                        self.add_match(document)
                        self.seen_values[field].append(value)

    def add_terms_data(self, terms):
        # With terms query, len(self.fields) is always 1
        field = self.fields[0]
        for timestamp, buckets in terms.iteritems():
            for bucket in buckets:
                if bucket['doc_count']:
                    if bucket['key'] not in self.seen_values[field]:
                        match = {field: bucket['key'],
                                 self.rules['timestamp_field']: timestamp,
                                 'new_field': field}
                        self.add_match(match)
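
Compared with Exemple #23, this variant bounds the aggregation to the terms window by wrapping it in a filter aggregation. The body it ends up sending looks roughly like this (hypothetical field and timestamps):

body = {
    'aggs': {
        'filtered': {
            'filter': {'bool': {'must': [{'range': {
                '@timestamp': {'gte': '2014-12-01T00:00:00Z', 'lte': '2014-12-31T00:00:00Z'}}}]}},
            'aggs': {'values': {'terms': {'field': 'username', 'size': 2147483647}}},
        }
    }
}
# Buckets are then read from res['aggregations']['filtered']['values']['buckets'].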
Exemple #25
0
def match_query(es: _es, index: str, text: str):
    query = {
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "matn_p": {
                                "query": text,
                                # "operator": "and"
                            }
                        }
                    },
                    # {
                    #     "intervals": {
                    #         "matn_p": {
                    #             "all_of": {
                    #                 "ordered": True,
                    #                 "intervals": [
                    #                     {
                    #                         "match": {
                    #                             "query": text,
                    #                             "max_gaps": 0,
                    #                             "ordered": True
                    #                         }
                    #                     },
                    #                     # {
                    #                     #     "any_of": {
                    #                     #         "intervals": [
                    #                     #             {"match": {"query": text}},
                    #                     #             {"match": {"query": text}}
                    #                     #         ]
                    #                     #     }
                    #                     # }
                    #                 ]
                    #             }
                    #         }
                    #     }
                    # }
                ],
                "filter": [
                    {
                        "match": {
                            "matn_p": {
                                "query": text,
                                # "operator": "and"
                            }
                        }
                    },
                ],
                "should": [
                    # {
                    #     "match": {
                    #         "matn_p": {
                    #             "query": text,
                    #             "operator": "and"
                    #         }
                    #     }
                    # },

                    # {
                    #     "intervals": {
                    #         "matn_p": {
                    #             "all_of": {
                    #                 "ordered": True,
                    #                 "intervals": [
                    #                     {
                    #                         "match": {
                    #                             "query": text,
                    #                             "max_gaps": 0,
                    #                             "ordered": True
                    #                         }
                    #                     },
                    #                     # {
                    #                     #     "any_of": {
                    #                     #         "intervals": [
                    #                     #             {"match": {"query": text}},
                    #                     #             {"match": {"query": text}}
                    #                     #         ]
                    #                     #     }
                    #                     # }
                    #                 ]
                    #             }
                    #         }
                    #     }
                    # }
                ]
            }
        }
    }
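    # NOTE: the bool query above is built but never sent; it is immediately
    # overwritten by the simple_query_string query below, which searches both
    # the matn_p and isnad_p fields.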

    query = {
        "query": {
            "simple_query_string": {
                "fields": ["matn_p","isnad_p"],
                "query": text  ,
                "flags": "OR|AND|PREFIX"
            }
        }
    }
    return es.search(index=index, body=query)
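
# A minimal usage sketch for match_query, assuming a reachable cluster and a
# hypothetical index name and search text:
#
#     es = Elasticsearch(hosts=['http://localhost:9200'])
#     res = match_query(es, index='hadith', text='some phrase')
#     for hit in res['hits']['hits']:
#         print(hit['_score'], hit['_source'].get('matn_p'))
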
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Elasticsearch host')
    parser.add_argument('--port', type=int, help='Elasticsearch port')
    parser.add_argument('--url-prefix', help='Elasticsearch URL prefix')
    parser.add_argument('--no-auth', action='store_const', const=True, help='Suppress prompt for basic auth')
    parser.add_argument('--ssl', action='store_true', default=None, help='Use SSL')
    parser.add_argument('--no-ssl', dest='ssl', action='store_false', help='Do not use SSL')
    parser.add_argument('--index', help='Index name to create')
    parser.add_argument('--old-index', help='Old index name to copy')
    args = parser.parse_args()

    if os.path.isfile('../config.yaml'):
        filename = '../config.yaml'
    elif os.path.isfile('config.yaml'):
        filename = 'config.yaml'
    else:
        filename = ''

    username = None
    password = None
    use_ssl = None
    url_prefix = None
    http_auth = None

    if filename:
        with open(filename) as config_file:
            data = yaml.load(config_file)
        host = data.get('es_host')
        port = data.get('es_port')
        username = data.get('es_username')
        password = data.get('es_password')
        url_prefix = data.get('es_url_prefix', '')
        use_ssl = data.get('use_ssl')
    else:
        host = args.host if args.host else raw_input('Enter elasticsearch host: ')
        port = args.port if args.port else int(raw_input('Enter elasticsearch port: '))
        use_ssl = (args.ssl if args.ssl is not None
                   else raw_input('Use SSL? t/f: ').lower() in ('t', 'true'))
        if args.no_auth is None:
            username = raw_input('Enter optional basic-auth username: ')
            password = raw_input('Enter optional basic-auth password: ')
        url_prefix = (args.url_prefix if args.url_prefix is not None
                      else raw_input('Enter optional Elasticsearch URL prefix: '))

    if username and password:
        http_auth = username + ':' + password

    es = Elasticsearch(host=host, port=port, use_ssl=use_ssl, http_auth=http_auth, url_prefix=url_prefix)

    silence_mapping = {'silence': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                  'until': {'type': 'date', 'format': 'dateOptionalTime'},
                                                  '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    ess_mapping = {'elastalert_status': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                        '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}
    es_mapping = {'elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
                                                'match_body': {'enabled': False, 'type': 'object'},
                                                'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    past_mapping = {'past_elastalert': {'properties': {'rule_name': {'index': 'not_analyzed', 'type': 'string'},
                                                       'match_body': {'enabled': False, 'type': 'object'},
                                                       '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'},
                                                       'aggregate_id': {'index': 'not_analyzed', 'type': 'string'}}}}
    error_mapping = {'elastalert_error': {'properties': {'data': {'type': 'object', 'enabled': False},
                                                         '@timestamp': {'format': 'dateOptionalTime', 'type': 'date'}}}}

    index = args.index if args.index is not None else raw_input('New index name? (Default elastalert_status) ')
    if not index:
        index = 'elastalert_status'

    old_index = (args.old_index if args.old_index is not None
                 else raw_input('Name of existing index to copy? (Default None) '))

    res = None
    if old_index:
        print('Downloading existing data...')
        res = es.search(index=old_index, body={}, size=500000)
        print('Got %s documents' % (len(res['hits']['hits'])))

    es.indices.create(index)
    # To avoid a race condition. TODO: replace this with a real check
    time.sleep(2)
    es.indices.put_mapping(index=index, doc_type='elastalert', body=es_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_status', body=ess_mapping)
    es.indices.put_mapping(index=index, doc_type='silence', body=silence_mapping)
    es.indices.put_mapping(index=index, doc_type='elastalert_error', body=error_mapping)
    es.indices.put_mapping(index=index, doc_type='past_elastalert', body=past_mapping)
    print('New index %s created' % (index))

    if res:
        bulk = ''.join(['%s\n%s\n' % (json.dumps({'create': {'_type': doc['_type'], '_index': index}}),
                                      json.dumps(doc['_source'])) for doc in res['hits']['hits']])
        print('Uploading data...')
        es.bulk(body=bulk, index=index)

    print('Done!')
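
The bulk body assembled above is newline-delimited JSON: one action line followed by one source line per copied document. For a single hypothetical silence document the payload would look like this:

bulk_example = (
    '{"create": {"_type": "silence", "_index": "elastalert_status"}}\n'
    '{"rule_name": "example_rule", "until": "2015-01-01T00:00:00Z", '
    '"@timestamp": "2015-01-01T00:00:00Z"}\n'
)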
Exemple #27
0
def boolean_query(es: _es, index: str):
    # Minimal bool query matching every document; extend the clauses as needed.
    query = {"query": {"bool": {"must": [{"match_all": {}}]}}}
    return es.search(index=index, body=query)
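
For comparison, a fuller bool query combining scored and unscored clauses, assuming the same hypothetical matn_p/isnad_p fields as in Exemple #25:

query = {
    'query': {
        'bool': {
            'must': [{'match': {'matn_p': {'query': 'some phrase'}}}],    # required, scored
            'filter': [{'match': {'isnad_p': 'some narrator'}}],          # required, not scored
            'should': [{'match_phrase': {'matn_p': 'some phrase'}}],      # optional boost
        }
    }
}
# results = es.search(index='hadith', body=query)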
Exemple #28
0
class NewTermsRule(RuleType):
    """ Alerts on a new value in a list of fields. """

    def __init__(self, rule, args=None):
        super(NewTermsRule, self).__init__(rule, args)
        self.seen_values = {}
        # Allow the use of query_key or fields
        if 'fields' not in self.rules:
            if 'query_key' not in self.rules:
                raise EAException("fields or query_key must be specified")
            self.fields = self.rules['query_key']
        else:
            self.fields = self.rules['fields']
        if not self.fields:
            raise EAException("fields must not be an empty list")
        if type(self.fields) != list:
            self.fields = [self.fields]
        if self.rules.get('use_terms_query') and (
            len(self.fields) != 1 or len(self.fields) == 1 and type(self.fields[0]) == list
        ):
            raise EAException("use_terms_query can only be used with a single non-composite field")
        try:
            self.get_all_terms(args)
        except Exception as e:
            # Refuse to start if we cannot get existing terms
            raise EAException('Error searching for existing terms: %s' % (repr(e)))

    def get_all_terms(self, args):
        """ Performs a terms aggregation for each field to get every existing term. """
        self.es = Elasticsearch(
            host=self.rules['es_host'],
            port=self.rules['es_port'],
            timeout=self.rules.get('es_conn_timeout', 50),
            send_get_body_as=self.rules.get('send_get_body_as', 'GET')
        )
        window_size = datetime.timedelta(**self.rules.get('terms_window_size', {'days': 30}))
        field_name = {"field": "", "size": 2147483647}  # Integer.MAX_VALUE
        query_template = {"aggs": {"values": {"terms": field_name}}}
        if args and args.start:
            end = ts_to_dt(args.start)
        else:
            end = ts_now()
        start = end - window_size
        step = datetime.timedelta(**self.rules.get('window_step_size', {'days': 1}))

        for field in self.fields:
            tmp_start = start
            tmp_end = min(start + step, end)

            time_filter = {self.rules['timestamp_field']: {'lt': dt_to_ts(tmp_end), 'gte': dt_to_ts(tmp_start)}}
            query_template['filter'] = {'bool': {'must': [{'range': time_filter}]}}
            query = {'aggs': {'filtered': query_template}}
            # For composite keys, we will need to perform sub-aggregations
            if type(field) == list:
                self.seen_values.setdefault(tuple(field), [])
                level = query_template['aggs']
                # Iterate on each part of the composite key and add a sub aggs clause to the elastic search query
                for i, sub_field in enumerate(field):
                    level['values']['terms']['field'] = add_raw_postfix(sub_field)
                    if i < len(field) - 1:
                        # If we have more fields after the current one, then set up the next nested structure
                        level['values']['aggs'] = {'values': {'terms': copy.deepcopy(field_name)}}
                        level = level['values']['aggs']
            else:
                self.seen_values.setdefault(field, [])
                # For non-composite keys, only a single agg is needed
                field_name['field'] = add_raw_postfix(field)

            # Query the entire time range in small chunks
            while tmp_start < end:
                if self.rules.get('use_strftime_index'):
                    index = format_index(self.rules['index'], tmp_start, tmp_end)
                else:
                    index = self.rules['index']
                res = self.es.search(body=query, index=index, ignore_unavailable=True, timeout='50s')
                if 'aggregations' in res:
                    buckets = res['aggregations']['filtered']['values']['buckets']
                    if type(field) == list:
                        # For composite keys, make the lookup based on all fields
                        # Make it a tuple since it can be hashed and used in dictionary lookups
                        for bucket in buckets:
                            # We need to walk down the hierarchy and obtain the value at each level
                            self.seen_values[tuple(field)] += self.flatten_aggregation_hierarchy(bucket)
                    else:
                        keys = [bucket['key'] for bucket in buckets]
                        self.seen_values[field] += keys
                else:
                    self.seen_values.setdefault(field, [])
                if tmp_start == tmp_end:
                    break
                tmp_start = tmp_end
                tmp_end = min(tmp_start + step, end)
                time_filter[self.rules['timestamp_field']] = {'lt': dt_to_ts(tmp_end), 'gte': dt_to_ts(tmp_start)}

            for key, values in self.seen_values.iteritems():
                if not values:
                    if type(key) == tuple:
                        # If we don't have any results, it could either be because of the absence of any baseline data
                        # OR it may be because the composite key contained a non-primitive type.  Either way, give the
                        # end-users a heads up to help them debug what might be going on.
                        elastalert_logger.warning((
                            'No results were found from all sub-aggregations.  This can either indicate that there is '
                            'no baseline data OR that a non-primitive field was used in a composite key.'
                        ))
                    else:
                        elastalert_logger.info('Found no values for %s' % (field))
                    continue
                self.seen_values[key] = list(set(values))
                elastalert_logger.info('Found %s unique values for %s' % (len(values), key))

    def flatten_aggregation_hierarchy(self, root, hierarchy_tuple=()):
        """ For nested aggregations, the results come back in the following format:
            {
            "aggregations" : {
                "filtered" : {
                  "doc_count" : 37,
                  "values" : {
                    "doc_count_error_upper_bound" : 0,
                    "sum_other_doc_count" : 0,
                    "buckets" : [ {
                      "key" : "1.1.1.1", # IP address (root)
                      "doc_count" : 13,
                      "values" : {
                        "doc_count_error_upper_bound" : 0,
                        "sum_other_doc_count" : 0,
                        "buckets" : [ {
                          "key" : "80",    # Port (sub-aggregation)
                          "doc_count" : 3,
                          "values" : {
                            "doc_count_error_upper_bound" : 0,
                            "sum_other_doc_count" : 0,
                            "buckets" : [ {
                              "key" : "ack",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            }, {
                              "key" : "syn",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 1
                            } ]
                          }
                        }, {
                          "key" : "82",    # Port (sub-aggregation)
                          "doc_count" : 3,
                          "values" : {
                            "doc_count_error_upper_bound" : 0,
                            "sum_other_doc_count" : 0,
                            "buckets" : [ {
                              "key" : "ack",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            }, {
                              "key" : "syn",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            } ]
                          }
                        } ]
                      }
                    }, {
                      "key" : "2.2.2.2", # IP address (root)
                      "doc_count" : 4,
                      "values" : {
                        "doc_count_error_upper_bound" : 0,
                        "sum_other_doc_count" : 0,
                        "buckets" : [ {
                          "key" : "443",    # Port (sub-aggregation)
                          "doc_count" : 3,
                          "values" : {
                            "doc_count_error_upper_bound" : 0,
                            "sum_other_doc_count" : 0,
                            "buckets" : [ {
                              "key" : "ack",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            }, {
                              "key" : "syn",  # Reason (sub-aggregation, leaf-node)
                              "doc_count" : 3
                            } ]
                          }
                        } ]
                      }
                    } ]
                  }
                }
              }
            }

            Each level will either have more values and buckets, or it will be a leaf node
            We'll ultimately return a flattened list with the hierarchies appended as strings,
            e.g the above snippet would yield a list with:

            [
             ('1.1.1.1', '80', 'ack'),
             ('1.1.1.1', '80', 'syn'),
             ('1.1.1.1', '82', 'ack'),
             ('1.1.1.1', '82', 'syn'),
             ('2.2.2.2', '443', 'ack'),
             ('2.2.2.2', '443', 'syn')
            ]

            A similar formatting will be performed in the add_data method and used as the basis for comparison

        """
        results = []
        # There are more aggregation hierarchies left.  Traverse them.
        if 'values' in root:
            results += self.flatten_aggregation_hierarchy(root['values']['buckets'], hierarchy_tuple + (root['key'],))
        else:
            # We've gotten to a sub-aggregation, which may have further sub-aggregations
            # See if we need to traverse further
            for node in root:
                if 'values' in node:
                    results += self.flatten_aggregation_hierarchy(node, hierarchy_tuple)
                else:
                    results.append(hierarchy_tuple + (node['key'],))
        return results

    def add_data(self, data):
        for document in data:
            for field in self.fields:
                value = ()
                lookup_field = field
                if type(field) == list:
                    # For composite keys, make the lookup based on all fields
                    # Make it a tuple since it can be hashed and used in dictionary lookups
                    lookup_field = tuple(field)
                    for sub_field in field:
                        lookup_result = lookup_es_key(document, sub_field)
                        if not lookup_result:
                            value = None
                            break
                        value += (lookup_result,)
                else:
                    value = lookup_es_key(document, field)
                if not value and self.rules.get('alert_on_missing_field'):
                    document['missing_field'] = lookup_field
                    self.add_match(copy.deepcopy(document))
                elif value:
                    if value not in self.seen_values[lookup_field]:
                        document['new_field'] = lookup_field
                        self.add_match(copy.deepcopy(document))
                        self.seen_values[lookup_field].append(value)

    def add_terms_data(self, terms):
        # With terms query, len(self.fields) is always 1 and the 0'th entry is always a string
        field = self.fields[0]
        for timestamp, buckets in terms.iteritems():
            for bucket in buckets:
                if bucket['doc_count']:
                    if bucket['key'] not in self.seen_values[field]:
                        match = {field: bucket['key'],
                                 self.rules['timestamp_field']: timestamp,
                                 'new_field': field}
                        self.add_match(match)
                        self.seen_values[field].append(bucket['key'])
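
For a composite key such as ['src_ip', 'dest_port'], the loop above nests one terms aggregation per sub-field inside the time-filtered aggregation. The generated body is roughly the following (hypothetical fields, assuming add_raw_postfix appends '.raw'):

query = {
    'aggs': {
        'filtered': {
            'filter': {'bool': {'must': [{'range': {
                '@timestamp': {'gte': '2014-12-01T00:00:00Z', 'lt': '2014-12-02T00:00:00Z'}}}]}},
            'aggs': {
                'values': {
                    'terms': {'field': 'src_ip.raw', 'size': 2147483647},
                    'aggs': {'values': {'terms': {'field': 'dest_port.raw', 'size': 2147483647}}},
                }
            },
        }
    }
}
# flatten_aggregation_hierarchy() then collapses the nested buckets into tuples
# such as ('1.1.1.1', '80').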
Exemple #29
0
class ElastAlerter():
    """ The main Elastalert runner. This class holds all state about active rules,
    controls when queries are run, and passes information between rules and alerts.

    :param args: An argparse arguments instance. Should contain debug and start

    :param conf: The configuration dictionary. At the top level, this
    contains global options, and under 'rules', contains all state relating
    to rules and alerts. In each rule in conf['rules'], the RuleType and Alerter
    instances live under 'type' and 'alerts', respectively. The conf dictionary
    should not be passed directly from a configuration file, but must be populated
    by config.py:load_rules instead. """
    def parse_args(self, args):
        parser = argparse.ArgumentParser()
        parser.add_argument('--config',
                            action='store',
                            dest='config',
                            default="config.yaml",
                            help='Global config file (default: config.yaml)')
        parser.add_argument(
            '--debug',
            action='store_true',
            dest='debug',
            help='Suppresses alerts and prints information instead')
        parser.add_argument(
            '--rule',
            dest='rule',
            help=
            'Run only a specific rule (by filename, must still be in rules folder)'
        )
        parser.add_argument(
            '--silence',
            dest='silence',
            help=
            'Silence rule for a time period. Must be used with --rule. Usage: '
            '--silence <units>=<number>, eg. --silence hours=2')
        parser.add_argument(
            '--start',
            dest='start',
            help=
            'YYYY-MM-DDTHH:MM:SS Start querying from this timestamp. (Default: present)'
        )
        parser.add_argument(
            '--end',
            dest='end',
            help=
            'YYYY-MM-DDTHH:MM:SS Query to this timestamp. (Default: present)')
        parser.add_argument(
            '--verbose',
            action='store_true',
            dest='verbose',
            help='Increase verbosity without suppressing alerts')
        parser.add_argument(
            '--pin_rules',
            action='store_true',
            dest='pin_rules',
            help='Stop ElastAlert from monitoring config file changes')
        self.args = parser.parse_args(args)

    def __init__(self, args):
        self.parse_args(args)
        self.conf = load_rules(self.args.config, use_rule=self.args.rule)
        self.max_query_size = self.conf['max_query_size']
        self.rules = self.conf['rules']
        self.debug = self.args.debug
        self.verbose = self.args.verbose
        self.writeback_index = self.conf['writeback_index']
        self.es_host = self.conf['es_host']
        self.es_port = self.conf['es_port']
        self.run_every = self.conf['run_every']
        self.alert_time_limit = self.conf['alert_time_limit']
        self.old_query_limit = self.conf['old_query_limit']
        self.alerts_sent = 0
        self.num_hits = 0
        self.current_es = None
        self.current_es_addr = None
        self.buffer_time = self.conf['buffer_time']
        self.silence_cache = {}
        self.rule_hashes = get_rule_hashes(self.conf)

        self.writeback_es = Elasticsearch(host=self.es_host, port=self.es_port)

        if self.debug:
            self.verbose = True

        if self.verbose:
            logging.getLogger().setLevel(logging.INFO)

        for rule in self.rules:
            rule = self.init_rule(rule)

        if self.args.silence:
            self.silence()

    @staticmethod
    def get_index(rule, starttime=None, endtime=None):
        """ Gets the index for a rule. If strftime is set and starttime and endtime
        are provided, it will return a comma separated list of indices. If strftime
        is set but starttime and endtime are not provided, it will replace all format
        tokens with a wildcard. """
        index = rule['index']
        if rule.get('use_strftime_index'):
            if starttime and endtime:
                return format_index(index, starttime, endtime)
            else:
                # Replace the substring containing format characters with a *
                format_start = index.find('%')
                format_end = index.rfind('%') + 2
                return index[:format_start] + '*' + index[format_end:]
        else:
            return index

    @staticmethod
    def get_query(filters,
                  starttime=None,
                  endtime=None,
                  sort=True,
                  timestamp_field='@timestamp'):
        """ Returns a query dict that will apply a list of filters, filter by
        start and end time, and sort results by timestamp.

        :param filters: A list of elasticsearch filters to use.
        :param starttime: A timestamp to use as the start time of the query.
        :param endtime: A timestamp to use as the end time of the query.
        :param sort: If true, sort results by timestamp. (Default True)
        :return: A query dictionary to pass to elasticsearch.
        """
        starttime = dt_to_ts(starttime)
        endtime = dt_to_ts(endtime)
        filters = copy.copy(filters)
        query = {'filter': {'bool': {'must': filters}}}
        if starttime and endtime:
            query['filter']['bool']['must'].append({
                'range': {
                    timestamp_field: {
                        'from': starttime,
                        'to': endtime
                    }
                }
            })
        if sort:
            query['sort'] = [{timestamp_field: {'order': 'asc'}}]
        return query

    def get_terms_query(self, query, size, field):
        """ Takes a query generated by get_query and outputs a aggregation query """
        if 'sort' in query:
            query.pop('sort')
        query.update(
            {'aggs': {
                'counts': {
                    'terms': {
                        'field': field,
                        'size': size
                    }
                }
            }})
        aggs_query = {'aggs': {'filtered': query}}
        return aggs_query

    def get_index_start(self, index, timestamp_field='@timestamp'):
        """ Query for one result sorted by timestamp to find the beginning of the index.

        :param index: The index of which to find the earliest event.
        :return: Timestamp of the earliest event.
        """
        query = {'sort': {timestamp_field: {'order': 'asc'}}}
        try:
            res = self.current_es.search(index=index,
                                         size=1,
                                         body=query,
                                         _source_include=[timestamp_field],
                                         ignore_unavailable=True)
        except ElasticsearchException as e:
            self.handle_error("Elasticsearch query error: %s" % (e),
                              {'index': index})
            return '1969-12-30T00:00:00Z'
        if len(res['hits']['hits']) == 0:
            # Index is completely empty, return a date before the epoch
            return '1969-12-30T00:00:00Z'
        timestamp = res['hits']['hits'][0]['_source'][timestamp_field]
        return timestamp

    def get_hits(self, rule, starttime, endtime, index):
        """ Query elasticsearch for the given rule and return the results.

        :param rule: The rule configuration.
        :param starttime: The earliest time to query.
        :param endtime: The latest time to query.
        :return: A list of hits, bounded by self.max_query_size.
        """
        query = self.get_query(rule['filter'],
                               starttime,
                               endtime,
                               timestamp_field=rule['timestamp_field'])
        try:
            res = self.current_es.search(index=index,
                                         size=self.max_query_size,
                                         body=query,
                                         _source_include=rule['include'],
                                         ignore_unavailable=True)
        except ElasticsearchException as e:
            # Elasticsearch sometimes gives us GIGANTIC error messages
            # (so big that they will fill the entire terminal buffer)
            if len(str(e)) > 1024:
                e = str(e)[:1024] + '... (%d characters removed)' % (
                    len(str(e)) - 1024)
            self.handle_error('Error running query: %s' % (e),
                              {'rule': rule['name']})
            return None

        hits = res['hits']['hits']
        self.num_hits += len(hits)
        lt = rule.get('use_local_time')
        logging.info("Queried rule %s from %s to %s: %s hits" %
                     (rule['name'], pretty_ts(
                         starttime, lt), pretty_ts(endtime, lt), len(hits)))
        self.replace_ts(hits, rule)

        # Record doc_type for use in get_top_counts
        if 'doc_type' not in rule and len(hits):
            rule['doc_type'] = hits[0]['_type']
        return hits

    def replace_ts(self, hits, rule):
        for hit in hits:
            hit['_source'][rule['timestamp_field']] = ts_to_dt(
                hit['_source'][rule['timestamp_field']])

    def get_hits_count(self, rule, starttime, endtime, index):
        """ Query elasticsearch for the count of results and returns a list of timestamps
        equal to the endtime. This allows the results to be passed to rules which expect
        an object for each hit.

        :param rule: The rule configuration dictionary.
        :param starttime: The earliest time to query.
        :param endtime: The latest time to query.
        :return: A dictionary mapping timestamps to number of hits for that time period.
        """
        query = self.get_query(rule['filter'],
                               starttime,
                               endtime,
                               timestamp_field=rule['timestamp_field'],
                               sort=False)
        query = {'query': {'filtered': query}}

        try:
            res = self.current_es.count(index=index,
                                        doc_type=rule['doc_type'],
                                        body=query,
                                        ignore_unavailable=True)
        except ElasticsearchException as e:
            # Elasticsearch sometimes gives us GIGANTIC error messages
            # (so big that they will fill the entire terminal buffer)
            if len(str(e)) > 1024:
                e = str(e)[:1024] + '... (%d characters removed)' % (
                    len(str(e)) - 1024)
            self.handle_error('Error running count query: %s' % (e),
                              {'rule': rule['name']})
            return None

        self.num_hits += res['count']
        lt = rule.get('use_local_time')
        logging.info("Queried rule %s from %s to %s: %s hits" %
                     (rule['name'], pretty_ts(
                         starttime, lt), pretty_ts(endtime, lt), res['count']))
        return {endtime: res['count']}

    def get_hits_terms(self, rule, starttime, endtime, index, key, qk=None):
        rule_filter = copy.copy(rule['filter'])
        if qk:
            filter_key = rule['query_key']
            if rule.get('raw_count_keys',
                        True) and not rule['query_key'].endswith('.raw'):
                filter_key += '.raw'
            rule_filter.extend([{'term': {filter_key: qk}}])
        base_query = self.get_query(rule_filter,
                                    starttime,
                                    endtime,
                                    timestamp_field=rule['timestamp_field'],
                                    sort=False)
        query = self.get_terms_query(base_query, rule.get('terms_size', 5),
                                     key)

        try:
            res = self.current_es.search(index=index,
                                         doc_type=rule['doc_type'],
                                         body=query,
                                         search_type='count',
                                         ignore_unavailable=True)
        except ElasticsearchException as e:
            # Elasticsearch sometimes gives us GIGANTIC error messages
            # (so big that they will fill the entire terminal buffer)
            if len(str(e)) > 1024:
                e = str(e)[:1024] + '... (%d characters removed)' % (
                    len(str(e)) - 1024)
            self.handle_error('Error running query: %s' % (e),
                              {'rule': rule['name']})
            return None

        buckets = res['aggregations']['filtered']['counts']['buckets']
        self.num_hits += len(buckets)
        lt = rule.get('use_local_time')
        logging.info('Queried rule %s from %s to %s: %s buckets' %
                     (rule['name'], pretty_ts(
                         starttime, lt), pretty_ts(endtime, lt), len(buckets)))
        return {endtime: buckets}

    def remove_duplicate_events(self, data, rule):
        # Remove data we've processed already
        data = [
            event for event in data
            if event['_id'] not in rule['processed_hits']
        ]

        # Remember the new data's IDs
        for event in data:
            rule['processed_hits'][event['_id']] = event['_source'][
                rule['timestamp_field']]

        return [event['_source'] for event in data]

    def remove_old_events(self, rule):
        # Anything older than the buffer time we can forget
        now = ts_now()
        remove = []
        buffer_time = rule.get('buffer_time', self.buffer_time)
        for _id, timestamp in rule['processed_hits'].iteritems():
            if now - timestamp > buffer_time:
                remove.append(_id)
        map(rule['processed_hits'].pop, remove)

    def run_query(self, rule, start=None, end=None):
        """ Query for the rule and pass all of the results to the RuleType instance.

        :param rule: The rule configuration.
        :param start: The earliest time to query.
        :param end: The latest time to query.
        Returns True on success and False on failure.
        """
        if start is None:
            start = self.get_index_start(rule['index'])
        if end is None:
            end = ts_now()

        # Reset hit counter and query
        rule_inst = rule['type']
        prev_num_hits = self.num_hits
        max_size = rule.get('max_query_size', self.max_query_size)
        index = self.get_index(rule, start, end)
        if rule.get('use_count_query'):
            data = self.get_hits_count(rule, start, end, index)
        elif rule.get('use_terms_query'):
            data = self.get_hits_terms(rule, start, end, index,
                                       rule['query_key'])
        else:
            data = self.get_hits(rule, start, end, index)
            if data:
                data = self.remove_duplicate_events(data, rule)

        # There was an exception while querying
        if data is None:
            return False
        elif data:
            if rule.get('use_count_query'):
                rule_inst.add_count_data(data)
            elif rule.get('use_terms_query'):
                rule_inst.add_terms_data(data)
            else:
                rule_inst.add_data(data)

        # Warn if we hit max_query_size
        if self.num_hits - prev_num_hits == max_size and not rule.get(
                'use_count_query'):
            logging.warning("Hit max_query_size (%s) while querying for %s" %
                            (max_size, rule['name']))

        return True

    def get_starttime(self, rule):
        """ Query ES for the last time we ran this rule.

        :param rule: The rule configuration.
        :return: A timestamp or None.
        """
        query = {
            'filter': {
                'term': {
                    'rule_name': '%s' % (rule['name'])
                }
            },
            'sort': {
                '@timestamp': {
                    'order': 'desc'
                }
            }
        }
        try:
            if self.writeback_es:
                res = self.writeback_es.search(
                    index=self.writeback_index,
                    doc_type='elastalert_status',
                    size=1,
                    body=query,
                    _source_include=['endtime', 'rule_name'])
                if res['hits']['hits']:
                    endtime = ts_to_dt(
                        res['hits']['hits'][0]['_source']['endtime'])

                    if ts_now() - endtime < self.old_query_limit:
                        return endtime
                    else:
                        logging.info(
                            "Found expired previous run for %s at %s" %
                            (rule['name'], endtime))
                        return None
        except (ElasticsearchException, KeyError) as e:
            self.handle_error('Error querying for last run: %s' % (e),
                              {'rule': rule['name']})
            self.writeback_es = None

        return None

    def set_starttime(self, rule, endtime):
        """ Given a rule and an endtime, sets the appropriate starttime for it. """

        # This means we are starting fresh
        if 'starttime' not in rule:
            # Try to get the last run from elasticsearch
            last_run_end = self.get_starttime(rule)
            if last_run_end:
                rule['starttime'] = last_run_end
                return

        # Use buffer for normal queries, or run_every increments otherwise
        buffer_time = rule.get('buffer_time', self.buffer_time)
        if not rule.get('use_count_query') and not rule.get('use_terms_query'):
            rule['starttime'] = endtime - buffer_time
        else:
            rule['starttime'] = endtime - self.run_every

    def run_rule(self, rule, endtime, starttime=None):
        """ Run a rule for a given time period, including querying and alerting on results.

        :param rule: The rule configuration.
        :param starttime: The earliest timestamp to query.
        :param endtime: The latest timestamp to query.
        :return: The number of matches that the rule produced.
        """
        run_start = time.time()
        self.current_es = Elasticsearch(host=rule['es_host'],
                                        port=rule['es_port'])
        self.current_es_addr = (rule['es_host'], rule['es_port'])

        # If there are pending aggregate matches, try processing them
        for x in range(len(rule['agg_matches'])):
            match = rule['agg_matches'].pop()
            self.add_aggregated_alert(match, rule)

        # Start from provided time if it's given
        if starttime:
            rule['starttime'] = starttime
        else:
            self.set_starttime(rule, endtime)
        rule['original_starttime'] = rule['starttime']

        # Don't run if starttime was set to the future
        if ts_now() <= rule['starttime']:
            logging.warning(
                "Attempted to use query start time in the future (%s), sleeping instead"
                % (starttime))
            return 0

        # Run the rule
        # If querying over a large time period, split it up into chunks
        self.num_hits = 0
        tmp_endtime = endtime
        buffer_time = rule.get('buffer_time', self.buffer_time)
        while endtime - rule['starttime'] > buffer_time:
            tmp_endtime = rule['starttime'] + self.run_every
            if not self.run_query(rule, rule['starttime'], tmp_endtime):
                return 0
            rule['starttime'] = tmp_endtime
        if not self.run_query(rule, rule['starttime'], endtime):
            return 0

        rule['type'].garbage_collect(endtime)

        # Process any new matches
        num_matches = len(rule['type'].matches)
        while rule['type'].matches:
            match = rule['type'].matches.pop(0)

            # If realert is set, silence the rule for that duration
            # Silence is cached by query_key, if it exists
            # Default realert time is 0 seconds

            # concatenate query_key (or none) with rule_name to form silence_cache key
            if 'query_key' in rule:
                try:
                    key = '.' + match[rule['query_key']]
                except KeyError:
                    # Some matches may not have a query key
                    key = ''
            else:
                key = ''

            if self.is_silenced(rule['name'] + key) or self.is_silenced(
                    rule['name']):
                logging.info('Ignoring match for silenced rule %s%s' %
                             (rule['name'], key))
                continue

            if rule['realert']:
                self.set_realert(
                    rule['name'] + key,
                    dt_to_ts(datetime.datetime.utcnow() + rule['realert']))

            # If no aggregation, alert immediately
            if not rule['aggregation']:
                self.alert([match], rule)
                continue

            # Add it as an aggregated match
            self.add_aggregated_alert(match, rule)

        time_taken = time.time() - run_start
        # Write to ES that we've run this rule against this time period
        body = {
            'rule_name': rule['name'],
            'endtime': endtime,
            'starttime': rule['starttime'],
            'matches': num_matches,
            'hits': self.num_hits,
            '@timestamp': ts_now(),
            'time_taken': time_taken
        }
        self.writeback('elastalert_status', body)

        return num_matches

    def init_rule(self, new_rule, new=True):
        ''' Copies some necessary non-config state from an existing rule to a new rule. '''
        if 'download_dashboard' in new_rule['filter']:
            # Download filters from kibana and set the rules filters to them
            db_filters = self.filters_from_kibana(
                new_rule, new_rule['filter']['download_dashboard'])
            if db_filters is not None:
                new_rule['filter'] = db_filters
            else:
                raise EAException("Could not download filters from %s" %
                                  (new_rule['filter']['download_dashboard']))

        blank_rule = {
            'agg_matches': [],
            'current_aggregate_id': None,
            'processed_hits': {}
        }
        rule = blank_rule

        # Set rule to either a blank template or existing rule with same name
        if not new:
            for rule in self.rules:
                if rule['name'] == new_rule['name']:
                    break
            else:
                logging.warning(
                    "Couldn't find existing rule %s, starting from scratch" %
                    (new_rule['name']))
                rule = blank_rule

        copy_properties = [
            'agg_matches', 'current_aggregate_id', 'processed_hits',
            'starttime'
        ]
        for prop in copy_properties:
            if prop == 'starttime' and 'starttime' not in rule:
                continue
            new_rule[prop] = rule[prop]

        return new_rule

    def load_rule_changes(self):
        ''' Using the hashes of the rule config files, syncs the running rules
        to match the files in rules_folder by removing, adding or reloading rules. '''
        rule_hashes = get_rule_hashes(self.conf)

        # Check each current rule for changes
        for rule_file, hash_value in self.rule_hashes.iteritems():
            if rule_file not in rule_hashes:
                # Rule file was deleted
                logging.info(
                    'Rule file %s not found, stopping rule execution' %
                    (rule_file))
                self.rules = [
                    rule for rule in self.rules
                    if rule['rule_file'] != rule_file
                ]
                continue
            if hash_value != rule_hashes[rule_file]:
                # Rule file was changed, reload rule
                try:
                    new_rule = load_configuration(
                        os.path.join(self.conf['rules_folder'], rule_file))
                except EAException as e:
                    self.handle_error('Could not load rule %s: %s' %
                                      (rule_file, e))
                    continue
                logging.info("Reloading configuration for rule %s" %
                             (rule_file))

                # Initialize the rule that matches rule_file
                self.rules = [
                    rule if rule['rule_file'] != rule_file else self.init_rule(
                        new_rule, False) for rule in self.rules
                ]

        # Load new rules
        if not self.args.rule:
            for rule_file in set(rule_hashes.keys()) - set(
                    self.rule_hashes.keys()):
                try:
                    new_rule = load_configuration(
                        os.path.join(self.conf['rules_folder'], rule_file))
                except EAException as e:
                    self.handle_error('Could not load rule %s: %s' %
                                      (rule_file, e))
                    continue
                logging.info('Loaded new rule %s' % (rule_file))
                self.rules.append(self.init_rule(new_rule))

        self.rule_hashes = rule_hashes

    def start(self):
        """ Periodically go through each rule and run it """
        starttime = self.args.start
        if starttime:
            try:
                starttime = ts_to_dt(starttime)
            except (TypeError, ValueError):
                self.handle_error(
                    "%s is not a valid ISO 8601 timestamp (YYYY-MM-DDTHH:MM:SS+XX:00)"
                    % (starttime))
                exit(1)
        while True:
            # If writeback_es errored, it's disabled until the next query cycle
            if not self.writeback_es:
                self.writeback_es = Elasticsearch(host=self.es_host,
                                                  port=self.es_port)

            self.send_pending_alerts()

            next_run = datetime.datetime.utcnow() + self.run_every

            for rule in self.rules:
                # Set endtime based on the rule's delay
                delay = rule.get('query_delay')
                if hasattr(self.args, 'end') and self.args.end:
                    endtime = ts_to_dt(self.args.end)
                elif delay:
                    endtime = ts_now() - delay
                else:
                    endtime = ts_now()

                try:
                    num_matches = self.run_rule(rule, endtime, starttime)
                except EAException as e:
                    self.handle_error(
                        "Error running rule %s: %s" % (rule['name'], e),
                        {'rule': rule['name']})
                else:
                    old_starttime = pretty_ts(rule.get('original_starttime'),
                                              rule.get('use_local_time'))
                    logging.info(
                        "Ran %s from %s to %s: %s query hits, %s matches,"
                        " %s alerts sent" %
                        (rule['name'], old_starttime,
                         pretty_ts(endtime, rule.get('use_local_time')),
                         self.num_hits, num_matches, self.alerts_sent))
                    self.alerts_sent = 0

                self.remove_old_events(rule)

            if next_run < datetime.datetime.utcnow():
                # We were processing for longer than our refresh interval
                # This can happen if --start was specified with a large time period
                # or if we are running too slow to process events in real time.
                logging.warning("Querying from %s to %s took longer than %s!" %
                                (old_starttime, endtime, self.run_every))
                continue

            # Only force starttime once
            starttime = None

            if not self.args.pin_rules:
                self.load_rule_changes()

            # Wait before querying again
            sleep_for = (next_run - datetime.datetime.utcnow()).seconds
            logging.info("Sleeping for %s seconds" % (sleep_for))
            time.sleep(sleep_for)

    def generate_kibana_db(self, rule, match):
        ''' Uses a template dashboard to upload a temp dashboard showing the match.
        Returns the url to the dashboard. '''
        db = copy.deepcopy(kibana.dashboard_temp)

        # Set filters
        for filter in rule['filter']:
            if filter:
                kibana.add_filter(db, filter)
        kibana.set_included_fields(db, rule['include'])

        # Set index
        index = self.get_index(rule)
        kibana.set_index_name(db, index)

        return self.upload_dashboard(db, rule, match)

    def upload_dashboard(self, db, rule, match):
        ''' Uploads a dashboard schema to the kibana-int elasticsearch index associated with rule.
        Returns the url to the dashboard. '''
        # Set time range
        start = ts_add(match[rule['timestamp_field']],
                       -rule.get('timeframe', datetime.timedelta(minutes=10)))
        end = ts_add(match[rule['timestamp_field']],
                     datetime.timedelta(minutes=10))
        kibana.set_time(db, start, end)

        # Set dashboard name
        db_name = 'ElastAlert - %s - %s' % (rule['name'], end)
        kibana.set_name(db, db_name)

        # Add filter for query_key value
        if 'query_key' in rule:
            if rule['query_key'] in match:
                term = {'term': {rule['query_key']: match[rule['query_key']]}}
                kibana.add_filter(db, term)

        # Convert to json
        db_js = json.dumps(db)
        db_body = {
            'user': 'guest',
            'group': 'guest',
            'title': db_name,
            'dashboard': db_js
        }

        # Upload
        es = Elasticsearch(host=rule['es_host'], port=rule['es_port'])
        res = es.create(index='kibana-int', doc_type='temp', body=db_body)

        # Return dashboard URL
        kibana_url = rule.get('kibana_dashboard')
        if not kibana_url:
            kibana_url = 'http://%s:%s/_plugin/kibana/' % (rule['es_host'],
                                                           rule['es_port'])
        return kibana_url + '#/dashboard/temp/%s' % (res['_id'])

    def get_dashboard(self, rule, db_name):
        """ Download dashboard which matches use_kibana_dashboard from elasticsearch. """
        es = Elasticsearch(host=rule['es_host'], port=rule['es_port'])
        if not db_name:
            raise EAException("use_kibana_dashboard undefined")
        query = {'query': {'term': {'_id': db_name}}}
        try:
            res = es.search(index='kibana-int',
                            doc_type='dashboard',
                            body=query,
                            _source_include=['dashboard'])
        except ElasticsearchException as e:
            raise EAException("Error querying for dashboard: %s" % (e))

        if res['hits']['hits']:
            return json.loads(res['hits']['hits'][0]['_source']['dashboard'])
        else:
            raise EAException("Could not find dashboard named %s" % (db_name))

    def use_kibana_link(self, rule, match):
        """ Uploads an existing dashboard as a temp dashboard modified for match time.
        Returns the url to the dashboard. """
        # Download or get cached dashboard
        dashboard = rule.get('dashboard_schema')
        if not dashboard:
            db_name = rule.get('use_kibana_dashboard')
            dashboard = self.get_dashboard(rule, db_name)
        if dashboard:
            rule['dashboard_schema'] = dashboard
        else:
            return None
        dashboard = copy.deepcopy(dashboard)
        return self.upload_dashboard(dashboard, rule, match)

    def filters_from_kibana(self, rule, db_name):
        """ Downloads a dashboard from kibana and returns corresponding filters, None on error. """
        try:
            db = rule.get('dashboard_schema')
            if not db:
                db = self.get_dashboard(rule, db_name)
            filters = kibana.filters_from_dashboard(db)
        except EAException:
            return None
        return filters

    def alert(self, matches, rule, alert_time=None):
        """ Send out an alert.

        :param matches: A list of matches.
        :param rule: A rule configuration.
        :param alert_time: The timestamp to record for the alert; defaults to the current time.
        """
        if alert_time is None:
            alert_time = ts_now()

        # Compute top count keys
        if rule.get('top_count_keys'):
            for match in matches:
                if 'query_key' in rule and rule['query_key'] in match:
                    qk = match[rule['query_key']]
                else:
                    qk = None
                start = ts_to_dt(match[rule['timestamp_field']]) - rule.get(
                    'timeframe', datetime.timedelta(minutes=10))
                end = ts_to_dt(
                    match[rule['timestamp_field']]) + datetime.timedelta(
                        minutes=10)
                keys = rule.get('top_count_keys')
                counts = self.get_top_counts(rule, start, end, keys,
                                             rule.get('top_count_number'), qk)
                match.update(counts)

        # Generate a kibana dashboard for the first match
        if rule.get('generate_kibana_link') or rule.get(
                'use_kibana_dashboard'):
            try:
                if rule.get('generate_kibana_link'):
                    kb_link = self.generate_kibana_db(rule, matches[0])
                else:
                    kb_link = self.use_kibana_link(rule, matches[0])
            except EAException as e:
                self.handle_error(
                    "Could not generate kibana dash for %s match: %s" %
                    (rule['name'], e))
            else:
                if kb_link:
                    matches[0]['kibana_link'] = kb_link

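        # Run any configured match enhancements on every match before alerting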
        for enhancement in rule['match_enhancements']:
            for match in matches:
                try:
                    enhancement.process(match)
                except EAException as e:
                    self.handle_error(
                        "Error running match enhancement: %s" % (e),
                        {'rule': rule['name']})

        # Don't send real alerts in debug mode
        if self.debug:
            alerter = DebugAlerter(rule)
            alerter.alert(matches)
            return

        # Run the alerts
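        # Each configured alerter receives the full list of matches; a failing
        # alerter is logged but does not stop the others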
        alert_sent = False
        alert_exception = None
        for alert in rule['alert']:
            try:
                alert.alert(matches)
            except EAException as e:
                self.handle_error(
                    'Error while running alert %s: %s' %
                    (alert.get_info()['type'], e), {'rule': rule['name']})
                alert_exception = str(e)
            else:
                self.alerts_sent += 1
                alert_sent = True

        # Write the alert(s) to ES
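        # The first written alert's _id becomes the aggregate_id for the rest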
        agg_id = None
        for match in matches:
            alert_body = self.get_alert_body(match, rule, alert_sent,
                                             alert_time, alert_exception)
            # Set all matches to aggregate together
            if agg_id:
                alert_body['aggregate_id'] = agg_id
            res = self.writeback('elastalert', alert_body)
            if res and not agg_id:
                agg_id = res['_id']

    def get_alert_body(self,
                       match,
                       rule,
                       alert_sent,
                       alert_time,
                       alert_exception=None):
        body = {'match_body': match}
        body['rule_name'] = rule['name']
        # TODO record info about multiple alerts
        body['alert_info'] = rule['alert'][0].get_info()
        body['alert_sent'] = alert_sent
        body['alert_time'] = alert_time

        # If the alert failed to send, record the exception
        if not alert_sent:
            body['alert_exception'] = alert_exception
        return body

    def writeback(self, doc_type, body):
        # Convert any datetime objects to timestamps
        for key in body.keys():
            if isinstance(body[key], datetime.datetime):
                body[key] = dt_to_ts(body[key])
        if self.debug:
            logging.info("Skipping writing to ES: %s" % (body))
            return None

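        # Stamp the document with the current time if the caller did not provide one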
        if '@timestamp' not in body:
            body['@timestamp'] = dt_to_ts(ts_now())
        if self.writeback_es:
            try:
                res = self.writeback_es.create(index=self.writeback_index,
                                               doc_type=doc_type,
                                               body=body)
                return res
            except ElasticsearchException as e:
                logging.exception(
                    "Error writing alert info to elasticsearch: %s" % (e))
                self.writeback_es = None
        return None

    def find_recent_pending_alerts(self, time_limit):
        """ Queries writeback_es to find alerts that did not send
        and are newer than time_limit """
        query = {
            'query': {
                'query_string': {
                    'query': 'alert_sent:false'
                }
            },
            'filter': {
                'range': {
                    'alert_time': {
                        'from': dt_to_ts(ts_now() - time_limit),
                        'to': dt_to_ts(ts_now())
                    }
                }
            }
        }
        if self.writeback_es:
            try:
                res = self.writeback_es.search(index=self.writeback_index,
                                               doc_type='elastalert',
                                               body=query,
                                               size=1000)
                if res['hits']['hits']:
                    return res['hits']['hits']
            except ElasticsearchException:
                # Ignore query errors and fall through to return no pending alerts
                pass
        return []

    def send_pending_alerts(self):
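        """ Retries alerts from the writeback index that failed to send and whose
        alert_time has passed, then sends any in-memory aggregated alerts whose
        aggregation window has ended. """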
        pending_alerts = self.find_recent_pending_alerts(self.alert_time_limit)
        for alert in pending_alerts:
            _id = alert['_id']
            alert = alert['_source']
            try:
                rule_name = alert.pop('rule_name')
                alert_time = alert.pop('alert_time')
                match_body = alert.pop('match_body')
            except KeyError:
                # Malformed alert, drop it
                continue

            agg_id = alert.get('aggregate_id', None)
            if agg_id:
                # Aggregated alerts will be taken care of by get_aggregated_matches
                continue

            # Find original rule
            for rule in self.rules:
                if rule['name'] == rule_name:
                    break
            else:
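                # for/else: this branch runs only when the loop found no matching rule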
                # Original rule is missing, drop alert
                continue

            # Retry the alert unless it's a future alert
            if ts_now() > ts_to_dt(alert_time):
                aggregated_matches = self.get_aggregated_matches(_id)
                if aggregated_matches:
                    matches = [match_body] + [
                        agg_match['match_body']
                        for agg_match in aggregated_matches
                    ]
                    self.alert(matches, rule, alert_time=alert_time)
                    rule['current_aggregate_id'] = None
                else:
                    self.alert([match_body], rule, alert_time=alert_time)

                # Delete it from the index
                try:
                    self.writeback_es.delete(index=self.writeback_index,
                                             doc_type='elastalert',
                                             id=_id)
                except ElasticsearchException:
                    self.handle_error("Failed to delete alert %s at %s" %
                                      (_id, alert_time))

        # Send in memory aggregated alerts
        for rule in self.rules:
            if rule['agg_matches']:
                if ts_now() > rule['aggregate_alert_time']:
                    self.alert(rule['agg_matches'], rule)
                    rule['agg_matches'] = []

    def get_aggregated_matches(self, _id):
        """ Removes and returns all matches from writeback_es that have aggregate_id == _id """
        query = {
            'query': {
                'query_string': {
                    'query': 'aggregate_id:%s' % (_id)
                }
            }
        }
        matches = []
        if self.writeback_es:
            try:
                res = self.writeback_es.search(index=self.writeback_index,
                                               doc_type='elastalert',
                                               body=query)
                for match in res['hits']['hits']:
                    matches.append(match['_source'])
                    self.writeback_es.delete(index=self.writeback_index,
                                             doc_type='elastalert',
                                             id=match['_id'])
            except (KeyError, ElasticsearchException) as e:
                self.handle_error(
                    "Error fetching aggregated matches: %s" % (e), {'id': _id})
        return matches

    def add_aggregated_alert(self, match, rule):
        """ Save a match as a pending aggregate alert to elasticsearch. """
        if not rule['current_aggregate_id'] or rule[
                'aggregate_alert_time'] < ts_to_dt(
                    match[rule['timestamp_field']]):
            # First match, set alert_time
            match_time = ts_to_dt(match[rule['timestamp_field']])
            alert_time = match_time + rule['aggregation']
            rule['aggregate_alert_time'] = alert_time
            agg_id = None
        else:
            # Already pending aggregation, use existing alert_time
            alert_time = rule['aggregate_alert_time']
            agg_id = rule['current_aggregate_id']
            logging.info(
                'Adding alert for %s to aggregation, next alert at %s' %
                (rule['name'], alert_time))

        alert_body = self.get_alert_body(match, rule, False, alert_time)
        if agg_id:
            alert_body['aggregate_id'] = agg_id
        res = self.writeback('elastalert', alert_body)

        # If new aggregation, save _id
        if res and not agg_id:
            rule['current_aggregate_id'] = res['_id']

        # Couldn't write the match to ES, save it in memory for now
        if not res:
            rule['agg_matches'].append(match)

        return res

    def silence(self):
        """ Silence an alert for a period of time. --silence and --rule must be passed as args. """
        if not self.args.rule:
            logging.error('--silence must be used with --rule')
            exit(1)

        # With --rule, self.rules will only contain that specific rule
        rule_name = self.rules[0]['name']

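        # --silence takes the form <unit>=<number> (e.g. hours=4), mapping
        # directly onto datetime.timedelta keyword arguments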
        try:
            unit, num = self.args.silence.split('=')
            silence_time = datetime.timedelta(**{unit: int(num)})
            silence_ts = dt_to_ts(silence_time + datetime.datetime.utcnow())
        except (ValueError, TypeError):
            logging.error('%s is not a valid time period' %
                          (self.args.silence))
            exit(1)

        if not self.set_realert(rule_name, silence_ts):
            logging.error('Failed to save silence command to elasticsearch')
            exit(1)

        logging.info('Success. %s will be silenced until %s' %
                     (rule_name, silence_ts))

    def set_realert(self, rule_name, timestamp):
        """ Write a silence to elasticsearch for rule_name until timestamp. """
        body = {
            'rule_name': rule_name,
            '@timestamp': ts_now(),
            'until': timestamp
        }
        self.silence_cache[rule_name] = timestamp
        return self.writeback('silence', body)

    def is_silenced(self, rule_name):
        """ Checks if rule_name is currently silenced. Returns false on exception. """
        if rule_name in self.silence_cache:
            if ts_now() < ts_to_dt(self.silence_cache[rule_name]):
                return True
            else:
                self.silence_cache.pop(rule_name)
                return False

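        # Look up the most recent silence entry for this rule (sorted by 'until', newest first)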
        query = {
            'filter': {
                'term': {
                    'rule_name': rule_name
                }
            },
            'sort': {
                'until': {
                    'order': 'desc'
                }
            }
        }

        if self.writeback_es:
            try:
                res = self.writeback_es.search(index=self.writeback_index,
                                               doc_type='silence',
                                               size=1,
                                               body=query,
                                               _source_include=['until'])
            except ElasticsearchException as e:
                self.handle_error(
                    "Error while querying for alert silence status: %s" % (e),
                    {'rule': rule_name})

                return False

            if res['hits']['hits']:
                until_ts = res['hits']['hits'][0]['_source']['until']
                if ts_now() < ts_to_dt(until_ts):
                    self.silence_cache[rule_name] = until_ts
                    return True
        return False

    def handle_error(self, message, data=None):
        ''' Logs message at error level and writes message, data and traceback to Elasticsearch. '''
        if not self.writeback_es:
            self.writeback_es = Elasticsearch(host=self.es_host,
                                              port=self.es_port)
        logging.error(message)
        body = {'message': message}
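        # Include the formatted traceback (as a list of lines) and any extra context data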
        tb = traceback.format_exc()
        body['traceback'] = tb.strip().split('\n')
        if data:
            body['data'] = data
        self.writeback('elastalert_error', body)

    def get_top_counts(self,
                       rule,
                       starttime,
                       endtime,
                       keys,
                       number=5,
                       qk=None):
        """ Counts the number of events for each unique value for each key field.
        Returns a dictionary with top_events_<key> mapped to the top 5 counts for each key. """
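        # For each key, fetch term counts over the rule's index and keep the
        # `number` most frequent values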
        all_counts = {}
        for key in keys:
            index = self.get_index(rule, starttime, endtime)
            buckets = self.get_hits_terms(rule, starttime, endtime, index, key,
                                          qk).values()[0]
            # get_hits_terms adds to num_hits, but we don't want to count these
            self.num_hits -= len(buckets)
            terms = {}
            for bucket in buckets:
                terms[bucket['key']] = bucket['doc_count']
            counts = terms.items()
            counts.sort(key=lambda x: x[1], reverse=True)
            # Save a dict with the top `number` events for this key
            all_counts['top_events_%s' % (key)] = dict(counts[:number])
        return all_counts