Example 1
def event_stream(sketch_id, query):
    es = ElasticsearchDataStore(host=current_app.config[u'ELASTIC_HOST'],
                                port=current_app.config[u'ELASTIC_PORT'])
    sketch = Sketch.query.get(sketch_id)
    if not sketch:
        sys.exit('No such sketch')
    indices = {t.searchindex.index_name for t in sketch.timelines}

    result = es.search(sketch_id=sketch_id,
                       query_string=query,
                       query_filter={u'limit': 10000},
                       query_dsl={},
                       indices=[u'_all'],
                       return_fields=[u'xml_string', u'timestamp'],
                       enable_scroll=True)

    scroll_id = result[u'_scroll_id']
    scroll_size = result[u'hits'][u'total']

    for event in result[u'hits'][u'hits']:
        yield event

    while scroll_size > 0:
        result = es.client.scroll(scroll_id=scroll_id, scroll=u'1m')
        scroll_id = result[u'_scroll_id']
        scroll_size = len(result[u'hits'][u'hits'])
        for event in result[u'hits'][u'hits']:
            yield event
Example 2
def event_stream(sketch_id, query):
    es = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])
    sketch = Sketch.query.get(sketch_id)
    if not sketch:
        sys.exit('No such sketch')
    indices = {t.searchindex.index_name for t in sketch.timelines}

    result = es.search(
        sketch_id=sketch_id,
        query_string=query,
        query_filter={'size': 10000, 'terminate_after': 1000},
        query_dsl={},
        indices=['_all'],
        return_fields=['xml_string', 'timestamp'],
        enable_scroll=True)

    scroll_id = result['_scroll_id']
    scroll_size = result['hits']['total']

    for event in result['hits']['hits']:
        yield event

    while scroll_size > 0:
        result = es.client.scroll(scroll_id=scroll_id, scroll='1m')
        scroll_id = result['_scroll_id']
        scroll_size = len(result['hits']['hits'])
        for event in result['hits']['hits']:
            yield event
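
Both variants above page through large result sets with the Elasticsearch scroll API. As a rough, self-contained sketch of the same pattern using the plain elasticsearch-py client (the client URL, index name, and query body below are illustrative, not part of Timesketch):

from elasticsearch import Elasticsearch

def scroll_all_hits(es_client, index, body, page_size=1000):
    """Yield every hit for a query by following the scroll cursor."""
    result = es_client.search(
        index=index, body=body, size=page_size, scroll='1m')
    scroll_id = result['_scroll_id']
    hits = result['hits']['hits']
    while hits:
        for hit in hits:
            yield hit
        result = es_client.scroll(scroll_id=scroll_id, scroll='1m')
        scroll_id = result['_scroll_id']
        hits = result['hits']['hits']

# Example usage (hypothetical local cluster and index):
# client = Elasticsearch('http://localhost:9200')
# for hit in scroll_all_hits(client, 'my-index', {'query': {'match_all': {}}}):
#     print(hit['_source'])
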
Example 3
def export(sketch_id):
    """Generates CSV from search result.

    Args:
        sketch_id: Primary key for a sketch.
    Returns:
        CSV string with header.
    """
    sketch = Sketch.query.get_with_acl(sketch_id)
    view = sketch.get_user_view(current_user)
    query_filter = json.loads(view.query_filter)
    query_dsl = json.loads(view.query_dsl)
    indices = query_filter.get(u'indices', [])

    datastore = ElasticsearchDataStore(
        host=current_app.config[u'ELASTIC_HOST'],
        port=current_app.config[u'ELASTIC_PORT'])

    result = datastore.search(
        sketch_id, view.query_string, query_filter, query_dsl, indices,
        aggregations=None, return_results=True)

    csv_out = StringIO()
    csv_writer = csv.DictWriter(
        csv_out, fieldnames=[
            u'timestamp', u'message', u'timestamp_desc', u'datetime',
            u'timesketch_label', u'tag'])
    csv_writer.writeheader()
    for _event in result[u'hits'][u'hits']:
        csv_writer.writerow(
            dict((k, v.encode(u'utf-8') if isinstance(v, basestring) else v)
                 for k, v in _event[u'_source'].iteritems()))

    return csv_out.getvalue()
Example 4
def export(sketch_id):
    """Generates CSV from search result.

    Args:
        sketch_id: Primary key for a sketch.
    Returns:
        CSV string with header.
    """
    sketch = Sketch.query.get_with_acl(sketch_id)
    view = sketch.get_user_view(current_user)
    query_filter = json.loads(view.query_filter)
    query_dsl = json.loads(view.query_dsl)
    indices = query_filter.get('indices', [])

    # Export more than the first 500 results.
    max_events_to_fetch = 10000
    query_filter['terminate_after'] = max_events_to_fetch
    query_filter['size'] = max_events_to_fetch

    datastore = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    result = datastore.search(
        sketch_id,
        view.query_string,
        query_filter,
        query_dsl,
        indices,
        aggregations=None)

    all_fields = set()
    for event in result['hits']['hits']:
        all_fields.update(event['_source'].keys())

    all_fields.difference_update(DEFAULT_FIELDS)
    fieldnames = DEFAULT_FIELDS + sorted(all_fields)

    csv_out = StringIO()
    csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
    csv_writer.writeheader()
    for _event in result['hits']['hits']:
        sources = _event['_source']
        row = {}
        for key, value in iter(sources.items()):
            if isinstance(value, six.binary_type):
                value = codecs.decode(value, 'utf-8')
            row[key] = value
        row['_index'] = _event['_index']
        if isinstance(row['_index'], six.binary_type):
            row['_index'] = codecs.decode(row['_index'], 'utf-8')
        csv_writer.writerow(row)

    return csv_out.getvalue()
Example 5
def export(sketch_id):
    """Generates CSV from search result.

    Args:
        sketch_id: Primary key for a sketch.
    Returns:
        CSV string with header.
    """
    sketch = Sketch.query.get_with_acl(sketch_id)
    view = sketch.get_user_view(current_user)
    query_filter = json.loads(view.query_filter)
    query_dsl = json.loads(view.query_dsl)
    indices = query_filter.get('indices', [])

    # Export more than the first 500 results.
    max_events_to_fetch = 10000
    query_filter['terminate_after'] = max_events_to_fetch
    query_filter['size'] = max_events_to_fetch

    datastore = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    result = datastore.search(
        sketch_id,
        view.query_string,
        query_filter,
        query_dsl,
        indices,
        aggregations=None)

    all_fields = set()
    for event in result['hits']['hits']:
        all_fields.update(event['_source'].keys())

    all_fields.difference_update(DEFAULT_FIELDS)
    fieldnames = DEFAULT_FIELDS + sorted(all_fields)

    csv_out = StringIO()
    csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
    csv_writer.writeheader()
    for _event in result['hits']['hits']:
        sources = _event['_source']
        row = {}
        for key, value in iter(sources.items()):
            if isinstance(value, six.binary_type):
                value = codecs.decode(value, 'utf-8')
            row[key] = value
        row['_index'] = _event['_index']
        if isinstance(row['_index'], six.binary_type):
            row['_index'] = codecs.decode(row['_index'], 'utf-8')
        csv_writer.writerow(row)

    return csv_out.getvalue()
Example 6
    def __init__(self, index, data_type):
        """Initializes a similarity scorer.

        Args:
            index: Elasticsearch index name.
            data_type: Name of the data_type.
        """
        self._datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])
        self._config = SimilarityScorerConfig(index, data_type)
Example 7
    def __init__(self, index_name):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None
Example 8
def export(sketch_id):
    """Generates CSV from search result.

    Args:
        sketch_id: Primary key for a sketch.
    Returns:
        CSV string with header.
    """
    sketch = Sketch.query.get_with_acl(sketch_id)
    view = sketch.get_user_view(current_user)
    query_filter = json.loads(view.query_filter)
    query_dsl = json.loads(view.query_dsl)
    indices = query_filter.get(u'indices', [])

    # Export more than the first 500 results.
    max_events_to_fetch = 10000
    query_filter[u'limit'] = max_events_to_fetch

    datastore = ElasticsearchDataStore(
        host=current_app.config[u'ELASTIC_HOST'],
        port=current_app.config[u'ELASTIC_PORT'])

    result = datastore.search(
        sketch_id,
        view.query_string,
        query_filter,
        query_dsl,
        indices,
        aggregations=None)

    all_fields = set()
    for event in result[u'hits'][u'hits']:
        all_fields.update(event[u'_source'].keys())

    all_fields.difference_update(DEFAULT_FIELDS)
    fieldnames = DEFAULT_FIELDS + sorted(all_fields)

    csv_out = StringIO()
    csv_writer = csv.DictWriter(csv_out, fieldnames=fieldnames)
    csv_writer.writeheader()
    for _event in result[u'hits'][u'hits']:
        row = dict((k, v.encode(u'utf-8') if isinstance(v, basestring) else v)
                   for k, v in _event[u'_source'].iteritems())
        row[u'_index'] = _event[u'_index']
        if isinstance(row[u'_index'], basestring):
            row[u'_index'] = row[u'_index'].encode(u'utf-8')
        csv_writer.writerow(row)

    return csv_out.getvalue()
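
Each export variant in these examples builds the CSV in memory and returns it as a string. In a Flask application the result could be wrapped into a downloadable response roughly as follows (the view function and filename are illustrative, not taken from Timesketch):

from flask import Response

def export_as_response(sketch_id):
    """Wrap the CSV string from export() in a downloadable HTTP response."""
    csv_string = export(sketch_id)
    filename = 'sketch_{0:d}_export.csv'.format(sketch_id)
    return Response(
        csv_string,
        mimetype='text/csv',
        headers={'Content-Disposition': 'attachment; filename=' + filename})
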
Example 9
    def run(self, index_name):
        """Delete timeline in both Timesketch and Elasticsearch.

        Args:
            index_name: The name of the index in Elasticsearch
        """
        index_name = unicode(index_name.decode(encoding=u'utf-8'))
        searchindex = SearchIndex.query.filter_by(
            index_name=index_name).first()

        if not searchindex:
            sys.stdout.write(u'No such index\n')
            sys.exit()

        es = ElasticsearchDataStore(host=current_app.config[u'ELASTIC_HOST'],
                                    port=current_app.config[u'ELASTIC_PORT'])

        timelines = Timeline.query.filter_by(searchindex=searchindex).all()
        sketches = [
            t.sketch for t in timelines
            if t.sketch and t.sketch.get_status.status != u'deleted'
        ]
        if sketches:
            sys.stdout.write(u'WARNING: This timeline is in use by:\n')
            for sketch in sketches:
                sys.stdout.write(u' * {0:s}\n'.format(sketch.name))
                sys.stdout.flush()
        really_delete = prompt_bool(
            u'Are you sure you want to delete this timeline?')
        if really_delete:
            for timeline in timelines:
                db_session.delete(timeline)
            db_session.delete(searchindex)
            db_session.commit()
            es.client.indices.delete(index=index_name)
Example 10
    def __init__(self, sketch_id=None, indices=None, timeline_ids=None):
        """Initialize the aggregator object.

        Args:
            sketch_id: Sketch ID.
            indices: Optional list of elasticsearch index names. If not provided
                the default behavior is to include all the indices in a sketch.
            timeline_ids: Optional list of timeline IDs, if not provided the
                default behavior is to query all the data in the provided
                search indices.
        """
        if not sketch_id and not indices:
            raise RuntimeError('Need at least sketch_id or index')

        self.elastic = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        self._sketch_url = '/sketch/{0:d}/explore'.format(sketch_id)
        self.field = ''
        self.indices = indices
        self.sketch = SQLSketch.query.get(sketch_id)
        self.timeline_ids = None

        active_timelines = self.sketch.active_timelines
        if not self.indices:
            self.indices = [t.searchindex.index_name for t in active_timelines]

        if timeline_ids:
            valid_ids = [t.id for t in active_timelines]
            self.timeline_ids = [t for t in timeline_ids if t in valid_ids]
Example 11
    def __init__(self, sketch=None):
        """Initialize the graph object.

        Args:
            sketch (Sketch): Sketch object.

        Raises:
            KeyError: If the graph type specified is not supported.
        """
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])
        if not GRAPH_TYPES.get(self.GRAPH_TYPE):
            raise KeyError(f'Graph type {self.GRAPH_TYPE} is not supported')
        self.graph = Graph(self.GRAPH_TYPE)
        self.sketch = sketch
Example 12
    def datastore(self):
        """Property to get an instance of the datastore backend.

        Returns:
            Instance of timesketch.lib.datastores.elastic.ElasticSearchDatastore
        """
        return ElasticsearchDataStore(host=current_app.config[u'ELASTIC_HOST'],
                                      port=current_app.config[u'ELASTIC_PORT'])
Example 13
def run_csv_jsonl(source_file_path, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        source_file_path: Path to CSV or JSONL file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index.
    """
    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(source_file_path):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        es.flush_queued_events()

    except (ImportError, NameError, UnboundLocalError):
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error(error_msg)
        return None

    # Set status to ready when done
    _set_timeline_status(index_name, status='ready')

    return index_name
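
The docstring indicates this function is registered as a Celery task. Assuming that decorator is applied as in the Timesketch codebase, a caller would normally enqueue it asynchronously rather than invoke it directly, for example:

# Hypothetical call site; the arguments mirror the task signature above.
result = run_csv_jsonl.delay(
    source_file_path, timeline_name, index_name, source_type)
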
Example 14
    def __init__(self, index_name, timeline_id=None):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
            timeline_id: The timeline ID.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.timeline_id = timeline_id
        self.timeline_name = ''

        self.tagged_events = {}
        self.emoji_events = {}

        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None
Example 15
def run_csv_jsonl(source_file_path,
                  timeline_name,
                  index_name,
                  source_type,
                  delimiter=None,
                  username=None):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        source_file_path: Path to CSV or JSONL file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        delimiter: Character used as a field separator
        username: Username of the user who will own the timeline.

    Returns:
        Dictionary with count of processed events.
    """
    event_type = u'generic_event'  # Document type for Elasticsearch
    validators = {
        u'csv': read_and_validate_csv,
        u'jsonl': read_and_validate_jsonl
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logging.info(u'Index name: %s', index_name)
    logging.info(u'Timeline name: %s', timeline_name)
    logging.info(u'Source type: %s', source_type)
    logging.info(u'Document type: %s', event_type)
    logging.info(u'Owner: %s', username)

    es = ElasticsearchDataStore(host=current_app.config[u'ELASTIC_HOST'],
                                port=current_app.config[u'ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(source_file_path, delimiter):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        total_events = es.import_event(index_name, event_type)
    except Exception as e:
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status=u'fail', error_msg=error_msg)
        logging.error(error_msg)
        return

    # Set status to ready when done
    _set_timeline_status(index_name, status=u'ready')

    return {u'Events processed': total_events}
Example 16
    def __init__(self, index_name):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None
Example 17
def run_csv(source_file_path, timeline_name, index_name, username=None):
    """Create a Celery task for processing a CSV file.

    Args:
        source_file_path: Path to CSV file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        username: Username of the user who will own the timeline.

    Returns:
        Dictionary with count of processed events.
    """
    flush_interval = 1000  # events to queue before bulk index
    event_type = u'generic_event'  # Document type for Elasticsearch
    app = create_app()

    # Log information to Celery
    logging.info(u'Index name: %s', index_name)
    logging.info(u'Timeline name: %s', timeline_name)
    logging.info(u'Flush interval: %d', flush_interval)
    logging.info(u'Document type: %s', event_type)
    logging.info(u'Owner: %s', username)

    es = ElasticsearchDataStore(
        host=current_app.config[u'ELASTIC_HOST'],
        port=current_app.config[u'ELASTIC_PORT'])

    es.create_index(index_name=index_name, doc_type=event_type)
    for event in read_and_validate_csv(source_file_path):
        es.import_event(
            flush_interval, index_name, event_type, event)

    # Import the remaining events
    total_events = es.import_event(flush_interval, index_name, event_type)

    # We are done so let's remove the processing status flag
    with app.app_context():
        search_index = SearchIndex.query.filter_by(
            index_name=index_name).first()
        search_index.status.remove(search_index.status[0])
        db_session.add(search_index)
        db_session.commit()

    return {u'Events processed': total_events}
Example 18
    def run(self, name, index, username):
        """Create the SearchIndex."""
        es = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])
        user = User.query.filter_by(username=username).first()
        if not user:
            sys.stderr.write('User does not exist\n')
            sys.exit(1)
        if not es.client.indices.exists(index=index):
            sys.stderr.write('Index does not exist in the datastore\n')
            sys.exit(1)
        if SearchIndex.query.filter_by(name=name, index_name=index).first():
            sys.stderr.write(
                'Index with this name already exists in Timesketch\n')
            sys.exit(1)
        searchindex = SearchIndex(
            name=name, description=name, user=user, index_name=index)
        db_session.add(searchindex)
        db_session.commit()
        searchindex.grant_permission('read')
        sys.stdout.write('Search index {0:s} created\n'.format(name))
Example 19
    def __init__(self, sketch_id=None, index=None):
        """Initialize the aggregator object.

        Args:
            sketch_id: Sketch ID.
            index: List of elasticsearch index names.
        """
        if not sketch_id and not index:
            raise RuntimeError('Need at least sketch_id or index')

        self.elastic = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        self.field = ''
        self.index = index
        self.sketch = SQLSketch.query.get(sketch_id)
        self._sketch_url = '/sketch/{0:d}/explore'.format(sketch_id)

        if not self.index:
            active_timelines = self.sketch.active_timelines
            self.index = [t.searchindex.index_name for t in active_timelines]
Example 20
class BaseIndexAnalyzer(object):
    """Base class for analyzers.

    Attributes:
        name: Analyzer name.
        index_name: Name of the Elasticsearch index.
        datastore: Elasticsearch datastore client.
        sketch: Instance of Sketch object.
    """

    NAME = 'name'
    IS_SKETCH_ANALYZER = False

    # If this analyzer depends on another analyzer
    # it needs to be included in this frozenset by using
    # the indexer names.
    DEPENDENCIES = frozenset()

    def __init__(self, index_name):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None

    def event_stream(self, query_string, query_filter=None, query_dsl=None,
                     indices=None, return_fields=None):
        """Search ElasticSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing Elasticsearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError('Both query_string and query_dsl are missing')

        if not query_filter:
            query_filter = {'indices': self.index_name}

        # If not provided we default to the message field as this will always
        # be present.
        if not return_fields:
            return_fields = ['message']

        # Make sure we always return tag, human_readable and emoji attributes.
        return_fields.extend(['tag', 'human_readable', '__ts_emojis'])
        return_fields = list(set(return_fields))

        if not indices:
            indices = [self.index_name]

        # Refresh the index to make sure it is searchable.
        for index in indices:
            self.datastore.client.indices.refresh(index=index)

        event_generator = self.datastore.search_stream(
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            return_fields=return_fields
        )
        for event in event_generator:
            yield Event(event, self.datastore, sketch=self.sketch)

    @_flush_datastore_decorator
    def run_wrapper(self):
        """A wrapper method to run the analyzer.

        This method is decorated to flush the bulk insert operation on the
        datastore. This makes sure that all events are indexed at exit.

        Returns:
            Return value of the run method.
        """
        result = self.run()
        return result

    @classmethod
    def get_kwargs(cls):
        """Get keyword arguments needed to instantiate the class.

        Every analyzer gets the index_name as its first argument from Celery.
        By default this is the only argument. If your analyzer needs more
        arguments you can override this method and return them as a dictionary.

        If you want more than one instance to be created for your analyzer you
        can return a list of dictionaries with kwargs and each one will be
        instantiated and registered in Celery. This is neat if you want to run
        your analyzer with different arguments in parallel.

        Returns:
            List of keyword argument dicts or None if no extra arguments are
            needed.
        """
        return None

    def run(self):
        """Entry point for the analyzer."""
        raise NotImplementedError
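
The get_kwargs() hook documented above is what allows a single analyzer to be fanned out into several Celery instances. A hypothetical subclass (its name, NAME value, and domain files are invented for illustration) might look like this:

class HypotheticalDomainAnalyzer(BaseIndexAnalyzer):
    """Illustrative analyzer that runs once per configured domain list."""

    NAME = 'hypothetical_domain_analyzer'

    def __init__(self, index_name, domain_file=None):
        self.domain_file = domain_file
        super(HypotheticalDomainAnalyzer, self).__init__(index_name)

    @classmethod
    def get_kwargs(cls):
        # One dict per instance; each one is instantiated and registered
        # in Celery, so the lists are processed in parallel.
        return [{'domain_file': 'allowlist.txt'},
                {'domain_file': 'blocklist.txt'}]

    def run(self):
        return 'Checked domains from {0:s}'.format(self.domain_file)
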
Example 21
class SimilarityScorer(object):
    """Score events based on Jaccard distance."""
    def __init__(self, index, data_type):
        """Initializes a similarity scorer.

        Args:
            index: Elasticsearch index name.
            data_type: Name of the data_type.
        """
        self._datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])
        self._config = SimilarityScorerConfig(index, data_type)

    def _shingles_from_text(self, text):
        """Splits string into words.

        Args:
            text: String to extract words from.

        Returns:
            List of words.
        """
        # TODO: Remove stopwords using the NLTK python package.
        # TODO: Remove configured patterns from string.
        delimiters = self._config.delimiters
        return re.split('|'.join(delimiters), text)

    def _minhash_from_text(self, text):
        """Calculate minhash of text.

        Args:
            text: String to calculate minhash of.

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        minhash = MinHash(self._config.num_perm)
        for word in self._shingles_from_text(text):
            minhash.update(word.encode('utf8'))
        return minhash

    def _new_lsh_index(self):
        """Create a new LSH from a set of Timesketch events.

        Returns:
            A tuple with an LSH (instance of datasketch.lsh.LSH) and a
            dictionary with event ID as key and minhash as value.
        """
        minhashes = {}
        lsh = MinHashLSH(self._config.threshold, self._config.num_perm)

        # Event generator for streaming Elasticsearch results.
        events = self._datastore.search_stream(
            query_string=self._config.query,
            query_filter={},
            indices=[self._config.index],
            return_fields=[self._config.field])

        with lsh.insertion_session() as lsh_session:
            for event in events:
                event_id = event['_id']
                index_name = event['_index']
                event_type = event['_type']
                event_text = event['_source'][self._config.field]

                # Insert minhash in LSH index
                key = (event_id, event_type, index_name)
                minhash = self._minhash_from_text(event_text)
                minhashes[key] = minhash
                lsh_session.insert(key, minhash)

        return lsh, minhashes

    @staticmethod
    def _calculate_score(lsh, minhash, total_num_events):
        """Calculate a score based on Jaccard distance.

        The score is based on how many similar events exist for the event being
        scored. These similar events are called neighbours, and the score is the
        number of neighbours divided by the total number of events in the LSH.

        Args:
            lsh: Instance of datasketch.lsh.MinHashLSH
            minhash: Instance of datasketch.minhash.MinHash
            total_num_events: Integer of how many events in the LSH

        Returns:
            A float between 0 and 1.
        """
        neighbours = lsh.query(minhash)
        return float(len(neighbours)) / float(total_num_events)

    def _update_event(self, event_id, event_type, index_name, score):
        """Add a similarity_score attribute to the event in Elasticsearch.

        Args:
            event_id: ID of the Elasticsearch document.
            event_type: The Elasticsearch type of the event.
            index_name: The name of the index in Elasticsearch.
            score: A numerical similarity score with value between 0 and 1.
        """
        update_doc = {'similarity_score': score}
        self._datastore.import_event(index_name,
                                     event_type,
                                     event_id=event_id,
                                     event=update_doc)

    def run(self):
        """Entry point for a SimilarityScorer.

        Returns:
            A dict with metadata about the processed data set.
        """
        lsh, minhashes = self._new_lsh_index()
        total_num_events = len(minhashes)
        for key, minhash in minhashes.items():
            event_id, event_type, index_name = key
            score = self._calculate_score(lsh, minhash, total_num_events)
            self._update_event(event_id, event_type, index_name, score)

        return dict(index=self._config.index,
                    data_type=self._config.data_type,
                    num_events_processed=total_num_events)
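
The scorer builds on datasketch's MinHash and MinHashLSH. Stripped of the Timesketch plumbing, the core Jaccard-scoring idea can be sketched in a few lines (the sample events and the 0.5 threshold are invented for illustration):

from datasketch import MinHash, MinHashLSH

def minhash_from_text(text, num_perm=128):
    """Build a MinHash from the whitespace-separated words of a string."""
    minhash = MinHash(num_perm=num_perm)
    for word in text.split():
        minhash.update(word.encode('utf8'))
    return minhash

events = {
    'event-1': 'user alice logged in from 10.0.0.1',
    'event-2': 'user alice logged in from 10.0.0.2',
    'event-3': 'scheduled task created by bob',
}

lsh = MinHashLSH(threshold=0.5, num_perm=128)
minhashes = {key: minhash_from_text(text) for key, text in events.items()}
for key, minhash in minhashes.items():
    lsh.insert(key, minhash)

# Score = number of neighbours divided by total number of events.
total = len(minhashes)
for key, minhash in minhashes.items():
    score = len(lsh.query(minhash)) / float(total)
    print(key, score)
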
Example 22
class BaseGraphPlugin:
    """Base class for a graph.

    Attributes:
        datastore (ElasticsearchDataStore): Elasticsearch datastore object.
        graph (nx.Graph): NetworkX Graph object.
    """
    # Name that the graph will be registered as.
    NAME = 'name'

    # Display name (used in the UI)
    DISPLAY_NAME = 'display_name'

    # Description of the plugin (used in the UI)
    DESCRIPTION = 'description'

    # Type of graph. There are four supported types: Undirected Graph,
    # Undirected Multi Graph, Directed Graph, Directed Multi Graph.
    # If you have multiple edges between nodes you need to use a multi graph.
    #
    # See NetworkX documentation for details:
    # https://networkx.org/documentation/stable/reference/classes/index.html
    GRAPH_TYPE = 'MultiDiGraph'

    def __init__(self, sketch=None):
        """Initialize the graph object.

        Args:
            sketch (Sketch): Sketch object.

        Raises:
            KeyError: If the graph type specified is not supported.
        """
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])
        if not GRAPH_TYPES.get(self.GRAPH_TYPE):
            raise KeyError(f'Graph type {self.GRAPH_TYPE} is not supported')
        self.graph = Graph(self.GRAPH_TYPE)
        self.sketch = sketch

    def _get_all_sketch_indices(self):
        """List all indices in the Sketch.
        Returns:
            List of index names.
        """
        active_timelines = self.sketch.active_timelines
        indices = [t.searchindex.index_name for t in active_timelines]
        return indices

    # TODO: Refactor this to reuse across analyzers and graphs.
    def event_stream(
            self, query_string=None, query_filter=None, query_dsl=None,
            indices=None, return_fields=None, scroll=True):
        """Search ElasticSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing Elasticsearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.
            scroll: Boolean determining whether we support scrolling searches
                or not. Defaults to True.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError('Both query_string and query_dsl are missing')

        # Query all sketch indices if none are specified.
        if not indices:
            indices = self._get_all_sketch_indices()

        if not query_filter:
            query_filter = {}

        return_fields = list(set(return_fields))

        event_generator = self.datastore.search_stream(
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            return_fields=return_fields,
            enable_scroll=scroll,
        )
        return event_generator

    def generate(self):
        """Entry point for the graph."""
        raise NotImplementedError
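
GRAPH_TYPE maps to a NetworkX graph class; the Graph wrapper itself is Timesketch-specific, but the underlying idea of turning events into nodes and edges can be sketched with plain networkx (the event data and attribute names below are invented for illustration):

import networkx as nx

graph = nx.MultiDiGraph()

# Hypothetical logon events: one edge per event, so a multi graph is needed.
events = [
    {'username': 'alice', 'hostname': 'server1'},
    {'username': 'alice', 'hostname': 'server1'},
    {'username': 'bob', 'hostname': 'server1'},
]
for event in events:
    graph.add_node(event['username'], type='user')
    graph.add_node(event['hostname'], type='host')
    graph.add_edge(event['username'], event['hostname'], label='logon')

print(graph.number_of_nodes(), graph.number_of_edges())  # 3 nodes, 3 edges
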
Example 23
    def __init__(self):
        """Initialize the data fetcher."""
        super(ApiDataFetcher, self).__init__()
        self._datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])
Example 24
class BaseIndexAnalyzer(object):
    """Base class for analyzers.

    Attributes:
        name: Analyzer name.
        index_name: Name of the Elasticsearch index.
        datastore: Elasticsearch datastore client.
        sketch: Instance of Sketch object.
    """

    NAME = 'name'
    IS_SKETCH_ANALYZER = False

    # If this analyzer depends on another analyzer
    # it needs to be included in this frozenset by using
    # the indexer names.
    DEPENDENCIES = frozenset()

    def __init__(self, index_name):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None

    def event_stream(
            self, query_string=None, query_filter=None, query_dsl=None,
            indices=None, return_fields=None):
        """Search ElasticSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing Elasticsearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError('Both query_string and query_dsl are missing')

        if not query_filter:
            query_filter = {'indices': self.index_name}

        # If not provided we default to the message field as this will always
        # be present.
        if not return_fields:
            return_fields = ['message']

        # Make sure we always return tag, human_readable and emoji attributes.
        return_fields.extend(['tag', 'human_readable', '__ts_emojis'])
        return_fields = list(set(return_fields))

        if not indices:
            indices = [self.index_name]

        # Refresh the index to make sure it is searchable.
        for index in indices:
            self.datastore.client.indices.refresh(index=index)

        event_generator = self.datastore.search_stream(
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            return_fields=return_fields
        )
        for event in event_generator:
            yield Event(event, self.datastore, sketch=self.sketch)

    @_flush_datastore_decorator
    def run_wrapper(self, analysis_id):
        """A wrapper method to run the analyzer.

        This method is decorated to flush the bulk insert operation on the
        datastore. This makes sure that all events are indexed at exit.

        Returns:
            Return value of the run method.
        """
        analysis = Analysis.query.get(analysis_id)
        analysis.set_status('STARTED')

        # Run the analyzer
        result = self.run()

        # Update database analysis object with result and status
        analysis.result = '{0:s}'.format(result)
        analysis.set_status('DONE')
        db_session.add(analysis)
        db_session.commit()

        return result

    def run(self):
        """Entry point for the analyzer."""
        raise NotImplementedError
Example 25
class BaseAnalyzer:
    """Base class for analyzers.

    Attributes:
        name: Analyzer name.
        index_name: Name of the Elasticsearch index.
        datastore: Elasticsearch datastore client.
        sketch: Instance of Sketch object.
        timeline_id: The ID of the timeline the analyzer runs on.
        tagged_events: Dict with all events to add tags and those tags.
        emoji_events: Dict with all events to add emojis and those emojis.
    """

    NAME = 'name'
    DISPLAY_NAME = None
    DESCRIPTION = None

    # If this analyzer depends on another analyzer
    # it needs to be included in this frozenset by using
    # the indexer names.
    DEPENDENCIES = frozenset()

    # Used as hints to the frontend UI in order to render input forms.
    FORM_FIELDS = []

    # Configure how long an analyzer should wait for the timeline to be
    # fully indexed before giving up.
    SECONDS_PER_WAIT = 10
    MAXIMUM_WAITS = 360

    def __init__(self, index_name, sketch_id, timeline_id=None):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
            sketch_id: Sketch ID.
            timeline_id: The timeline ID.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.sketch = Sketch(sketch_id=sketch_id)
        self.timeline_id = timeline_id
        self.timeline_name = ''

        self.tagged_events = {}
        self.emoji_events = {}

        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None

    def event_pandas(self,
                     query_string=None,
                     query_filter=None,
                     query_dsl=None,
                     indices=None,
                     return_fields=None):
        """Search ElasticSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing Elasticsearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to be included in the search results.
                If not provided, all fields will be included in the results.

        Returns:
            A python pandas object with all the events.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError('Both query_string and query_dsl are missing')

        if not query_filter:
            query_filter = {'indices': self.index_name, 'size': 10000}

        if not indices:
            indices = [self.index_name]

        if self.timeline_id:
            timeline_ids = [self.timeline_id]
        else:
            timeline_ids = None

        # Refresh the index to make sure it is searchable.
        for index in indices:
            try:
                self.datastore.client.indices.refresh(index=index)
            except elasticsearch.NotFoundError:
                logger.error('Unable to refresh index: {0:s}, not found, '
                             'removing from list.'.format(index))
                broken_index = indices.index(index)
                _ = indices.pop(broken_index)

        if not indices:
            raise ValueError('Unable to get events, no indices to query.')

        if return_fields:
            default_fields = definitions.DEFAULT_SOURCE_FIELDS
            return_fields.extend(default_fields)
            return_fields = list(set(return_fields))
            return_fields = ','.join(return_fields)

        results = self.datastore.search_stream(
            sketch_id=self.sketch.id,
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            timeline_ids=timeline_ids,
            return_fields=return_fields,
        )

        events = []
        for event in results:
            source = event.get('_source')
            source['_id'] = event.get('_id')
            source['_type'] = event.get('_type')
            source['_index'] = event.get('_index')
            events.append(source)

        return pandas.DataFrame(events)

    def event_stream(self,
                     query_string=None,
                     query_filter=None,
                     query_dsl=None,
                     indices=None,
                     return_fields=None,
                     scroll=True):
        """Search ElasticSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing Elasticsearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.
            scroll: Boolean determining whether we support scrolling searches
                or not. Defaults to True.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError('Both query_string and query_dsl are missing')

        if not query_filter:
            query_filter = {'indices': self.index_name}

        # If not provided we default to the message field as this will always
        # be present.
        if not return_fields:
            return_fields = ['message']

        # Make sure we always return tag, human_readable and emoji attributes.
        return_fields.extend(['tag', 'human_readable', '__ts_emojis'])
        return_fields = list(set(return_fields))

        if not indices:
            indices = [self.index_name]

        # Refresh the index to make sure it is searchable.
        for index in indices:
            try:
                self.datastore.client.indices.refresh(index=index)
            except elasticsearch.NotFoundError:
                logger.error('Unable to find index: {0:s}, removing from '
                             'result set.'.format(index))
                broken_index = indices.index(index)
                _ = indices.pop(broken_index)
        if not indices:
            raise ValueError(
                'Unable to query for analyzers, discovered no index to query.')

        if self.timeline_id:
            timeline_ids = [self.timeline_id]
        else:
            timeline_ids = None

        # Exponential backoff for the call to Elasticsearch. Sometimes the
        # cluster can be a bit overloaded and timeout on requests. We want to
        # retry a few times in order to give the cluster a chance to return
        # results.
        backoff_in_seconds = 3
        retries = 5
        for x in range(0, retries):
            try:
                event_generator = self.datastore.search_stream(
                    query_string=query_string,
                    query_filter=query_filter,
                    query_dsl=query_dsl,
                    indices=indices,
                    return_fields=return_fields,
                    enable_scroll=scroll,
                    timeline_ids=timeline_ids)
                for event in event_generator:
                    yield Event(event,
                                self.datastore,
                                sketch=self.sketch,
                                analyzer=self)
                break  # Query was successful
            except elasticsearch.TransportError as e:
                sleep_seconds = (backoff_in_seconds * 2**x +
                                 random.uniform(3, 7))
                logger.info(
                    'Attempt: {0:d}/{1:d} sleeping {2:f} for query {3:s}'.
                    format(x + 1, retries, sleep_seconds, query_string))
                time.sleep(sleep_seconds)

                if x == retries - 1:
                    logger.error(
                        'Timeout executing search for {0:s}: {1!s}'.format(
                            query_string, e),
                        exc_info=True)
                    raise

    @_flush_datastore_decorator
    def run_wrapper(self, analysis_id):
        """A wrapper method to run the analyzer.

        This method is decorated to flush the bulk insert operation on the
        datastore. This makes sure that all events are indexed at exit.

        Returns:
            Return value of the run method.
        """
        analysis = Analysis.query.get(analysis_id)
        analysis.set_status('STARTED')

        timeline = analysis.timeline
        self.timeline_name = timeline.name
        searchindex = timeline.searchindex

        counter = 0
        while True:
            status = searchindex.get_status.status
            status = status.lower()
            if status == 'ready':
                break

            if status == 'fail':
                logger.error(
                    'Unable to run analyzer on a failed index ({0:s})'.format(
                        searchindex.index_name))
                return 'Failed'

            time.sleep(self.SECONDS_PER_WAIT)
            counter += 1
            if counter >= self.MAXIMUM_WAITS:
                logger.error(
                    'Indexing has taken too long, aborting run of '
                    'analyzer')
                return 'Failed'
            # Refresh the searchindex object.
            db_session.refresh(searchindex)

        # Run the analyzer. Broad Exception catch to catch any error and store
        # the error in the DB for display in the UI.
        try:
            result = self.run()
            analysis.set_status('DONE')
        except Exception:  # pylint: disable=broad-except
            analysis.set_status('ERROR')
            result = traceback.format_exc()

        # Update database analysis object with result and status
        analysis.result = '{0:s}'.format(result)
        db_session.add(analysis)
        db_session.commit()

        return result

    @classmethod
    def get_kwargs(cls):
        """Get keyword arguments needed to instantiate the class.
        Every analyzer gets the index_name as its first argument from Celery.
        By default this is the only argument. If your analyzer needs more
        arguments you can override this method and return them as a dictionary.

        If you want more than one instance to be created for your analyzer you
        can return a list of dictionaries with kwargs and each one will be
        instantiated and registered in Celery. This is neat if you want to run
        your analyzer with different arguments in parallel.

        Returns:
            List of keyword argument dicts or empty list if no extra arguments
            are needed.
        """
        return []

    def run(self):
        """Entry point for the analyzer."""
        raise NotImplementedError
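
The retry loop in event_stream() above implements exponential backoff with random jitter before giving up on Elasticsearch. The same pattern, pulled out into a generic helper purely for illustration (not part of Timesketch):

import random
import time

def call_with_backoff(func, retries=5, backoff_in_seconds=3):
    """Call func(), retrying with exponential backoff and jitter on failure."""
    for attempt in range(retries):
        try:
            return func()
        except Exception:  # in practice, catch the specific transport error
            if attempt == retries - 1:
                raise
            sleep_seconds = backoff_in_seconds * 2 ** attempt + random.uniform(3, 7)
            time.sleep(sleep_seconds)
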
Example 26
class BaseIndexAnalyzer(object):
    """Base class for analyzers.

    Attributes:
        name: Analyzer name.
        index_name: Name of the Elasticsearch index.
        datastore: Elasticsearch datastore client.
        sketch: Instance of Sketch object.
    """

    NAME = 'name'
    IS_SKETCH_ANALYZER = False

    # If this analyzer depends on another analyzer
    # it needs to be included in this frozenset by using
    # the indexer names.
    DEPENDENCIES = frozenset()

    def __init__(self, index_name):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None

    def event_stream(
            self, query_string=None, query_filter=None, query_dsl=None,
            indices=None, return_fields=None):
        """Search ElasticSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing Elasticsearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError('Both query_string and query_dsl are missing')

        if not query_filter:
            query_filter = {'indices': self.index_name}

        # If not provided we default to the message field as this will always
        # be present.
        if not return_fields:
            return_fields = ['message']

        # Make sure we always return tag, human_readable and emoji attributes.
        return_fields.extend(['tag', 'human_readable', '__ts_emojis'])
        return_fields = list(set(return_fields))

        if not indices:
            indices = [self.index_name]

        # Refresh the index to make sure it is searchable.
        for index in indices:
            self.datastore.client.indices.refresh(index=index)

        event_generator = self.datastore.search_stream(
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            return_fields=return_fields
        )
        for event in event_generator:
            yield Event(event, self.datastore, sketch=self.sketch)

    @_flush_datastore_decorator
    def run_wrapper(self):
        """A wrapper method to run the analyzer.

        This method is decorated to flush the bulk insert operation on the
        datastore. This makes sure that all events are indexed at exit.

        Returns:
            Return value of the run method.
        """
        result = self.run()

        # Update the searchindex description with analyzer result.
        # TODO: Don't overload the description field.
        searchindex = SearchIndex.query.filter_by(
            index_name=self.index_name).first()

        # Some code paths set the description equal to the name. Remove that
        # here to get a clean description with only analyzer results.
        if searchindex.description == searchindex.name:
            searchindex.description = ''

        # Append the analyzer result.
        if result:
            searchindex.description = '{0:s}\n{1:s}'.format(
                searchindex.description, result)
        db_session.add(searchindex)
        db_session.commit()

        return result

    @classmethod
    def get_kwargs(cls):
        """Get keyword arguments needed to instantiate the class.

        Every analyzer gets the index_name as its first argument from Celery.
        By default this is the only argument. If your analyzer needs more
        arguments you can override this method and return them as a dictionary.

        If you want more than one instance to be created for your analyzer you
        can return a list of dictionaries with kwargs and each one will be
        instantiated and registered in Celery. This is neat if you want to run
        your analyzer with different arguments in parallel.

        Returns:
            List of keyword argument dicts or None if no extra arguments are
            needed.
        """
        return None

    def run(self):
        """Entry point for the analyzer."""
        raise NotImplementedError
Example 27
class BaseIndexAnalyzer(object):
    """Base class for analyzers.

    Attributes:
        name: Analyzer name.
        index_name: Name of the Elasticsearch index.
        datastore: Elasticsearch datastore client.
        sketch: Instance of Sketch object.
    """

    NAME = 'name'
    IS_SKETCH_ANALYZER = False

    # If this analyzer depends on another analyzer
    # it needs to be included in this frozenset by using
    # the indexer names.
    DEPENDENCIES = frozenset()

    # Used as hints to the frontend UI in order to render input forms.
    FORM_FIELDS = []

    # Configure how long an analyzer should wait for the timeline to be
    # fully indexed before giving up.
    SECONDS_PER_WAIT = 10
    MAXIMUM_WAITS = 360

    def __init__(self, index_name):
        """Initialize the analyzer object.

        Args:
            index_name: Elasticsearch index name.
        """
        self.name = self.NAME
        self.index_name = index_name
        self.datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

        if not hasattr(self, 'sketch'):
            self.sketch = None

    def event_stream(self,
                     query_string=None,
                     query_filter=None,
                     query_dsl=None,
                     indices=None,
                     return_fields=None):
        """Search ElasticSearch.

        Args:
            query_string: Query string.
            query_filter: Dictionary containing filters to apply.
            query_dsl: Dictionary containing Elasticsearch DSL query.
            indices: List of indices to query.
            return_fields: List of fields to return.

        Returns:
            Generator of Event objects.

        Raises:
            ValueError: if neither query_string nor query_dsl is provided.
        """
        if not (query_string or query_dsl):
            raise ValueError('Both query_string and query_dsl are missing')

        if not query_filter:
            query_filter = {'indices': self.index_name}

        # If not provided we default to the message field as this will always
        # be present.
        if not return_fields:
            return_fields = ['message']

        # Make sure we always return tag, human_readable and emoji attributes.
        return_fields.extend(['tag', 'human_readable', '__ts_emojis'])
        return_fields = list(set(return_fields))

        if not indices:
            indices = [self.index_name]

        # Refresh the index to make sure it is searchable.
        for index in indices:
            self.datastore.client.indices.refresh(index=index)

        event_generator = self.datastore.search_stream(
            query_string=query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=indices,
            return_fields=return_fields)
        for event in event_generator:
            yield Event(event, self.datastore, sketch=self.sketch)

    @_flush_datastore_decorator
    def run_wrapper(self, analysis_id):
        """A wrapper method to run the analyzer.

        This method is decorated to flush the bulk insert operation on the
        datastore. This makes sure that all events are indexed at exit.

        Returns:
            Return value of the run method.
        """
        analysis = Analysis.query.get(analysis_id)
        analysis.set_status('STARTED')

        timeline = analysis.timeline
        searchindex = timeline.searchindex

        counter = 0
        while True:
            status = searchindex.get_status.status
            status = status.lower()
            if status == 'ready':
                break

            if status == 'fail':
                logging.error(
                    'Unable to run analyzer on a failed index ({0:s})'.format(
                        searchindex.index_name))
                return 'Failed'

            time.sleep(self.SECONDS_PER_WAIT)
            counter += 1
            if counter >= self.MAXIMUM_WAITS:
                logging.error(
                    'Indexing has taken too long, aborting the analyzer '
                    'run')
                return 'Failed'
            # Refresh the searchindex object.
            db_session.refresh(searchindex)

        # Run the analyzer. Broad Exception catch to catch any error and store
        # the error in the DB for display in the UI.
        try:
            result = self.run()
            analysis.set_status('DONE')
        except Exception:  # pylint: disable=broad-except
            analysis.set_status('ERROR')
            result = traceback.format_exc()

        # Update database analysis object with result and status
        analysis.result = '{0:s}'.format(result)
        db_session.add(analysis)
        db_session.commit()

        return result

    def run(self):
        """Entry point for the analyzer."""
        raise NotImplementedError
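
The run() method above is the extension point. A concrete analyzer typically iterates event_stream() and annotates the events it gets back. The sketch below only illustrates that pattern; it is not taken from the listing: the base class name (BaseIndexAnalyzer), the query string, and the Event methods add_tags() and commit() are assumptions about the surrounding codebase.

# Minimal analyzer sketch, assuming the analyzer base class shown above is
# named BaseIndexAnalyzer and that Event objects expose add_tags() and
# commit(); neither is confirmed by this listing.
class FailedLoginAnalyzer(BaseIndexAnalyzer):
    """Hypothetical analyzer that tags failed Windows logon events."""

    NAME = 'failed_login'

    def run(self):
        # Example query string; adjust to the data in the index.
        events = self.event_stream(
            query_string='event_identifier:4625',
            return_fields=['message'])

        counter = 0
        for event in events:
            event.add_tags(['failed-login'])  # assumed Event API
            event.commit()                    # assumed Event API
            counter += 1

        return 'Tagged {0:d} failed logon events.'.format(counter)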
Example No. 28
def run_csv_jsonl(file_path, events, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the CSV or JSONL file.
        events: A string with the events.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = 'jsonl'
    else:
        file_handle = codecs.open(file_path,
                                  'r',
                                  encoding='utf-8',
                                  errors='replace')

    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(host=current_app.config['ELASTIC_HOST'],
                                port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(file_handle):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        es.flush_queued_events()

    except errors.DataIngestionError as e:
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    # Set status to ready when done
    _set_timeline_status(index_name, status='ready')

    return index_name
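
run_csv_jsonl dispatches to one of two validator generators keyed by source_type, each expected to yield one event dict per input row. The stand-in below only illustrates that assumed generator shape; the real read_and_validate_csv (with its timestamp normalization and error handling) is not part of this listing, and the required column names are an assumption.

import csv

def read_and_validate_csv_sketch(file_handle):
    """Illustrative stand-in for read_and_validate_csv.

    Yields one event dict per CSV row and rejects rows that lack the
    columns assumed to be required; the real validator is richer.
    """
    required_fields = {'message', 'datetime', 'timestamp_desc'}
    reader = csv.DictReader(file_handle)
    for row in reader:
        missing = required_fields - set(row)
        if missing:
            raise RuntimeError(
                'Missing required CSV fields: {0:s}'.format(
                    ', '.join(sorted(missing))))
        yield row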
Example No. 29
def run_plaso(file_path, events, timeline_name, index_name, source_type,
              timeline_id):
    """Create a Celery task for processing Plaso storage file.

    Args:
        file_path: Path to the plaso file on disk.
        events: String with event data; must be empty for Plaso files.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of the source file, in this case plaso.
        timeline_id: ID of the timeline object this data belongs to.

    Raises:
        RuntimeError: If the function is called with events, if Plaso is
            not installed, or if the Plaso version is unsupported.
    Returns:
        Name (str) of the index.
    """
    if not plaso:
        raise RuntimeError(
            'Plaso isn\'t installed, unable to continue processing plaso '
            'files.')

    plaso_version = int(plaso.__version__)
    if plaso_version <= PLASO_MINIMUM_VERSION:
        raise RuntimeError(
            'Plaso version is out of date (version {0:d}), please upgrade '
            'to a version that is later than {1:d}'.format(
                plaso_version, PLASO_MINIMUM_VERSION))

    if events:
        raise RuntimeError('Plaso uploads need a file, not events.')

    event_type = 'generic_event'  # Document type for Elasticsearch

    mappings = None
    mappings_file_path = current_app.config.get('PLASO_MAPPING_FILE', '')
    if os.path.isfile(mappings_file_path):
        try:
            with open(mappings_file_path, 'r') as mfh:
                mappings = json.load(mfh)

                if not isinstance(mappings, dict):
                    raise RuntimeError(
                        'Unable to create mappings, the mappings are not a '
                        'dict, please look at the file: {0:s}'.format(
                            mappings_file_path))
        except (json.JSONDecodeError, IOError):
            logger.error('Unable to read in mapping', exc_info=True)

    elastic_server = current_app.config.get('ELASTIC_HOST')
    if not elastic_server:
        raise RuntimeError(
            'Unable to connect to Elastic, no server set, unable to '
            'process plaso file.')
    elastic_port = current_app.config.get('ELASTIC_PORT')
    if not elastic_port:
        raise RuntimeError(
            'Unable to connect to Elastic, no port set, unable to '
            'process plaso file.')

    es = ElasticsearchDataStore(host=elastic_server, port=elastic_port)

    try:
        es.create_index(index_name=index_name,
                        doc_type=event_type,
                        mappings=mappings)
    except errors.DataIngestionError as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(timeline_id, status='fail', error_msg=error_msg)
        logger.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        return None

    message = 'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'
    logger.info(message.format(timeline_name, index_name, source_type))

    psort_path = current_app.config.get('PSORT_PATH', 'psort.py')

    cmd = [
        psort_path,
        '-o',
        'elastic_ts',
        file_path,
        '--server',
        elastic_server,
        '--port',
        str(elastic_port),
        '--status_view',
        'none',
        '--index_name',
        index_name,
    ]

    if mappings_file_path:
        cmd.extend(['--elastic_mappings', mappings_file_path])

    if timeline_id:
        cmd.extend(['--timeline_identifier', str(timeline_id)])

    # Run psort.py
    try:
        subprocess.check_output(cmd,
                                stderr=subprocess.STDOUT,
                                encoding='utf-8')
    except subprocess.CalledProcessError as e:
        # Mark the searchindex and timelines as failed and exit the task
        _set_timeline_status(timeline_id, status='fail', error_msg=e.output)
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        return e.output

    # Mark the searchindex and timelines as ready
    _set_timeline_status(timeline_id, status='ready')

    return index_name
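
run_plaso calls a _close_index helper on every failure path so that a failed import does not leave the index open. The helper itself is not part of this listing; the sketch below is a hypothetical reconstruction of its assumed intent, using the elasticsearch-py indices.close() call.

import logging

from elasticsearch.exceptions import NotFoundError

logger = logging.getLogger(__name__)

def _close_index_sketch(index_name, data_store, timeline_id):
    """Hypothetical sketch of the _close_index helper referenced above.

    Only closes the Elasticsearch index; the real helper presumably also
    updates the timeline/searchindex rows identified by timeline_id.
    """
    del timeline_id  # unused in this sketch
    try:
        data_store.client.indices.close(index=index_name)
    except NotFoundError:
        logger.error(
            'Unable to close index [{0:s}]: index not found'.format(
                index_name))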
Example No. 30
class ApiDataFetcher(interface.DataFetcher):
    """Data Fetcher for an API story exporter."""

    def __init__(self):
        """Initialize the data fetcher."""
        super(ApiDataFetcher, self).__init__()
        self._datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

    def get_aggregation(self, agg_dict):
        """Returns an aggregation object from an aggregation dict.

        Args:
            agg_dict (dict): a dictionary containing information
                about the stored aggregation.

        Returns:
            An aggregation object (instance of AggregationResult) from a
            saved aggregation or None if not found.
        """
        aggregation_id = agg_dict.get('id')
        if not aggregation_id:
            return None

        aggregation = Aggregation.query.get(aggregation_id)
        if not aggregation:
            return None

        try:
            agg_class = aggregator_manager.AggregatorManager.get_aggregator(
                aggregation.agg_type)
        except KeyError:
            return None

        if not agg_class:
            return None
        aggregator = agg_class(sketch_id=self._sketch_id)
        parameter_string = aggregation.parameters
        parameters = json.loads(parameter_string)
        return aggregator.run(**parameters)

    def get_view(self, view_dict):
        """Returns a data frame from a view dict.

        Args:
            view_dict (dict): a dictionary containing information
                about the stored view.

        Returns:
            A pandas DataFrame with the results from a view aggregation.
        """
        view_id = view_dict.get('id')
        if not view_id:
            return pd.DataFrame()

        view = View.query.get(view_id)
        if not view:
            return pd.DataFrame()

        if not view.query_string and not view.query_dsl:
            return pd.DataFrame()

        query_filter = view.query_filter
        if query_filter and isinstance(query_filter, str):
            query_filter = json.loads(query_filter)
        elif not query_filter:
            query_filter = {'indices': '_all', 'size': 100}

        if view.query_dsl:
            query_dsl = json.loads(view.query_dsl)
        else:
            query_dsl = None

        sketch = Sketch.query.get_with_acl(self._sketch_id)
        sketch_indices = [
            t.searchindex.index_name
            for t in sketch.active_timelines
        ]

        results = self._datastore.search_stream(
            sketch_id=self._sketch_id,
            query_string=view.query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=sketch_indices,
        )
        result_list = [x.get('_source') for x in results]
        return pd.DataFrame(result_list)
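
A minimal usage sketch of the fetcher above: assuming an active Flask application context and that the surrounding exporter framework has already set _sketch_id (how that happens is outside this listing), get_view() turns a saved view into a DataFrame. The sketch and view ids below are made up.

# Usage sketch: export the events behind a saved view as a DataFrame.
# Assumes an active Flask app context; the ids are made up and
# _sketch_id is normally set by the exporter framework, not by hand.
fetcher = ApiDataFetcher()
fetcher._sketch_id = 3
frame = fetcher.get_view({'id': 12})
print('{0:d} events exported'.format(len(frame)))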
Example No. 31
def run_csv_jsonl(file_path, events, timeline_name, index_name, source_type,
                  timeline_id):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the CSV or JSONL file.
        events: A string with the events.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        timeline_id: ID of the timeline object this data belongs to.

    Returns:
        Name (str) of the index.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = 'jsonl'
    else:
        file_handle = codecs.open(file_path,
                                  'r',
                                  encoding='utf-8',
                                  errors='replace')

    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logger.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(host=current_app.config['ELASTIC_HOST'],
                                port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    final_counter = 0
    error_msg = ''
    error_count = 0
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(file_handle):
            es.import_event(index_name,
                            event_type,
                            event,
                            timeline_id=timeline_id)
            final_counter += 1

        # Import the remaining events
        results = es.flush_queued_events()

        # The error_count key is assumed to be reported by
        # flush_queued_events(); it feeds the summary log message below.
        error_count = results.get('error_count', 0)
        error_container = results.get('error_container', {})
        error_msg = get_import_errors(error_container=error_container,
                                      index_name=index_name,
                                      total_count=results.get(
                                          'total_events', 0))

    except errors.DataIngestionError as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(timeline_id, status='fail', error_msg=error_msg)
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        logger.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    if error_count:
        logger.info(
            'Index timeline: [{0:s}] to index [{1:s}] - {2:d} out of {3:d} '
            'events imported (in total {4:d} errors were discovered) '.format(
                timeline_name, index_name, (final_counter - error_count),
                final_counter, error_count))
    else:
        logger.info('Index timeline: [{0:s}] to index [{1:s}] - {2:d} '
                    'events imported.'.format(timeline_name, index_name,
                                              final_counter))

    # Set status to ready when done
    _set_timeline_status(timeline_id, status='ready', error_msg=error_msg)

    return index_name
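
This variant summarizes failed imports through a get_import_errors helper that is not part of this listing. The sketch below is a hypothetical reconstruction showing one plausible shape for it, assuming the error container maps index names to a dict with an 'errors' list; the real structure produced by flush_queued_events() may differ.

def get_import_errors_sketch(error_container, index_name, total_count):
    """Hypothetical sketch of the error-summary helper used above."""
    # Assumed structure: {index_name: {'errors': ['message', ...]}}.
    error_list = error_container.get(index_name, {}).get('errors', [])
    if not error_list:
        return ''
    return ('{0:d} out of {1:d} events failed to import, '
            'first error: {2!s}'.format(
                len(error_list), total_count, error_list[0]))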
Example No. 32
class ApiDataFetcher(interface.DataFetcher):
    """Data Fetcher for an API story exporter."""

    def __init__(self):
        """Initialize the data fetcher."""
        super(ApiDataFetcher, self).__init__()
        self._datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])

    def get_aggregation(self, agg_dict):
        """Returns an aggregation object from an aggregation dict.

        Args:
            agg_dict (dict): a dictionary containing information
                about the stored aggregation.

        Returns:
            A dict with metadata information as well as the aggregation
            object (instance of AggregationResult) from a saved aggregation
            or an empty dict if not found.
        """
        aggregation_id = agg_dict.get('id')
        if not aggregation_id:
            return {}

        aggregation = Aggregation.query.get(aggregation_id)
        if not aggregation:
            return {}

        try:
            agg_class = aggregator_manager.AggregatorManager.get_aggregator(
                aggregation.agg_type)
        except KeyError:
            return {}

        if not agg_class:
            return {}
        aggregator = agg_class(sketch_id=self._sketch_id)
        parameter_string = aggregation.parameters
        parameters = json.loads(parameter_string)
        data = {
            'aggregation': aggregator.run(**parameters),
            'name': aggregation.name,
            'description': aggregation.description,
            'agg_type': aggregation.agg_type,
            'parameters': parameters,
            'chart_type': aggregation.chart_type,
            'user': aggregation.user,
        }
        return data

    def get_aggregation_group(self, agg_dict):
        """Returns an aggregation object from an aggregation dict.

        Args:
            agg_dict (dict): a dictionary containing information
                about the stored aggregation.

        Returns:
            A dict that contains metadata about the aggregation group
            as well as a chart object (instance of altair.Chart)
            with the combined chart object from the group.
        """
        group_id = agg_dict.get('id')
        if not group_id:
            return None

        group = AggregationGroup.query.get(group_id)
        if not group:
            return None

        orientation = group.orientation

        result_chart = None
        for aggregator in group.aggregations:
            if aggregator.parameters:
                aggregator_parameters = json.loads(aggregator.parameters)
            else:
                aggregator_parameters = {}

            agg_class = aggregator_manager.AggregatorManager.get_aggregator(
                aggregator.agg_type)
            if not agg_class:
                continue

            aggregator_obj = agg_class(sketch_id=self._sketch_id)
            chart_type = aggregator_parameters.pop('supported_charts', None)
            color = aggregator_parameters.pop('chart_color', '')
            result_obj = aggregator_obj.run(**aggregator_parameters)

            chart = result_obj.to_chart(
                chart_name=chart_type,
                chart_title=aggregator_obj.chart_title,
                as_chart=True, interactive=True, color=color)

            if result_chart is None:
                result_chart = chart
            elif orientation == 'horizontal':
                result_chart = alt.hconcat(chart, result_chart)
            elif orientation == 'vertical':
                result_chart = alt.vconcat(chart, result_chart)
            else:
                result_chart = alt.layer(chart, result_chart)

        data = {
            'name': group.name,
            'description': group.description,
            'chart': result_chart,
            'parameters': group.parameters,
            'orientation': group.orientation,
            'user': group.user,
        }
        return data

    def get_view(self, view_dict):
        """Returns a data frame from a view dict.

        Args:
            view_dict (dict): a dictionary containing information
                about the stored view.

        Returns:
            A pandas DataFrame with the results from a view aggregation.
        """
        view_id = view_dict.get('id')
        if not view_id:
            return pd.DataFrame()

        view = View.query.get(view_id)
        if not view:
            return pd.DataFrame()

        if not view.query_string and not view.query_dsl:
            return pd.DataFrame()

        query_filter = view.query_filter
        if query_filter and isinstance(query_filter, str):
            query_filter = json.loads(query_filter)
        elif not query_filter:
            query_filter = {'indices': '_all', 'size': 100}

        if view.query_dsl:
            query_dsl = json.loads(view.query_dsl)
        else:
            query_dsl = None

        sketch = Sketch.query.get_with_acl(self._sketch_id)
        sketch_indices = [
            t.searchindex.index_name
            for t in sketch.active_timelines
        ]

        results = self._datastore.search_stream(
            sketch_id=self._sketch_id,
            query_string=view.query_string,
            query_filter=query_filter,
            query_dsl=query_dsl,
            indices=sketch_indices,
        )
        result_list = [x.get('_source') for x in results]
        return pd.DataFrame(result_list)
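
The aggregation group logic in get_aggregation_group() combines per-aggregator charts with Altair's hconcat, vconcat and layer operators depending on the stored orientation. The following standalone sketch reproduces that composition pattern with made-up data, independent of Timesketch.

import altair as alt
import pandas as pd

# Standalone sketch of the chart composition used in get_aggregation_group(),
# with made-up data.
frame = pd.DataFrame({'domain': ['a.com', 'b.com', 'c.com'],
                      'count': [42, 17, 5]})
chart_a = alt.Chart(frame).mark_bar().encode(x='domain', y='count')
chart_b = alt.Chart(frame).mark_line().encode(x='domain', y='count')

orientation = 'horizontal'  # one of: horizontal, vertical, layered
if orientation == 'horizontal':
    combined = alt.hconcat(chart_a, chart_b)
elif orientation == 'vertical':
    combined = alt.vconcat(chart_a, chart_b)
else:
    combined = alt.layer(chart_a, chart_b)

combined.save('aggregation_group.html')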