Example #1
def run_csv_jsonl(source_file_path,
                  timeline_name,
                  index_name,
                  source_type,
                  delimiter=None,
                  username=None):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        source_file_path: Path to CSV or JSONL file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        delimiter: Character used as a field separator.
        username: Username of the user who will own the timeline.

    Returns:
        Dictionary with count of processed events.
    """
    event_type = u'generic_event'  # Document type for Elasticsearch
    validators = {
        u'csv': read_and_validate_csv,
        u'jsonl': read_and_validate_jsonl
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logging.info(u'Index name: %s', index_name)
    logging.info(u'Timeline name: %s', timeline_name)
    logging.info(u'Source type: %s', source_type)
    logging.info(u'Document type: %s', event_type)
    logging.info(u'Owner: %s', username)

    es = ElasticsearchDataStore(host=current_app.config[u'ELASTIC_HOST'],
                                port=current_app.config[u'ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(source_file_path, delimiter):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        total_events = es.import_event(index_name, event_type)
    except Exception as e:
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status=u'fail', error_msg=error_msg)
        logging.error(error_msg)
        return

    # Set status to ready when done
    _set_timeline_status(index_name, status=u'ready')

    return {u'Events processed': total_events}
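
A note on the lookup at the top of this example: validators.get(source_type)
returns None for an unrecognized source type, so the later call to
read_and_validate(...) fails with a TypeError. A minimal, hypothetical sketch
of a stricter lookup (get_validator is an illustrative helper, not part of
Timesketch; it assumes the same read_and_validate_csv and
read_and_validate_jsonl callables used above):

def get_validator(source_type):
    """Return the reader for a supported source type or raise ValueError."""
    validators = {
        u'csv': read_and_validate_csv,
        u'jsonl': read_and_validate_jsonl,
    }
    try:
        return validators[source_type]
    except KeyError:
        raise ValueError(
            u'Unsupported source type: {0:s} (expected csv or jsonl)'.format(
                source_type))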
Example #2
def run_csv_jsonl(source_file_path, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        source_file_path: Path to CSV or JSONL file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index.
    """
    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(source_file_path):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        es.flush_queued_events()

    except (ImportError, NameError, UnboundLocalError):
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error(error_msg)
        return None

    # Set status to ready when done
    _set_timeline_status(index_name, status='ready')

    return index_name
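
Both this example and the previous one log the full traceback before bailing
out of the task. traceback.format_exc() formats the exception currently being
handled and takes no exception argument (only an optional limit, plus chain on
Python 3). A small self-contained sketch of that error-handling pattern
(safe_divide is a made-up example function, not Timesketch code):

import logging
import traceback

def safe_divide(a, b):
    """Divide a by b, logging the full traceback on any failure."""
    try:
        return a / b
    except Exception:  # pylint: disable=broad-except
        # format_exc() formats the exception being handled; no argument needed.
        error_msg = traceback.format_exc()
        logging.error(error_msg)
        return None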
Example #3
def run_csv(source_file_path, timeline_name, index_name, username=None):
    """Create a Celery task for processing a CSV file.

    Args:
        source_file_path: Path to CSV file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        username: Username of the user who will own the timeline.

    Returns:
        Dictionary with count of processed events.
    """
    flush_interval = 1000  # events to queue before bulk index
    event_type = u'generic_event'  # Document type for Elasticsearch
    app = create_app()

    # Log information to Celery
    logging.info(u'Index name: %s', index_name)
    logging.info(u'Timeline name: %s', timeline_name)
    logging.info(u'Flush interval: %d', flush_interval)
    logging.info(u'Document type: %s', event_type)
    logging.info(u'Owner: %s', username)

    es = ElasticsearchDataStore(
        host=current_app.config[u'ELASTIC_HOST'],
        port=current_app.config[u'ELASTIC_PORT'])

    es.create_index(index_name=index_name, doc_type=event_type)
    for event in read_and_validate_csv(source_file_path):
        es.import_event(
            flush_interval, index_name, event_type, event)

    # Import the remaining events
    total_events = es.import_event(flush_interval, index_name, event_type)

    # We are done so let's remove the processing status flag
    with app.app_context():
        search_index = SearchIndex.query.filter_by(
            index_name=index_name).first()
        search_index.status.remove(search_index.status[0])
        db_session.add(search_index)
        db_session.commit()

    return {u'Events processed': total_events}
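
This older variant passes an explicit flush_interval so that events are queued
and bulk-indexed in batches instead of issuing one request per event. A
generic sketch of that batching idea, independent of the datastore (send_bulk
is a stand-in callable, not the Timesketch or Elasticsearch API):

def bulk_import(events, send_bulk, flush_interval=1000):
    """Queue events and hand them to send_bulk in flush_interval-sized batches."""
    queue = []
    total = 0
    for event in events:
        queue.append(event)
        if len(queue) >= flush_interval:
            send_bulk(queue)
            total += len(queue)
            queue = []
    if queue:
        # Flush whatever is left after the loop, mirroring the final
        # import_event() call above.
        send_bulk(queue)
        total += len(queue)
    return total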
Example #4
def run_csv_jsonl(file_path, events, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the JSON or CSV file.
        events: A string with the events.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = 'jsonl'
    else:
        file_handle = codecs.open(file_path,
                                  'r',
                                  encoding='utf-8',
                                  errors='replace')

    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(host=current_app.config['ELASTIC_HOST'],
                                port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(file_handle):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        es.flush_queued_events()

    except errors.DataIngestionError as e:
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    # Set status to ready when done
    _set_timeline_status(index_name, status='ready')

    return index_name
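
The file-handle selection at the top of this example works because both
io.StringIO and codecs.open(..., errors='replace') return text-mode objects
that can be iterated line by line, so the readers do not need to care where
the events came from. The same idea in isolation (open_event_stream and the
sample payload are illustrative only):

import codecs
import io

def open_event_stream(events=None, file_path=None):
    """Return a text stream for inline events or for a file on disk."""
    if events:
        # Events handed over directly as a string, e.g. a JSONL payload.
        return io.StringIO(events)
    # Otherwise read the file, replacing undecodable bytes instead of failing.
    return codecs.open(file_path, 'r', encoding='utf-8', errors='replace')

for line in open_event_stream(events='{"message": "example"}\n'):
    print(line.strip())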
Example #5
def run_csv_jsonl(file_path, events, timeline_name, index_name, source_type,
                  timeline_id):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the JSON or CSV file.
        events: A string with the events.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        timeline_id: ID of the timeline object this data belongs to.

    Returns:
        Name (str) of the index.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = 'jsonl'
    else:
        file_handle = codecs.open(file_path,
                                  'r',
                                  encoding='utf-8',
                                  errors='replace')

    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logger.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(host=current_app.config['ELASTIC_HOST'],
                                port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    final_counter = 0
    error_msg = ''
    error_count = 0
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(file_handle):
            es.import_event(index_name,
                            event_type,
                            event,
                            timeline_id=timeline_id)
            final_counter += 1

        # Import the remaining events
        results = es.flush_queued_events()

        error_container = results.get('error_container', {})
        error_msg = get_import_errors(error_container=error_container,
                                      index_name=index_name,
                                      total_count=results.get(
                                          'total_events', 0))

    except errors.DataIngestionError as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(timeline_id, status='fail', error_msg=error_msg)
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        logger.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    if error_count:
        logger.info(
            'Index timeline: [{0:s}] to index [{1:s}] - {2:d} out of {3:d} '
            'events imported (in total {4:d} errors were discovered) '.format(
                timeline_name, index_name, (final_counter - error_count),
                final_counter, error_count))
    else:
        logger.info('Index timeline: [{0:s}] to index [{1:s}] - {2:d} '
                    'events imported.'.format(timeline_name, index_name,
                                              final_counter))

    # Set status to ready when done
    _set_timeline_status(timeline_id, status='ready', error_msg=error_msg)

    return index_name
Example #6
class SimilarityScorer(object):
    """Score events based on Jaccard distance."""
    def __init__(self, index, data_type):
        """Initializes a similarity scorer.

        Args:
            index: Elasticsearch index name.
            data_type: Name of the data_type.
        """
        self._datastore = ElasticsearchDataStore(
            host=current_app.config['ELASTIC_HOST'],
            port=current_app.config['ELASTIC_PORT'])
        self._config = SimilarityScorerConfig(index, data_type)

    def _shingles_from_text(self, text):
        """Splits string into words.

        Args:
            text: String to extract words from.

        Returns:
            List of words.
        """
        # TODO: Remove stopwords using the NLTK python package.
        # TODO: Remove configured patterns from string.
        delimiters = self._config.delimiters
        return re.split('|'.join(delimiters), text)

    def _minhash_from_text(self, text):
        """Calculate minhash of text.

        Args:
            text: String to calculate minhash of.

        Returns:
            A minhash (instance of datasketch.minhash.MinHash)
        """
        minhash = MinHash(self._config.num_perm)
        for word in self._shingles_from_text(text):
            minhash.update(word.encode('utf8'))
        return minhash

    def _new_lsh_index(self):
        """Create a new LSH from a set of Timesketch events.

        Returns:
            A tuple with an LSH (instance of datasketch.lsh.LSH) and a
            dictionary with event ID as key and minhash as value.
        """
        minhashes = {}
        lsh = MinHashLSH(self._config.threshold, self._config.num_perm)

        # Event generator for streaming Elasticsearch results.
        events = self._datastore.search_stream(
            query_string=self._config.query,
            query_filter={},
            indices=[self._config.index],
            return_fields=[self._config.field])

        with lsh.insertion_session() as lsh_session:
            for event in events:
                event_id = event['_id']
                index_name = event['_index']
                event_type = event['_type']
                event_text = event['_source'][self._config.field]

                # Insert minhash in LSH index
                key = (event_id, event_type, index_name)
                minhash = self._minhash_from_text(event_text)
                minhashes[key] = minhash
                lsh_session.insert(key, minhash)

        return lsh, minhashes

    @staticmethod
    def _calculate_score(lsh, minhash, total_num_events):
        """Calculate a score based on Jaccard distance.

        The score is calculated from how many similar events there are for
        the event being scored. These similar events are called neighbours,
        and the score is simply the number of neighbours the event has
        divided by the total number of events in the LSH.

        Args:
            lsh: Instance of datasketch.lsh.MinHashLSH
            minhash: Instance of datasketch.minhash.MinHash
            total_num_events: Integer of how many events in the LSH

        Returns:
            A float between 0 and 1.
        """
        neighbours = lsh.query(minhash)
        return float(len(neighbours)) / float(total_num_events)

    def _update_event(self, event_id, event_type, index_name, score):
        """Add a similarity_score attribute to the event in Elasticsearch.

        Args:
            event_id: ID of the Elasticsearch document.
            event_type: The Elasticsearch type of the event.
            index_name: The name of the index in Elasticsearch.
            score: A numerical similarity score with value between 0 and 1.
        """
        update_doc = {'similarity_score': score}
        self._datastore.import_event(index_name,
                                     event_type,
                                     event_id=event_id,
                                     event=update_doc)

    def run(self):
        """Entry point for a SimilarityScorer.

        Returns:
            A dict with metadata about the processed data set.
        """
        lsh, minhashes = self._new_lsh_index()
        total_num_events = len(minhashes)
        for key, minhash in minhashes.items():
            event_id, event_type, index_name = key
            score = self._calculate_score(lsh, minhash, total_num_events)
            self._update_event(event_id, event_type, index_name, score)

        return dict(index=self._config.index,
                    data_type=self._config.data_type,
                    num_events_processed=total_num_events)
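
The scoring in _calculate_score is simply the fraction of LSH neighbours over
the total number of indexed events. A self-contained sketch of that
calculation with datasketch (the sample strings, threshold and num_perm values
are illustrative, not Timesketch configuration):

from datasketch import MinHash, MinHashLSH

# Toy corpus standing in for event message strings.
texts = {
    'event-1': 'user logged in from 10.0.0.1',
    'event-2': 'user logged in from 10.0.0.2',
    'event-3': 'scheduled task completed successfully',
}

def minhash_from_text(text, num_perm=128):
    """Build a MinHash from the whitespace-separated words of a string."""
    minhash = MinHash(num_perm=num_perm)
    for word in text.split():
        minhash.update(word.encode('utf8'))
    return minhash

lsh = MinHashLSH(threshold=0.5, num_perm=128)
minhashes = {key: minhash_from_text(text) for key, text in texts.items()}
for key, minhash in minhashes.items():
    lsh.insert(key, minhash)

total_num_events = len(minhashes)
for key, minhash in minhashes.items():
    # Same idea as SimilarityScorer._calculate_score: neighbours / total.
    score = float(len(lsh.query(minhash))) / float(total_num_events)
    print(key, score)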