Example #1
def run_csv_jsonl(source_file_path, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        source_file_path: Path to the CSV or JSONL file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index, or None if the import failed.
    """
    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl
    }
    read_and_validate = validators.get(source_type)
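    # Note: validators.get() returns None for an unrecognized source_type,
    # which surfaces later as a TypeError when read_and_validate is called.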

    # Log status information to the Celery worker log
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(source_file_path):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        es.flush_queued_events()

    except (ImportError, NameError, UnboundLocalError):
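        # Re-raise programming errors so they propagate to Celery instead
        # of being handled as data errors by the broad except below.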
        raise

    except Exception:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error(error_msg)
        return None

    # Set status to ready when done
    _set_timeline_status(index_name, status='ready')

    return index_name
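
A minimal usage sketch, assuming the function is registered with Celery's
task decorator (the docstring calls it a Celery task) and that a Flask
application context supplies current_app; the path and names are hypothetical:

# Hypothetical invocation; .delay() exists only if run_csv_jsonl is
# decorated as a Celery task.
result = run_csv_jsonl.delay(
    '/tmp/events.csv',  # source_file_path (hypothetical)
    'my-timeline',      # timeline_name
    'abc123',           # index_name
    'csv')              # source_type
# AsyncResult.get() blocks until the worker finishes and returns the
# index name, or None if the import failed.
index_name = result.get()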
Example #2
def run_csv_jsonl(file_path, events, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the CSV or JSONL file.
        events: A string with JSONL-serialized events; when set, it is
            ingested instead of file_path and source_type is forced to
            jsonl.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index, or None if the import failed.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = 'jsonl'
    else:
        file_handle = codecs.open(file_path,
                                  'r',
                                  encoding='utf-8',
                                  errors='replace')

    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log status information to the Celery worker log
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(host=current_app.config['ELASTIC_HOST'],
                                port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(file_handle):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        es.flush_queued_events()

    except errors.DataIngestionError as e:
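        # A known ingestion failure: record it on the timeline, then
        # re-raise so Celery marks the task itself as failed.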
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    # Set status to ready when done
    _set_timeline_status(index_name, status='ready')

    return index_name
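
This version adds the events parameter, which short-circuits the file read.
A sketch of that path with hypothetical values, assuming the usual mandatory
Timesketch fields (message, datetime, timestamp_desc) in each JSONL line;
note that a non-empty events string forces source_type to 'jsonl':

events = ('{"message": "Login", "datetime": "2021-01-01T00:00:00",'
          ' "timestamp_desc": "Event Time"}')
result = run_csv_jsonl.delay(
    None,            # file_path is ignored when events is set
    events,          # one JSON document per line
    'my-timeline',   # timeline_name
    'abc123',        # index_name
    'csv')           # overridden to 'jsonl' inside the task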
Example #3
def run_csv_jsonl(file_path, events, timeline_name, index_name, source_type,
                  timeline_id):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the CSV or JSONL file.
        events: A string with JSONL-serialized events; when set, it is
            ingested instead of file_path and source_type is forced to
            jsonl.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        timeline_id: ID of the timeline object this data belongs to.

    Returns:
        Name (str) of the index, or None if the import failed.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = 'jsonl'
    else:
        file_handle = codecs.open(file_path,
                                  'r',
                                  encoding='utf-8',
                                  errors='replace')

    event_type = 'generic_event'  # Document type for Elasticsearch
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log status information to the Celery worker log
    logger.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(host=current_app.config['ELASTIC_HOST'],
                                port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    final_counter = 0
    error_msg = ''
    error_count = 0
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(file_handle):
            es.import_event(index_name,
                            event_type,
                            event,
                            timeline_id=timeline_id)
            final_counter += 1

        # Import the remaining events
        results = es.flush_queued_events()

        error_container = results.get('error_container', {})
        # Derive the failure count from the error container (assuming the
        # {index_name: {'errors': [...]}} shape consumed by
        # get_import_errors); without this, error_count stays at 0 and the
        # summary log below could never report failed events.
        error_count = len(
            error_container.get(index_name, {}).get('errors', []))
        error_msg = get_import_errors(error_container=error_container,
                                      index_name=index_name,
                                      total_count=results.get(
                                          'total_events', 0))

    except errors.DataIngestionError as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
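        # Close the index so the failed import does not leave it open in
        # the datastore.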
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(timeline_id, status='fail', error_msg=error_msg)
        _close_index(index_name=index_name,
                     data_store=es,
                     timeline_id=timeline_id)
        logger.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    if error_count:
        logger.info(
            'Index timeline: [{0:s}] to index [{1:s}] - {2:d} out of {3:d} '
            'events imported ({4:d} errors were discovered in total).'.format(
                timeline_name, index_name, (final_counter - error_count),
                final_counter, error_count))
    else:
        logger.info('Index timeline: [{0:s}] to index [{1:s}] - {2:d} '
                    'events imported.'.format(timeline_name, index_name,
                                              final_counter))

    # Set status to ready when done
    _set_timeline_status(timeline_id, status='ready', error_msg=error_msg)

    return index_name
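
The same hypothetical invocation for this latest signature, which threads the
timeline's database ID through to es.import_event so indexed events can be
tied back to a specific timeline object:

result = run_csv_jsonl.delay(
    '/tmp/events.jsonl',  # file_path (hypothetical)
    None,                 # events: None, so the file is read from disk
    'my-timeline',        # timeline_name
    'abc123',             # index_name
    'jsonl',              # source_type
    42)                   # timeline_id (hypothetical database ID)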