def run_csv_jsonl(source_file_path, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        source_file_path: Path to CSV or JSONL file.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index, or None if the import failed with an
        unexpected error.
    """
    event_type = 'generic_event'  # Document type for Elasticsearch

    # Dispatch to the matching reader/validator for the file format.
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(source_file_path):
            es.import_event(index_name, event_type, event)
        # Import the remaining events
        es.flush_queued_events()
    except (ImportError, NameError, UnboundLocalError):
        # Programming errors should surface to Celery, not be swallowed.
        raise
    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task.
        # BUG FIX: traceback.format_exc() takes an optional `limit` int,
        # not an exception instance — passing `e` raised a TypeError here
        # and masked the original import error.
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error(error_msg)
        return None

    # Set status to ready when done
    _set_timeline_status(index_name, status='ready')

    return index_name
def run_csv_jsonl(file_path, events, timeline_name, index_name, source_type):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the JSON or CSV file.
        events: A string with the events.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.

    Returns:
        Name (str) of the index.
    """
    # Inline events take precedence over the file on disk; inline data is
    # always treated as JSONL.
    if events:
        source_type = 'jsonl'
        handle = io.StringIO(events)
    else:
        handle = codecs.open(
            file_path, 'r', encoding='utf-8', errors='replace')

    # Document type for Elasticsearch.
    doc_type = 'generic_event'

    # Pick the reader/validator that matches the source format.
    parser_by_type = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    parse = parser_by_type.get(source_type)

    # Log information to Celery.
    logging.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    datastore = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    # The broad exception catch below exists so that any possible error is
    # captured and the task exits with the timeline marked as failed.
    try:
        datastore.create_index(index_name=index_name, doc_type=doc_type)
        for parsed_event in parse(handle):
            datastore.import_event(index_name, doc_type, parsed_event)
        # Flush whatever is still queued.
        datastore.flush_queued_events()
    except errors.DataIngestionError as e:
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise
    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(index_name, status='fail', error_msg=str(e))
        raise
    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task.
        error_msg = traceback.format_exc()
        _set_timeline_status(index_name, status='fail', error_msg=error_msg)
        logging.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    # All events imported: flag the timeline as ready.
    _set_timeline_status(index_name, status='ready')
    return index_name
def run_csv_jsonl(
        file_path, events, timeline_name, index_name, source_type,
        timeline_id):
    """Create a Celery task for processing a CSV or JSONL file.

    Args:
        file_path: Path to the JSON or CSV file.
        events: A string with the events.
        timeline_name: Name of the Timesketch timeline.
        index_name: Name of the datastore index.
        source_type: Type of file, csv or jsonl.
        timeline_id: ID of the timeline object this data belongs to.

    Returns:
        Name (str) of the index, or None if the import failed with an
        unexpected error.
    """
    if events:
        file_handle = io.StringIO(events)
        source_type = 'jsonl'
    else:
        file_handle = codecs.open(
            file_path, 'r', encoding='utf-8', errors='replace')

    event_type = 'generic_event'  # Document type for Elasticsearch

    # Dispatch to the matching reader/validator for the file format.
    validators = {
        'csv': read_and_validate_csv,
        'jsonl': read_and_validate_jsonl,
    }
    read_and_validate = validators.get(source_type)

    # Log information to Celery
    logger.info(
        'Index timeline [{0:s}] to index [{1:s}] (source: {2:s})'.format(
            timeline_name, index_name, source_type))

    es = ElasticsearchDataStore(
        host=current_app.config['ELASTIC_HOST'],
        port=current_app.config['ELASTIC_PORT'])

    # Reason for the broad exception catch is that we want to capture
    # all possible errors and exit the task.
    final_counter = 0
    error_msg = ''
    error_count = 0
    try:
        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate(file_handle):
            es.import_event(
                index_name, event_type, event, timeline_id=timeline_id)
            final_counter += 1

        # Import the remaining events
        results = es.flush_queued_events()

        error_container = results.get('error_container', {})
        # BUG FIX: error_count was initialized to 0 and never assigned, so
        # the partial-import summary below was unreachable and the success
        # log claimed a full import even when the flush reported errors.
        # NOTE(review): assumes flush_queued_events() reports an
        # 'error_count' in its result dict — confirm against the datastore
        # implementation.
        error_count = results.get('error_count', 0)
        error_msg = get_import_errors(
            error_container=error_container,
            index_name=index_name,
            total_count=results.get('total_events', 0))

    except errors.DataIngestionError as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(
            index_name=index_name, data_store=es, timeline_id=timeline_id)
        raise

    except (RuntimeError, ImportError, NameError, UnboundLocalError,
            RequestError) as e:
        _set_timeline_status(timeline_id, status='fail', error_msg=str(e))
        _close_index(
            index_name=index_name, data_store=es, timeline_id=timeline_id)
        raise

    except Exception as e:  # pylint: disable=broad-except
        # Mark the searchindex and timelines as failed and exit the task
        error_msg = traceback.format_exc()
        _set_timeline_status(timeline_id, status='fail', error_msg=error_msg)
        _close_index(
            index_name=index_name, data_store=es, timeline_id=timeline_id)
        logger.error('Error: {0!s}\n{1:s}'.format(e, error_msg))
        return None

    if error_count:
        logger.info(
            'Index timeline: [{0:s}] to index [{1:s}] - {2:d} out of {3:d} '
            'events imported (in total {4:d} errors were discovered) '.format(
                timeline_name, index_name, (final_counter - error_count),
                final_counter, error_count))
    else:
        logger.info(
            'Index timeline: [{0:s}] to index [{1:s}] - {2:d} '
            'events imported.'.format(
                timeline_name, index_name, final_counter))

    # Set status to ready when done
    _set_timeline_status(timeline_id, status='ready', error_msg=error_msg)

    return index_name