def _StopMonitoringProcess(self, process):
  """Stops monitoring a process.

  Args:
    process (MultiProcessBaseProcess): process.

  Raises:
    KeyError: if the process is not monitored.
    ValueError: if the process is missing.
  """
  if process is None:
    raise ValueError('Missing process.')

  pid = process.pid
  self._RaiseIfNotMonitored(pid)

  del self._process_information_per_pid[pid]

  rpc_client = self._rpc_clients_per_pid.get(pid, None)
  if rpc_client:
    rpc_client.Close()
    del self._rpc_clients_per_pid[pid]

  if pid in self._rpc_errors_per_pid:
    del self._rpc_errors_per_pid[pid]

  logger.debug('Stopped monitoring process: {0:s} (PID: {1:d})'.format(
      process.name, pid))
def CreateRetryTask(self):
  """Creates a task to retry a previously abandoned task.

  Returns:
    Task: a task that was abandoned but should be retried, or None if there
        are no abandoned tasks that should be retried.
  """
  with self._lock:
    abandoned_task = self._GetTaskPendingRetry()
    if not abandoned_task:
      return None

    # The abandoned task is kept in _tasks_abandoned so it can still be
    # identified in CheckTaskToMerge and UpdateTaskAsPendingMerge.
    retry_task = abandoned_task.CreateRetryTask()
    logger.debug('Retrying task {0:s} as {1:s}.'.format(
        abandoned_task.identifier, retry_task.identifier))

    self._tasks_queued[retry_task.identifier] = retry_task
    self._total_number_of_tasks += 1

    self.SampleTaskStatus(retry_task, 'created_retry')

    return retry_task
def CreateTask(self, session_identifier,
               storage_format=definitions.STORAGE_FORMAT_SQLITE):
  """Creates a task.

  Args:
    session_identifier (str): the identifier of the session the task is
        part of.
    storage_format (Optional[str]): the storage format that the task should
        be stored in.

  Returns:
    Task: task attribute container.
  """
  task = tasks.Task(session_identifier)
  task.storage_format = storage_format
  logger.debug('Created task: {0:s}.'.format(task.identifier))

  with self._lock:
    self._tasks_queued[task.identifier] = task
    self._total_number_of_tasks += 1

    self.SampleTaskStatus(task, 'created')

  return task
def _AbandonInactiveProcessingTasks(self):
  """Marks processing tasks that exceed the inactive time as abandoned.

  This method does not lock the manager and should be called by a method
  holding the manager lock.
  """
  if self._tasks_processing:
    inactive_time = time.time() - self._TASK_INACTIVE_TIME
    inactive_time = int(inactive_time * definitions.MICROSECONDS_PER_SECOND)

    # Abandon all tasks after they're identified so as not to modify the
    # dict while iterating over it.
    tasks_to_abandon = []
    for task_identifier, task in self._tasks_processing.items():
      if task.last_processing_time < inactive_time:
        logger.debug('Abandoned processing task: {0:s}.'.format(
            task_identifier))

        self.SampleTaskStatus(task, 'abandoned_processing')
        tasks_to_abandon.append((task_identifier, task))

    for task_identifier, task in tasks_to_abandon:
      self._tasks_abandoned[task_identifier] = task
      del self._tasks_processing[task_identifier]
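
# A minimal standalone sketch of the inactive-time check used above: a task
# whose last processing time (a microsecond timestamp) predates the cutoff is
# considered abandoned. The constant values here are illustrative assumptions,
# not the plaso definitions.
import time

MICROSECONDS_PER_SECOND = 1000000
TASK_INACTIVE_TIME = 5.0 * 60.0  # assumed timeout of five minutes


def is_inactive(last_processing_time):
  """Returns True if a microsecond timestamp predates the inactive cutoff."""
  cutoff = time.time() - TASK_INACTIVE_TIME
  cutoff = int(cutoff * MICROSECONDS_PER_SECOND)
  return last_processing_time < cutoff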
def _StopAnalysisProcesses(self, abort=False):
  """Stops the analysis processes.

  Args:
    abort (bool): True to indicate the stop is issued on abort.
  """
  logger.debug('Stopping analysis processes.')
  self._StopMonitoringProcesses()

  if abort:
    # Signal all the processes to abort.
    self._AbortTerminate()

  # Wake the processes to make sure that they are not blocked waiting for
  # new items on the queue.
  for event_queue in self._event_queues.values():
    event_queue.PushItem(plaso_queue.QueueAbort(), block=False)

  # Try waiting for the processes to exit normally.
  self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT)
  for event_queue in self._event_queues.values():
    event_queue.Close(abort=abort)

  if abort:
    # Kill any remaining processes.
    self._AbortKill()
  else:
    # Check if the processes are still alive and terminate them if necessary.
    self._AbortTerminate()
    self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT)

    for event_queue in self._event_queues.values():
      event_queue.Close(abort=True)
def _StartWorkerProcess(self, process_name):
  """Creates, starts, monitors and registers a worker process.

  Args:
    process_name (str): process name.

  Returns:
    MultiProcessWorkerProcess: extraction worker process or None if the
        process could not be started.
  """
  logger.debug('Starting worker process {0:s}'.format(process_name))

  queue_name = '{0:s} task queue'.format(process_name)
  task_queue = zeromq_queue.ZeroMQRequestConnectQueue(
      delay_open=True, linger_seconds=0, name=queue_name,
      port=self._task_queue_port,
      timeout_seconds=self._TASK_QUEUE_TIMEOUT_SECONDS)

  process = extraction_process.ExtractionWorkerProcess(
      task_queue, self.collection_filters_helper, self.knowledge_base,
      self._processing_configuration,
      enable_sigsegv_handler=self._enable_sigsegv_handler, name=process_name)

  # Remove all possible log handlers to prevent a child process from logging
  # to the main process log file and garbling the log. The log handlers are
  # recreated after the worker process has been started. Iterate over a copy
  # of the handlers list since removeHandler modifies it.
  for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
    handler.close()

  process.start()

  loggers.ConfigureLogging(
      debug_output=self._debug_output, filename=self._log_filename, mode='a',
      quiet_mode=self._quiet_mode)

  try:
    self._StartMonitoringProcess(process)

  except (IOError, KeyError) as exception:
    pid = process.pid
    logger.error((
        'Unable to monitor replacement worker process: {0:s} '
        '(PID: {1:d}) with error: {2!s}').format(
            process_name, pid, exception))

    self._TerminateProcess(process)
    return None

  self._RegisterProcess(process)

  self._last_worker_number += 1

  return process
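
# Why the handler loop above iterates over a copy: removing an item from a
# list while iterating it directly skips the element that slides into the
# freed slot. A self-contained demonstration, with a plain list standing in
# for logging.root.handlers:
handlers = ['a', 'b', 'c', 'd']
for handler in handlers:
  handlers.remove(handler)
print(handlers)  # ['b', 'd'] -- every other handler was skipped

handlers = ['a', 'b', 'c', 'd']
for handler in list(handlers):  # iterate a snapshot instead
  handlers.remove(handler)
print(handlers)  # []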
def SignalAbort(self):
  """Signals the process to abort."""
  self._abort = True

  if self._foreman_status_wait_event:
    logger.debug('Abort foreman status wait event')
    self._foreman_status_wait_event.set()

  if self._analysis_mediator:
    self._analysis_mediator.SignalAbort()
def _GetStatus(self):
  """Retrieves status information.

  Returns:
    dict[str, object]: status attributes, indexed by name.
  """
  logger.debug('Status update requested')

  if self._analysis_mediator:
    number_of_produced_event_tags = (
        self._analysis_mediator.number_of_produced_event_tags)
    number_of_produced_reports = (
        self._analysis_mediator.number_of_produced_analysis_reports)
  else:
    number_of_produced_event_tags = None
    number_of_produced_reports = None

  if self._process_information:
    used_memory = self._process_information.GetUsedMemory() or 0
  else:
    used_memory = 0

  if self._memory_profiler:
    self._memory_profiler.Sample('main', used_memory)

  # XML RPC does not support integer values > 2 GiB so we format them
  # as a string.
  used_memory = '{0:d}'.format(used_memory)

  status = {
      'display_name': '',
      'identifier': self._name,
      'number_of_consumed_analysis_warnings': None,
      'number_of_consumed_event_tags': None,
      'number_of_consumed_events': self._number_of_consumed_events,
      'number_of_consumed_reports': None,
      'number_of_consumed_sources': None,
      'number_of_produced_analysis_warnings': None,
      'number_of_produced_event_tags': number_of_produced_event_tags,
      'number_of_produced_events': None,
      'number_of_produced_reports': number_of_produced_reports,
      'number_of_produced_sources': None,
      'processing_status': self._status,
      'task_identifier': None,
      'used_memory': used_memory}

  if self._status in (
      definitions.STATUS_INDICATOR_ABORTED,
      definitions.STATUS_INDICATOR_COMPLETED):
    logger.debug('Set foreman status wait event')
    self._foreman_status_wait_event.set()

  return status
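
# The string conversion above works around the XML-RPC wire format, which
# only marshals 32-bit signed integers. A quick standalone check of that
# limit with the standard library:
import xmlrpc.client

print(xmlrpc.client.dumps((2**31 - 1,)))  # marshals fine
try:
  xmlrpc.client.dumps((2**31,))  # one past the limit
except OverflowError as exception:
  print(exception)  # int exceeds XML-RPC limits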
def _AbortJoin(self, timeout=None):
  """Aborts all registered processes by joining with the parent process.

  Args:
    timeout (int): number of seconds to wait for processes to join, where
        None represents no timeout.
  """
  for pid, process in self._processes_per_pid.items():
    logger.debug('Waiting for process: {0:s} (PID: {1:d}).'.format(
        process.name, pid))

    process.join(timeout=timeout)
    if not process.is_alive():
      logger.debug('Process {0:s} (PID: {1:d}) stopped.'.format(
          process.name, pid))
def HasPendingTasks(self):
  """Determines if there are tasks running or in need of retrying.

  Returns:
    bool: True if there are tasks that are active, ready to be merged or
        need to be retried.
  """
  logger.debug('Checking for pending tasks')
  with self._lock:
    self._AbandonInactiveProcessingTasks()

    if self._tasks_processing:
      return True

    # There are no tasks being processed, but we might be waiting for some
    # tasks to be merged.
    if self._HasTasksPendingMerge():
      return True

    # There are no tasks processing or pending merge, but there may still be
    # some waiting to be retried, so we check that.
    if self._HasTasksPendingRetry():
      return True

    # It is possible that a worker has processed a task and the foreman has
    # not been informed about it, since there is no feedback from the worker
    # when it pops a task from the queue. If we believe all the workers have
    # been idle for longer than the task inactive time (timeout), abandon
    # all queued tasks. This ensures that processing actually stops when the
    # foreman never gets an update from a worker.
    if self._tasks_queued:
      inactive_time = time.time() - self._TASK_INACTIVE_TIME
      inactive_time = int(
          inactive_time * definitions.MICROSECONDS_PER_SECOND)

      if self._latest_task_processing_time < inactive_time:
        self._AbandonQueuedTasks()

    if self._tasks_queued:
      return True

    if self._tasks_merging:
      return True

  # There are no tasks pending any work.
  return False
def _StopProcessStatusRPCServer(self):
  """Stops the process status RPC server."""
  if not self._rpc_server:
    return

  # Make sure the engine gets one more status update so it knows the worker
  # has completed.
  self._WaitForStatusNotRunning()

  self._rpc_server.Stop()
  self._rpc_server = None
  self.rpc_port.value = 0

  logger.debug(
      'Process: {0!s} process status RPC server stopped'.format(self._name))
def _RunProcess(self):
  """Runs the process."""
  # Prevent the KeyboardInterrupt from being raised inside the process.
  # This stops the process from generating a traceback when interrupted.
  signal.signal(signal.SIGINT, signal.SIG_IGN)

  # A SIGTERM signal handler is necessary to make sure IPC is cleaned up
  # correctly on terminate.
  signal.signal(signal.SIGTERM, self._SigTermHandler)

  # A SIGSEGV signal handler is necessary to try to indicate where the
  # worker failed.
  # WARNING the SIGSEGV handler will deadlock the process on a real segfault.
  if self._enable_sigsegv_handler:
    self._original_sigsegv_handler = signal.signal(
        signal.SIGSEGV, self._SigSegvHandler)

  self._pid = os.getpid()
  self._process_information = process_info.ProcessInfo(self._pid)

  # We need to set the is running status explicitly to True in case the
  # process completes before the engine is able to determine the status of
  # the process, such as in the unit tests.
  self._status_is_running = True

  # Logging needs to be configured before the first output otherwise we
  # mess up the logging of the parent process.
  loggers.ConfigureLogging(
      debug_output=self._debug_output, filename=self._log_filename,
      quiet_mode=self._quiet_mode)

  logger.debug('Process: {0!s} (PID: {1:d}) started'.format(
      self._name, self._pid))

  self._StartProcessStatusRPCServer()

  self._Main()

  self._StopProcessStatusRPCServer()

  logger.debug('Process: {0!s} (PID: {1:d}) stopped'.format(
      self._name, self._pid))

  # Make sure log files are cleanly closed.
  logging.shutdown()

  self._status_is_running = False
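
# A minimal sketch of the SIGINT strategy above: a child process that ignores
# SIGINT so that Ctrl-C in the terminal (which signals the whole process
# group) does not produce a traceback from every worker. This standalone
# example is illustrative only, not part of the engine.
import multiprocessing
import signal
import time


def _child():
  signal.signal(signal.SIGINT, signal.SIG_IGN)  # ignore Ctrl-C in the child
  time.sleep(1.0)  # stand-in for real work


if __name__ == '__main__':
  process = multiprocessing.Process(target=_child)
  process.start()
  process.join()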
def UpdateTaskAsProcessingByIdentifier(self, task_identifier):
  """Updates the task manager to reflect the task is processing.

  Args:
    task_identifier (str): unique identifier of the task.

  Raises:
    KeyError: if the task is not known to the task manager.
  """
  with self._lock:
    task_processing = self._tasks_processing.get(task_identifier, None)
    if task_processing:
      task_processing.UpdateProcessingTime()
      self._UpdateLatestProcessingTime(task_processing)
      return

    task_queued = self._tasks_queued.get(task_identifier, None)
    if task_queued:
      logger.debug('Task {0:s} was queued, now processing.'.format(
          task_identifier))
      self._tasks_processing[task_identifier] = task_queued
      del self._tasks_queued[task_identifier]

      task_queued.UpdateProcessingTime()
      self._UpdateLatestProcessingTime(task_queued)
      return

    task_abandoned = self._tasks_abandoned.get(task_identifier, None)
    if task_abandoned:
      del self._tasks_abandoned[task_identifier]
      self._tasks_processing[task_identifier] = task_abandoned
      logger.debug('Task {0:s} was abandoned, but now processing.'.format(
          task_identifier))

      task_abandoned.UpdateProcessingTime()
      self._UpdateLatestProcessingTime(task_abandoned)
      return

    if task_identifier in self._tasks_pending_merge:
      # No need to update the processing time, as this task is already
      # finished processing and is just waiting for merge.
      return

  # If we get here, we don't know what state the task is in, so raise.
  raise KeyError('Status of task {0:s} is unknown.'.format(task_identifier))
def _AbandonQueuedTasks(self):
  """Marks queued tasks abandoned.

  This method does not lock the manager and should be called by a method
  holding the manager lock.
  """
  # Abandon all tasks after they're identified so as not to modify the dict
  # while iterating over it.
  tasks_to_abandon = []
  for task_identifier, task in self._tasks_queued.items():
    logger.debug('Abandoned queued task: {0:s}.'.format(task_identifier))
    tasks_to_abandon.append((task_identifier, task))

  for task_identifier, task in tasks_to_abandon:
    self._tasks_abandoned[task_identifier] = task
    del self._tasks_queued[task_identifier]
def _FillEventSourceHeap(self, storage_writer, event_source_heap,
                         start_with_first=False):
  """Fills the event source heap with the available written event sources.

  Args:
    storage_writer (StorageWriter): storage writer for a session storage.
    event_source_heap (_EventSourceHeap): event source heap.
    start_with_first (Optional[bool]): True if the function should start
        with the first written event source.
  """
  if self._processing_profiler:
    self._processing_profiler.StartTiming('fill_event_source_heap')
    self._processing_profiler.StartTiming('get_event_source')

  if start_with_first:
    event_source = storage_writer.GetFirstWrittenEventSource()
  else:
    event_source = storage_writer.GetNextWrittenEventSource()

  if self._processing_profiler:
    self._processing_profiler.StopTiming('get_event_source')

  while event_source:
    event_source_heap.PushEventSource(event_source)
    if event_source_heap.IsFull():
      logger.debug('Event source heap is full.')
      break

    if self._processing_profiler:
      self._processing_profiler.StartTiming('get_event_source')

    event_source = storage_writer.GetNextWrittenEventSource()

    if self._processing_profiler:
      self._processing_profiler.StopTiming('get_event_source')

  if self._processing_profiler:
    self._processing_profiler.StopTiming('fill_event_source_heap')
def CompleteTask(self, task):
  """Completes a task.

  The task is complete and can be removed from the task manager.

  Args:
    task (Task): task.

  Raises:
    KeyError: if the task was not merging.
  """
  with self._lock:
    if task.identifier not in self._tasks_merging:
      raise KeyError('Task {0:s} was not merging.'.format(task.identifier))

    self.SampleTaskStatus(task, 'completed')

    del self._tasks_merging[task.identifier]

    logger.debug('Completed task {0:s}.'.format(task.identifier))
def _StopExtractionProcesses(self, abort=False):
  """Stops the extraction processes.

  Args:
    abort (bool): True to indicate the stop is issued on abort.
  """
  logger.debug('Stopping extraction processes.')

  self._StopMonitoringProcesses()

  if abort:
    # Signal all the processes to abort.
    self._AbortTerminate()

  logger.debug('Emptying task queue.')
  self._task_queue.Empty()

  # Wake the processes to make sure that they are not blocked waiting for
  # new items on the queue.
  for _ in self._processes_per_pid:
    try:
      self._task_queue.PushItem(plaso_queue.QueueAbort(), block=False)
    except errors.QueueFull:
      logger.warning('Task queue full, unable to push abort message.')

  # Try waiting for the processes to exit normally.
  self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT)
  self._task_queue.Close(abort=abort)

  if not abort:
    # Check if the processes are still alive and terminate them if necessary.
    self._AbortTerminate()
    self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT)
    self._task_queue.Close(abort=True)

  # Kill any lingering processes.
  self._AbortKill()
def RemoveTask(self, task):
  """Removes an abandoned task.

  Args:
    task (Task): task.

  Raises:
    KeyError: if the task was not abandoned, or the task was abandoned and
        was not retried.
  """
  with self._lock:
    if task.identifier not in self._tasks_abandoned:
      raise KeyError('Task {0:s} was not abandoned.'.format(task.identifier))

    if not task.has_retry:
      raise KeyError(
          'Will not remove task {0:s} without a retry task.'.format(
              task.identifier))

    del self._tasks_abandoned[task.identifier]

    logger.debug('Removed task {0:s}.'.format(task.identifier))
def _StartProcessStatusRPCServer(self):
  """Starts the process status RPC server."""
  if self._rpc_server:
    return

  self._rpc_server = plaso_xmlrpc.XMLProcessStatusRPCServer(self._GetStatus)

  hostname = 'localhost'

  # Try the PID as port number first, otherwise pick a random port between
  # 1024 and 60000.
  if self._pid < 1024 or self._pid > 60000:
    port = random.randint(1024, 60000)
  else:
    port = self._pid

  if not self._rpc_server.Start(hostname, port):
    port = 0
    for _ in range(self._NUMBER_OF_RPC_SERVER_START_ATTEMPTS):
      port = random.randint(1024, 60000)
      if self._rpc_server.Start(hostname, port):
        break

      port = 0

  if not port:
    logger.error((
        'Unable to start a process status RPC server for {0!s} '
        '(PID: {1:d})').format(self._name, self._pid))
    self._rpc_server = None
    return

  self.rpc_port.value = port

  logger.debug('Process: {0!s} process status RPC server started'.format(
      self._name))
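
# The port selection above can be sketched standalone: prefer the PID as the
# port when it falls in the usable range, then fall back to random retries.
# NUMBER_OF_ATTEMPTS and try_start_server are illustrative stand-ins, not
# plaso names.
import random

NUMBER_OF_ATTEMPTS = 14  # assumed retry count


def pick_status_port(pid, try_start_server):
  """Returns the port the server started on, or 0 on failure."""
  port = pid if 1024 <= pid <= 60000 else random.randint(1024, 60000)
  if try_start_server(port):
    return port
  for _ in range(NUMBER_OF_ATTEMPTS):
    port = random.randint(1024, 60000)
    if try_start_server(port):
      return port
  return 0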
def UpdateTaskAsPendingMerge(self, task):
  """Updates the task manager to reflect that the task is ready to be merged.

  Args:
    task (Task): task.

  Raises:
    KeyError: if the task was not queued, processing or abandoned, or if the
        task was abandoned and has a retry task.
  """
  with self._lock:
    is_abandoned = task.identifier in self._tasks_abandoned
    is_processing = task.identifier in self._tasks_processing
    is_queued = task.identifier in self._tasks_queued

    if not is_queued and not is_processing and not is_abandoned:
      raise KeyError('Status of task {0:s} is unknown.'.format(
          task.identifier))

    if is_abandoned and task.has_retry:
      raise KeyError('Will not merge task {0:s} with a retry task.'.format(
          task.identifier))

    if is_queued:
      logger.debug('Task {0:s} was queued, now merging.'.format(
          task.identifier))
      del self._tasks_queued[task.identifier]

    if is_processing:
      logger.debug('Task {0:s} was processing, now merging.'.format(
          task.identifier))
      del self._tasks_processing[task.identifier]

    if is_abandoned:
      logger.debug('Task {0:s} was abandoned, now merging.'.format(
          task.identifier))
      del self._tasks_abandoned[task.identifier]

    self._tasks_pending_merge.PushTask(task)

    self.SampleTaskStatus(task, 'pending_merge')

    task.UpdateProcessingTime()
    self._UpdateLatestProcessingTime(task)
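
# Taken together, the task manager methods above implement a small state
# machine. A sketch of the transitions they allow; the state labels are
# illustrative, not plaso identifiers, and a retry does not move the
# abandoned task itself but queues a new retry task in its place.
ALLOWED_TRANSITIONS = {
    'queued': {'processing', 'pending_merge', 'abandoned'},
    'processing': {'pending_merge', 'abandoned'},
    'abandoned': {'processing', 'pending_merge'},
    'pending_merge': {'merging'},
    'merging': {'completed'},
}


def can_transition(current_state, new_state):
  """Returns True if the sketched task state machine allows the move."""
  return new_state in ALLOWED_TRANSITIONS.get(current_state, set())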
def _AnalyzeEvents(self, storage_writer, analysis_plugins, event_filter=None):
  """Analyzes events in a Plaso storage.

  Args:
    storage_writer (StorageWriter): storage writer.
    analysis_plugins (dict[str, AnalysisPlugin]): analysis plugins that
        should be run and their names.
    event_filter (Optional[EventObjectFilter]): event filter.

  Returns:
    collections.Counter: counter containing information about the events
        processed and filtered.

  Raises:
    RuntimeError: if a non-recoverable situation is encountered.
  """
  self._status = definitions.STATUS_INDICATOR_RUNNING
  self._number_of_consumed_analysis_reports = 0
  self._number_of_consumed_events = 0
  self._number_of_consumed_event_tags = 0
  self._number_of_consumed_sources = 0
  self._number_of_produced_analysis_reports = 0
  self._number_of_produced_events = 0
  self._number_of_produced_event_tags = 0
  self._number_of_produced_sources = 0

  number_of_filtered_events = 0

  logger.debug('Processing events.')

  filter_limit = getattr(event_filter, 'limit', None)

  for event in storage_writer.GetSortedEvents():
    event_data_identifier = event.GetEventDataIdentifier()
    event_data = storage_writer.GetAttributeContainerByIdentifier(
        events.EventData.CONTAINER_TYPE, event_data_identifier)

    event_data_stream_identifier = event_data.GetEventDataStreamIdentifier()
    if event_data_stream_identifier:
      event_data_stream = storage_writer.GetAttributeContainerByIdentifier(
          events.EventDataStream.CONTAINER_TYPE,
          event_data_stream_identifier)
    else:
      event_data_stream = None

    event_identifier = event.GetIdentifier()
    event_tag = self._event_tag_index.GetEventTagByIdentifier(
        storage_writer, event_identifier)

    if event_filter:
      filter_match = event_filter.Match(
          event, event_data, event_data_stream, event_tag)
    else:
      filter_match = None

    # pylint: disable=singleton-comparison
    if filter_match == False:
      number_of_filtered_events += 1
      continue

    for event_queue in self._event_queues.values():
      # TODO: Check for premature exit of analysis plugins.
      event_queue.PushItem((event, event_data, event_data_stream))

    self._number_of_consumed_events += 1

    if (event_filter and filter_limit and
        filter_limit == self._number_of_consumed_events):
      break

  logger.debug('Finished pushing events to analysis plugins.')
  # Signal that we have finished adding events.
  for event_queue in self._event_queues.values():
    event_queue.PushItem(plaso_queue.QueueAbort(), block=False)

  logger.debug('Processing analysis plugin results.')

  # TODO: use a task based approach.
  plugin_names = list(analysis_plugins.keys())
  while plugin_names:
    for plugin_name in list(plugin_names):
      if self._abort:
        break

      # TODO: temporary solution.
      task = tasks.Task()
      task.storage_format = definitions.STORAGE_FORMAT_SQLITE
      task.identifier = plugin_name

      merge_ready = self._CheckTaskReadyForMerge(
          definitions.STORAGE_FORMAT_SQLITE, task)
      if merge_ready:
        self._PrepareMergeTaskStorage(definitions.STORAGE_FORMAT_SQLITE, task)
        self._status = definitions.STATUS_INDICATOR_MERGING

        event_queue = self._event_queues[plugin_name]
        del self._event_queues[plugin_name]
        event_queue.Close()

        task_storage_reader = self._GetMergeTaskStorage(
            definitions.STORAGE_FORMAT_SQLITE, task)

        try:
          merge_helper = merge_helpers.AnalysisTaskMergeHelper(
              task_storage_reader, task.identifier)
          logger.debug('Starting merge of task: {0:s}'.format(
              merge_helper.task_identifier))

          number_of_containers = self._MergeAttributeContainers(
              storage_writer, merge_helper)

          logger.debug('Merged {0:d} containers of task: {1:s}'.format(
              number_of_containers, merge_helper.task_identifier))

        finally:
          task_storage_reader.Close()

        self._RemoveMergeTaskStorage(definitions.STORAGE_FORMAT_SQLITE, task)

        self._status = definitions.STATUS_INDICATOR_RUNNING

        # TODO: temporary solution.
        plugin_names.remove(plugin_name)

  events_counter = collections.Counter()
  events_counter['Events filtered'] = number_of_filtered_events
  events_counter['Events processed'] = self._number_of_consumed_events

  return events_counter
def AnalyzeEvents(
    self, session, knowledge_base_object, storage_writer, data_location,
    analysis_plugins, processing_configuration, event_filter=None,
    event_filter_expression=None, status_update_callback=None,
    storage_file_path=None):
  """Analyzes events in a Plaso storage.

  Args:
    session (Session): session in which the events are analyzed.
    knowledge_base_object (KnowledgeBase): contains information from the
        source data needed for processing.
    storage_writer (StorageWriter): storage writer.
    data_location (str): path to the location that data files should be
        loaded from.
    analysis_plugins (dict[str, AnalysisPlugin]): analysis plugins that
        should be run and their names.
    processing_configuration (ProcessingConfiguration): processing
        configuration.
    event_filter (Optional[EventObjectFilter]): event filter.
    event_filter_expression (Optional[str]): event filter expression.
    status_update_callback (Optional[function]): callback function for status
        updates.
    storage_file_path (Optional[str]): path to the session storage file.

  Returns:
    ProcessingStatus: processing status.

  Raises:
    KeyboardInterrupt: if a keyboard interrupt was raised.
    ValueError: if analysis plugins are missing.
  """
  if not analysis_plugins:
    raise ValueError('Missing analysis plugins')

  abort_kill = False
  keyboard_interrupt = False
  queue_full = False

  self._analysis_plugins = {}
  self._data_location = data_location
  self._event_filter_expression = event_filter_expression
  self._events_status = processing_status.EventsStatus()
  self._knowledge_base = knowledge_base_object
  self._processing_configuration = processing_configuration
  self._session = session
  self._status_update_callback = status_update_callback
  self._storage_file_path = storage_file_path

  stored_event_labels_counter = {}
  if storage_writer.HasAttributeContainers('event_label_count'):
    stored_event_labels_counter = {
        event_label_count.label: event_label_count
        for event_label_count in storage_writer.GetAttributeContainers(
            'event_label_count')}

  self._event_labels_counter = collections.Counter()

  if storage_writer.HasAttributeContainers('parser_count'):
    parsers_counter = {
        parser_count.name: parser_count.number_of_events
        for parser_count in storage_writer.GetAttributeContainers(
            'parser_count')}

    total_number_of_events = parsers_counter['total']

  else:
    total_number_of_events = 0
    for stored_session in storage_writer.GetSessions():
      total_number_of_events += stored_session.parsers_counter['total']

  self._events_status.total_number_of_events = total_number_of_events

  # Set up the storage writer before the analysis processes.
  self._StartTaskStorage(definitions.STORAGE_FORMAT_SQLITE)

  self._StartAnalysisProcesses(analysis_plugins)

  self._StartProfiling(self._processing_configuration.profiling)

  # Start the status update thread after opening the storage writer so we
  # don't have to clean up the thread if the open fails.
  self._StartStatusUpdateThread()

  try:
    self._AnalyzeEvents(
        storage_writer, analysis_plugins, event_filter=event_filter)

    for key, value in self._event_labels_counter.items():
      event_label_count = stored_event_labels_counter.get(key, None)
      if event_label_count:
        event_label_count.number_of_events += value
        storage_writer.UpdateAttributeContainer(event_label_count)
      else:
        event_label_count = counts.EventLabelCount(
            label=key, number_of_events=value)
        storage_writer.AddAttributeContainer(event_label_count)

    self._status = definitions.STATUS_INDICATOR_FINALIZING

  except errors.QueueFull:
    queue_full = True
    self._abort = True

  except KeyboardInterrupt:
    keyboard_interrupt = True
    self._abort = True

  finally:
    self._processing_status.aborted = self._abort
    session.aborted = self._abort

    # Stop the status update thread after closing the storage writer so we
    # include the storage sync to disk in the status updates.
    self._StopStatusUpdateThread()

  self._StopProfiling()

  # Update the status view one last time before the analysis processes are
  # stopped.
  self._UpdateStatus()

  if queue_full:
    # TODO: handle abort on queue full more elegantly.
    abort_kill = True
  else:
    try:
      self._StopAnalysisProcesses(abort=self._abort)

    except KeyboardInterrupt:
      keyboard_interrupt = True
      abort_kill = True

  if abort_kill:
    self._AbortKill()

    # The abort can leave the main process unresponsive due to incorrectly
    # finalized IPC.
    self._KillProcess(os.getpid())

  try:
    self._StopTaskStorage(
        definitions.STORAGE_FORMAT_SQLITE, session.identifier,
        abort=self._abort)
  except (IOError, OSError) as exception:
    logger.error('Unable to stop task storage with error: {0!s}'.format(
        exception))

  if self._abort:
    logger.debug('Analysis aborted.')
  else:
    logger.debug('Analysis completed.')

  # Update the status view one last time.
  self._UpdateStatus()

  # Reset values.
  self._analysis_plugins = {}
  self._data_location = None
  self._event_filter_expression = None
  self._knowledge_base = None
  self._processing_configuration = None
  self._session = None
  self._status_update_callback = None
  self._storage_file_path = None

  if keyboard_interrupt:
    raise KeyboardInterrupt

  return self._processing_status
def _Main(self):
  """The main loop."""
  self._StartProfiling(self._processing_configuration.profiling)

  logger.debug('Analysis plugin: {0!s} (PID: {1:d}) started'.format(
      self._name, self._pid))

  # Creating the threading event in the constructor will cause a pickle
  # error on Windows when an analysis process is created.
  self._foreman_status_wait_event = threading.Event()
  self._status = definitions.STATUS_INDICATOR_ANALYZING

  task = tasks.Task()
  task.storage_format = definitions.STORAGE_FORMAT_SQLITE
  # TODO: temporary solution.
  task.identifier = self._analysis_plugin.plugin_name

  self._task = task

  task_storage_writer = self._storage_factory.CreateTaskStorageWriter(
      definitions.STORAGE_FORMAT_SQLITE)

  if self._serializers_profiler:
    task_storage_writer.SetSerializersProfiler(self._serializers_profiler)

  if self._storage_profiler:
    task_storage_writer.SetStorageProfiler(self._storage_profiler)

  storage_file_path = self._GetTaskStorageFilePath(
      definitions.STORAGE_FORMAT_SQLITE, task)
  task_storage_writer.Open(path=storage_file_path)

  self._analysis_mediator = self._CreateAnalysisMediator(
      self._session, self._knowledge_base, self._processing_configuration,
      self._data_location)
  self._analysis_mediator.SetStorageWriter(task_storage_writer)

  # TODO: set event_filter_expression in mediator.

  task_storage_writer.AddAttributeContainer(task)

  try:
    logger.debug('{0!s} (PID: {1:d}) started monitoring event queue.'.format(
        self._name, self._pid))

    while not self._abort:
      try:
        queued_object = self._event_queue.PopItem()

      except (errors.QueueClose, errors.QueueEmpty) as exception:
        logger.debug('ConsumeItems exiting with exception {0!s}.'.format(
            type(exception)))
        break

      if isinstance(queued_object, plaso_queue.QueueAbort):
        logger.debug('ConsumeItems exiting, dequeued QueueAbort object.')
        break

      self._ProcessEvent(self._analysis_mediator, *queued_object)

      self._number_of_consumed_events += 1

    logger.debug('{0!s} (PID: {1:d}) stopped monitoring event queue.'.format(
        self._name, self._pid))

    if not self._abort:
      self._status = definitions.STATUS_INDICATOR_REPORTING

      self._analysis_mediator.ProduceAnalysisReport(self._analysis_plugin)

  # All exceptions need to be caught here to prevent the process from being
  # killed by an uncaught exception.
  except Exception as exception:  # pylint: disable=broad-except
    logger.warning(
        'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
            self._name, self._pid))
    logger.exception(exception)

    self._abort = True

  finally:
    task.aborted = self._abort
    task_storage_writer.UpdateAttributeContainer(task)

    task_storage_writer.Close()

    if self._serializers_profiler:
      task_storage_writer.SetSerializersProfiler(None)

    if self._storage_profiler:
      task_storage_writer.SetStorageProfiler(None)

  try:
    self._FinalizeTaskStorageWriter(definitions.STORAGE_FORMAT_SQLITE, task)
  except IOError as exception:
    logger.warning('Unable to finalize task storage with error: {0!s}'.format(
        exception))

  if self._abort:
    self._status = definitions.STATUS_INDICATOR_ABORTED
  else:
    self._status = definitions.STATUS_INDICATOR_COMPLETED

  logger.debug('Wait for foreman status wait event')
  self._foreman_status_wait_event.clear()
  self._foreman_status_wait_event.wait(self._FOREMAN_STATUS_WAIT)

  logger.debug('Analysis plugin: {0!s} (PID: {1:d}) stopped'.format(
      self._name, self._pid))

  self._StopProfiling()

  self._analysis_mediator = None
  self._foreman_status_wait_event = None
  self._task = None

  try:
    self._event_queue.Close(abort=self._abort)
  except errors.QueueAlreadyClosed:
    logger.error('Queue for {0:s} was already closed.'.format(self.name))
def ProcessSources(
    self, source_configurations, storage_writer, session_identifier,
    processing_configuration, enable_sigsegv_handler=False,
    status_update_callback=None, storage_file_path=None):
  """Processes the sources and extracts events.

  Args:
    source_configurations (list[SourceConfigurationArtifact]): configurations
        of the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    session_identifier (str): the identifier of the session the tasks are
        part of.
    processing_configuration (ProcessingConfiguration): processing
        configuration.
    enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
        should be enabled.
    status_update_callback (Optional[function]): callback function for status
        updates.
    storage_file_path (Optional[str]): path to the session storage file.

  Returns:
    ProcessingStatus: processing status.
  """
  self._enable_sigsegv_handler = enable_sigsegv_handler

  # Keep track of certain values so we can spawn new extraction workers.
  self._processing_configuration = processing_configuration

  self._debug_output = processing_configuration.debug_output
  self._log_filename = processing_configuration.log_filename
  self._status_update_callback = status_update_callback
  self._storage_file_path = storage_file_path
  self._storage_writer = storage_writer
  self._task_storage_format = processing_configuration.task_storage_format

  # Set up the task queue.
  task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
      delay_open=True, linger_seconds=0, maximum_items=1,
      name='main_task_queue',
      timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
  self._task_queue = task_outbound_queue

  # The ZeroMQ backed queue must be started first, so we can save its port.
  # TODO: raises: attribute-defined-outside-init
  # self._task_queue.name = 'Task queue'
  self._task_queue.Open()
  self._task_queue_port = self._task_queue.port

  # Set up the task storage before the worker processes.
  self._StartTaskStorage(self._task_storage_format)

  for worker_number in range(self._number_of_worker_processes):
    process_name = 'Worker_{0:02d}'.format(self._last_worker_number)
    worker_process = self._StartWorkerProcess(process_name)
    if not worker_process:
      logger.error('Unable to create worker process: {0:d}'.format(
          worker_number))

  self._StartProfiling(self._processing_configuration.profiling)
  self._task_manager.StartProfiling(
      self._processing_configuration.profiling, self._name)

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(self._serializers_profiler)

  if self._storage_profiler:
    storage_writer.SetStorageProfiler(self._storage_profiler)

  self._StartStatusUpdateThread()

  self._parsers_counter = collections.Counter({
      parser_count.name: parser_count
      for parser_count in self._storage_writer.GetAttributeContainers(
          'parser_count')})

  try:
    self._ProcessSources(
        source_configurations, storage_writer, session_identifier)

  finally:
    # Stop the status update thread after closing the storage writer so we
    # include the storage sync to disk in the status updates.
    self._StopStatusUpdateThread()

    if self._serializers_profiler:
      storage_writer.SetSerializersProfiler(None)

    if self._storage_profiler:
      storage_writer.SetStorageProfiler(None)

    self._task_manager.StopProfiling()
    self._StopProfiling()

    try:
      self._StopExtractionProcesses(abort=self._abort)

    except KeyboardInterrupt:
      self._AbortKill()

      # The abort can leave the main process unresponsive due to incorrectly
      # finalized IPC.
      self._KillProcess(os.getpid())

    # The task queue should be closed by _StopExtractionProcesses, this
    # close is a failsafe.
    self._task_queue.Close(abort=True)

  if self._processing_status.error_path_specs:
    task_storage_abort = True
  else:
    task_storage_abort = self._abort

  try:
    self._StopTaskStorage(
        self._task_storage_format, session_identifier,
        abort=task_storage_abort)
  except (IOError, OSError) as exception:
    logger.error('Unable to stop task storage with error: {0!s}'.format(
        exception))

  if self._abort:
    logger.debug('Processing aborted.')
    self._processing_status.aborted = True
  else:
    logger.debug('Processing completed.')

  # Update the status view one last time.
  self._UpdateStatus()

  # Reset values.
  self._enable_sigsegv_handler = None
  self._processing_configuration = None
  self._status_update_callback = None
  self._storage_file_path = None
  self._storage_writer = None
  self._task_storage_format = None

  return self._processing_status
def _UpdateProcessingStatus(self, pid, process_status, used_memory):
  """Updates the processing status.

  Args:
    pid (int): process identifier (PID) of the worker process.
    process_status (dict[str, object]): status values received from the
        worker process.
    used_memory (int): size of used memory in bytes.

  Raises:
    KeyError: if the process is not registered with the engine.
  """
  self._RaiseIfNotRegistered(pid)

  if not process_status:
    return

  process = self._processes_per_pid[pid]

  processing_status = process_status.get('processing_status', None)

  self._RaiseIfNotMonitored(pid)

  display_name = process_status.get('display_name', '')

  number_of_consumed_event_tags = process_status.get(
      'number_of_consumed_event_tags', None)
  number_of_produced_event_tags = process_status.get(
      'number_of_produced_event_tags', None)

  number_of_consumed_events = process_status.get(
      'number_of_consumed_events', None)
  number_of_produced_events = process_status.get(
      'number_of_produced_events', None)

  number_of_consumed_extraction_warnings = process_status.get(
      'number_of_consumed_extraction_warnings', None)
  number_of_produced_extraction_warnings = process_status.get(
      'number_of_produced_extraction_warnings', None)

  number_of_consumed_reports = process_status.get(
      'number_of_consumed_reports', None)
  number_of_produced_reports = process_status.get(
      'number_of_produced_reports', None)

  number_of_consumed_sources = process_status.get(
      'number_of_consumed_sources', None)
  number_of_produced_sources = process_status.get(
      'number_of_produced_sources', None)

  if processing_status != definitions.STATUS_INDICATOR_IDLE:
    last_activity_timestamp = process_status.get(
        'last_activity_timestamp', 0.0)

    if last_activity_timestamp:
      last_activity_timestamp += self._worker_timeout

      current_timestamp = time.time()
      if current_timestamp > last_activity_timestamp:
        logger.error((
            'Process {0:s} (PID: {1:d}) has not reported activity within '
            'the timeout period.').format(process.name, pid))
        processing_status = definitions.STATUS_INDICATOR_NOT_RESPONDING

  self._processing_status.UpdateWorkerStatus(
      process.name, processing_status, pid, used_memory, display_name,
      number_of_consumed_sources, number_of_produced_sources,
      number_of_consumed_events, number_of_produced_events,
      number_of_consumed_event_tags, number_of_produced_event_tags,
      number_of_consumed_reports, number_of_produced_reports,
      number_of_consumed_extraction_warnings,
      number_of_produced_extraction_warnings)

  task_identifier = process_status.get('task_identifier', '')
  if not task_identifier:
    return

  try:
    self._task_manager.UpdateTaskAsProcessingByIdentifier(task_identifier)
    return
  except KeyError:
    logger.debug('Worker {0:s} is processing unknown task: {1:s}.'.format(
        process.name, task_identifier))
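
# A standalone sketch of the responsiveness check above: a non-idle worker
# that has not reported activity within the timeout window is flagged as not
# responding. The names and the timeout value here are illustrative
# stand-ins.
import time

WORKER_TIMEOUT = 10.0 * 60.0  # assumed timeout of ten minutes


def is_not_responding(last_activity_timestamp, is_idle):
  """Returns True if a busy worker missed its activity deadline."""
  if is_idle or not last_activity_timestamp:
    return False
  return time.time() > last_activity_timestamp + WORKER_TIMEOUT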
def _Main(self):
  """The main loop."""
  # We need a resolver context per process to prevent multi processing
  # issues with file objects stored in images.
  self._resolver_context = context.Context()

  for credential_configuration in self._processing_configuration.credentials:
    resolver.Resolver.key_chain.SetCredential(
        credential_configuration.path_spec,
        credential_configuration.credential_type,
        credential_configuration.credential_data)

  self._parser_mediator = self._CreateParserMediator(
      self._knowledge_base, self._resolver_context,
      self._processing_configuration)

  # We need to initialize the parser and hasher objects after the process
  # has forked, otherwise on Windows the "fork" will fail with a PickleError
  # for Python modules that cannot be pickled.
  self._extraction_worker = worker.EventExtractionWorker(
      parser_filter_expression=(
          self._processing_configuration.parser_filter_expression))

  self._extraction_worker.SetExtractionConfiguration(
      self._processing_configuration.extraction)

  self._parser_mediator.StartProfiling(
      self._processing_configuration.profiling, self._name,
      self._process_information)
  self._StartProfiling(self._processing_configuration.profiling)

  if self._analyzers_profiler:
    self._extraction_worker.SetAnalyzersProfiler(self._analyzers_profiler)

  if self._processing_profiler:
    self._extraction_worker.SetProcessingProfiler(self._processing_profiler)

  logger.debug('Worker: {0!s} (PID: {1:d}) started.'.format(
      self._name, self._pid))

  self._status = definitions.STATUS_INDICATOR_RUNNING

  try:
    logger.debug('{0!s} (PID: {1:d}) started monitoring task queue.'.format(
        self._name, self._pid))

    while not self._abort:
      try:
        task = self._task_queue.PopItem()
      except (errors.QueueClose, errors.QueueEmpty) as exception:
        logger.debug('ConsumeItems exiting with exception: {0!s}.'.format(
            type(exception)))
        break

      if isinstance(task, plaso_queue.QueueAbort):
        logger.debug('ConsumeItems exiting, dequeued QueueAbort object.')
        break

      self._ProcessTask(task)

    logger.debug('{0!s} (PID: {1:d}) stopped monitoring task queue.'.format(
        self._name, self._pid))

  # All exceptions need to be caught here to prevent the process from being
  # killed by an uncaught exception.
  except Exception as exception:  # pylint: disable=broad-except
    logger.warning(
        'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
            self._name, self._pid))
    logger.exception(exception)

    self._abort = True

  if self._analyzers_profiler:
    self._extraction_worker.SetAnalyzersProfiler(None)

  if self._processing_profiler:
    self._extraction_worker.SetProcessingProfiler(None)

  self._StopProfiling()
  self._parser_mediator.StopProfiling()

  self._extraction_worker = None
  self._file_system_cache = []
  self._parser_mediator = None
  self._resolver_context = None

  if self._abort:
    self._status = definitions.STATUS_INDICATOR_ABORTED
  else:
    self._status = definitions.STATUS_INDICATOR_COMPLETED

  logger.debug('Worker: {0!s} (PID: {1:d}) stopped.'.format(
      self._name, self._pid))

  try:
    self._task_queue.Close(abort=self._abort)
  except errors.QueueAlreadyClosed:
    logger.error('Queue for {0:s} was already closed.'.format(self.name))
def _ScheduleTasks(self, storage_writer, session_identifier):
  """Schedules tasks.

  Args:
    storage_writer (StorageWriter): storage writer for a session storage.
    session_identifier (str): the identifier of the session the tasks are
        part of.
  """
  logger.debug('Task scheduler started')

  self._status = definitions.STATUS_INDICATOR_RUNNING

  # TODO: make tasks persistent.

  # TODO: protect task scheduler loop by catch all and handle abort path.

  event_source_heap = _EventSourceHeap()

  self._FillEventSourceHeap(
      storage_writer, event_source_heap, start_with_first=True)

  event_source = event_source_heap.PopEventSource()

  task = None
  has_pending_tasks = True

  while event_source or has_pending_tasks:
    if self._abort:
      break

    try:
      if not task:
        task = self._task_manager.CreateRetryTask()

      if not task and event_source:
        task = self._task_manager.CreateTask(
            session_identifier, storage_format=self._task_storage_format)
        task.file_entry_type = event_source.file_entry_type
        task.path_spec = event_source.path_spec
        event_source = None

        self._number_of_consumed_sources += 1

      if task:
        if not self._ScheduleTask(task):
          self._task_manager.SampleTaskStatus(task, 'schedule_attempted')
        else:
          path_spec_string = self._GetPathSpecificationString(task.path_spec)
          logger.debug(
              'Scheduled task: {0:s} for path specification: {1:s}'.format(
                  task.identifier, path_spec_string.replace('\n', ' ')))

          self._task_manager.SampleTaskStatus(task, 'scheduled')
          task = None

      # Limit the number of attribute containers from a single task-based
      # storage file that are merged per loop to keep tasks flowing.
      merge_duration = time.time()

      number_of_containers = self._MergeTaskStorage(
          storage_writer, session_identifier,
          maximum_number_of_containers=self._maximum_number_of_containers)

      merge_duration = time.time() - merge_duration

      if merge_duration > 0.0 and number_of_containers > 0:
        containers_per_second = number_of_containers / merge_duration
        self._maximum_number_of_containers = int(0.5 * containers_per_second)

      if not event_source_heap.IsFull():
        self._FillEventSourceHeap(storage_writer, event_source_heap)
      else:
        logger.debug('Event source heap is full.')

      if not task and not event_source:
        event_source = event_source_heap.PopEventSource()

      has_pending_tasks = self._task_manager.HasPendingTasks()

    except KeyboardInterrupt:
      if self._debug_output:
        traceback.print_exc()
      self._abort = True

      self._processing_status.aborted = True
      if self._status_update_callback:
        self._status_update_callback(self._processing_status)

  for task in self._task_manager.GetFailedTasks():
    warning = warnings.ExtractionWarning(
        message='Worker failed to process path specification',
        path_spec=task.path_spec)
    self._storage_writer.AddAttributeContainer(warning)

    self._processing_status.error_path_specs.append(task.path_spec)

  self._status = definitions.STATUS_INDICATOR_IDLE

  if self._abort:
    logger.debug('Task scheduler aborted')
  else:
    logger.debug('Task scheduler stopped')
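
# A worked numeric example of the merge throttle above: if one pass merges
# 1000 containers in 0.25 seconds, the measured rate is 4000 containers per
# second, so the next pass is capped at half that, 2000 containers. A sketch
# of that calculation (the function name is illustrative):
def next_container_limit(number_of_containers, merge_duration):
  """Returns the next per-pass merge cap, or None if unmeasurable."""
  if merge_duration <= 0.0 or number_of_containers <= 0:
    return None
  containers_per_second = number_of_containers / merge_duration
  return int(0.5 * containers_per_second)


print(next_container_limit(1000, 0.25))  # 2000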
def _ProcessTask(self, task):
  """Processes a task.

  Args:
    task (Task): task.
  """
  logger.debug('Started processing task: {0:s}.'.format(task.identifier))

  if self._tasks_profiler:
    self._tasks_profiler.Sample(task, 'processing_started')

  task.storage_format = self._processing_configuration.task_storage_format

  self._task = task

  task_storage_writer = self._storage_factory.CreateTaskStorageWriter(
      self._processing_configuration.task_storage_format)

  if self._serializers_profiler:
    task_storage_writer.SetSerializersProfiler(self._serializers_profiler)

  if self._storage_profiler:
    task_storage_writer.SetStorageProfiler(self._storage_profiler)

  self._parser_mediator.SetStorageWriter(task_storage_writer)

  storage_file_path = self._GetTaskStorageFilePath(
      self._processing_configuration.task_storage_format, task)
  task_storage_writer.Open(
      path=storage_file_path, session_identifier=task.session_identifier,
      task_identifier=task.identifier)

  try:
    task_storage_writer.AddAttributeContainer(task)

    # TODO: add support for more task types.
    self._ProcessPathSpec(
        self._extraction_worker, self._parser_mediator, task.path_spec)
    self._number_of_consumed_sources += 1

  finally:
    task.aborted = self._abort
    task_storage_writer.UpdateAttributeContainer(task)

    task_storage_writer.Close()

    self._parser_mediator.SetStorageWriter(None)

  try:
    self._FinalizeTaskStorageWriter(
        self._processing_configuration.task_storage_format, task)
  except IOError:
    pass

  self._task = None

  if self._tasks_profiler:
    self._tasks_profiler.Sample(task, 'processing_completed')

  logger.debug('Completed processing task: {0:s}.'.format(task.identifier))