def _StopAnalysisProcesses(self, abort=False): """Stops the analysis processes. Args: abort (bool): True to indicated the stop is issued on abort. """ logger.debug('Stopping analysis processes.') self._StopMonitoringProcesses() if abort: # Signal all the processes to abort. self._AbortTerminate() # Wake the processes to make sure that they are not blocking # waiting for the queue new items. for event_queue in self._event_queues.values(): event_queue.PushItem(plaso_queue.QueueAbort(), block=False) # Try waiting for the processes to exit normally. self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) for event_queue in self._event_queues.values(): event_queue.Close(abort=abort) if abort: # Kill any remaining processes. self._AbortKill() else: # Check if the processes are still alive and terminate them if necessary. self._AbortTerminate() self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) for event_queue in self._event_queues.values(): event_queue.Close(abort=True)
def _StopMonitoringProcess(self, process): """Stops monitoring a process. Args: process (MultiProcessBaseProcess): process. Raises: KeyError: if the process is not monitored. ValueError: if the process is missing. """ if process is None: raise ValueError('Missing process.') pid = process.pid self._RaiseIfNotMonitored(pid) del self._process_information_per_pid[pid] rpc_client = self._rpc_clients_per_pid.get(pid, None) if rpc_client: rpc_client.Close() del self._rpc_clients_per_pid[pid] if pid in self._rpc_errors_per_pid: del self._rpc_errors_per_pid[pid] logger.debug('Stopped monitoring process: {0:s} (PID: {1:d})'.format( process.name, pid))
def _TimeoutTasks(self, tasks_for_timeout): """Checks for inactive tasks and marks such tasks as abandoned. Note that this method does not lock the manager and should be called by a method holding the manager lock. Args: tasks_for_timeout (dict[str, Task]): mapping of task identifiers to Tasks that will be checked for inactivity and marked as abandoned if required. """ if not tasks_for_timeout: return inactive_time = int(time.time() * 1000000) - self._TASK_INACTIVE_TIME for task_identifier, task in iter(tasks_for_timeout.items()): last_active_time = task.last_processing_time if not last_active_time: last_active_time = task.start_time if last_active_time < inactive_time: logger.debug('Task {0:s} is abandoned'.format(task_identifier)) self._tasks_abandoned[task_identifier] = task del tasks_for_timeout[task_identifier]
def _AbandonInactiveProcessingTasks(self): """Marks processing tasks that exceed the inactive time as abandoned. This method does not lock the manager and should be called by a method holding the manager lock. """ if self._tasks_processing: inactive_time = time.time() - self._TASK_INACTIVE_TIME inactive_time = int(inactive_time * definitions.MICROSECONDS_PER_SECOND) # Abandon all tasks after they're identified so as not to modify the # dict while iterating over it. tasks_to_abandon = [] for task_identifier, task in self._tasks_processing.items(): if task.last_processing_time < inactive_time: logger.debug('Abandoned processing task: {0:s}.'.format( task_identifier)) self.SampleTaskStatus(task, 'abandoned_processing') tasks_to_abandon.append((task_identifier, task)) for task_identifier, task in tasks_to_abandon: self._tasks_abandoned[task_identifier] = task del self._tasks_processing[task_identifier]
def CreateTask(self, session_identifier, storage_format=definitions.STORAGE_FORMAT_SQLITE): """Creates a task. Args: session_identifier (str): the identifier of the session the task is part of. storage_format (Optional[str]): the storage format that the task should be stored in. Returns: Task: task attribute container. """ task = tasks.Task(session_identifier) task.storage_format = storage_format logger.debug('Created task: {0:s}.'.format(task.identifier)) with self._lock: self._tasks_queued[task.identifier] = task self._total_number_of_tasks += 1 self.SampleTaskStatus(task, 'created') return task
def CreateRetryTask(self): """Creates a task that to retry a previously abandoned task. Returns: Task: a task that was abandoned but should be retried or None if there are no abandoned tasks that should be retried. """ with self._lock: abandoned_task = self._GetTaskPendingRetry() if not abandoned_task: return None # The abandoned task is kept in _tasks_abandoned so it can be still # identified in CheckTaskToMerge and UpdateTaskAsPendingMerge. retry_task = abandoned_task.CreateRetryTask() logger.debug('Retrying task {0:s} as {1:s}.'.format( abandoned_task.identifier, retry_task.identifier)) self._tasks_queued[retry_task.identifier] = retry_task self._total_number_of_tasks += 1 self.SampleTaskStatus(retry_task, 'created_retry') return retry_task
def _StartWorkerProcess(self, process_name, storage_writer): """Creates, starts, monitors and registers a worker process. Args: process_name (str): process name. storage_writer (StorageWriter): storage writer for a session storage used to create task storage. Returns: MultiProcessWorkerProcess: extraction worker process or None if the process could not be started. """ process_name = 'Worker_{0:02d}'.format(self._last_worker_number) logger.debug('Starting worker process {0:s}'.format(process_name)) if self._use_zeromq: queue_name = '{0:s} task queue'.format(process_name) task_queue = zeromq_queue.ZeroMQRequestConnectQueue( delay_open=True, linger_seconds=0, name=queue_name, port=self._task_queue_port, timeout_seconds=self._TASK_QUEUE_TIMEOUT_SECONDS) else: task_queue = self._task_queue process = worker_process.WorkerProcess( task_queue, storage_writer, self.knowledge_base, self._session_identifier, self._processing_configuration, enable_sigsegv_handler=self._enable_sigsegv_handler, name=process_name) # Remove all possible log handlers to prevent a child process from logging # to the main process log file and garbling the log. The log handlers are # recreated after the worker process has been started. for handler in logging.root.handlers: logging.root.removeHandler(handler) handler.close() process.start() loggers.ConfigureLogging( debug_output=self._debug_output, filename=self._log_filename, mode='a', quiet_mode=self._quiet_mode) try: self._StartMonitoringProcess(process) except (IOError, KeyError) as exception: pid = process.pid logger.error(( 'Unable to monitor replacement worker process: {0:s} ' '(PID: {1:d}) with error: {2!s}').format( process_name, pid, exception)) self._TerminateProcess(process) return None self._RegisterProcess(process) self._last_worker_number += 1 return process
def _AbandonQueuedTasks(self): """Marks queued tasks abandoned. This method does not lock the manager and should be called by a method holding the manager lock. """ for task_identifier, task in iter(self._tasks_queued.items()): logger.debug('Abandoned queued task: {0:s}.'.format(task_identifier)) self._tasks_abandoned[task_identifier] = task del self._tasks_queued[task_identifier]
def _ProcessTask(self, task): """Processes a task. Args: task (Task): task. """ logger.debug('Started processing task: {0:s}.'.format(task.identifier)) if self._tasks_profiler: self._tasks_profiler.Sample(task, 'processing_started') self._task = task task_storage_writer = self._storage_writer.CreateTaskStorage( task, self._processing_configuration.task_storage_format) if self._serializers_profiler: task_storage_writer.SetSerializersProfiler( self._serializers_profiler) task_storage_writer.Open() self._parser_mediator.SetStorageWriter(task_storage_writer) task_storage_writer.WriteTaskStart() try: # TODO: add support for more task types. self._ProcessPathSpec(self._extraction_worker, self._parser_mediator, task.path_spec) self._number_of_consumed_sources += 1 if self._guppy_memory_profiler: self._guppy_memory_profiler.Sample() finally: task_storage_writer.WriteTaskCompletion(aborted=self._abort) self._parser_mediator.SetStorageWriter(None) task_storage_writer.Close() try: self._storage_writer.FinalizeTaskStorage(task) except IOError: pass self._task = None if self._tasks_profiler: self._tasks_profiler.Sample(task, 'processing_completed') logger.debug('Completed processing task: {0:s}.'.format( task.identifier))
def _AbandonQueuedTasks(self): """Marks queued tasks abandoned. This method does not lock the manager and should be called by a method holding the manager lock. """ for task_identifier, task in iter(self._tasks_queued.items()): logger.debug( 'Abandoned queued task: {0:s}.'.format(task_identifier)) self._tasks_abandoned[task_identifier] = task del self._tasks_queued[task_identifier]
def _ProcessTask(self, task): """Processes a task. Args: task (Task): task. """ logger.debug('Started processing task: {0:s}.'.format(task.identifier)) if self._tasks_profiler: self._tasks_profiler.Sample(task, 'processing_started') self._task = task storage_writer = self._storage_writer.CreateTaskStorage(task) if self._serializers_profiler: storage_writer.SetSerializersProfiler(self._serializers_profiler) storage_writer.Open() self._parser_mediator.SetStorageWriter(storage_writer) storage_writer.WriteTaskStart() try: # TODO: add support for more task types. self._ProcessPathSpec( self._extraction_worker, self._parser_mediator, task.path_spec) self._number_of_consumed_sources += 1 if self._guppy_memory_profiler: self._guppy_memory_profiler.Sample() finally: storage_writer.WriteTaskCompletion(aborted=self._abort) self._parser_mediator.SetStorageWriter(None) storage_writer.Close() try: self._storage_writer.FinalizeTaskStorage(task) except IOError: pass self._task = None if self._tasks_profiler: self._tasks_profiler.Sample(task, 'processing_completed') logger.debug('Completed processing task: {0:s}.'.format(task.identifier))
def _AbortJoin(self, timeout=None): """Aborts all registered processes by joining with the parent process. Args: timeout (int): number of seconds to wait for processes to join, where None represents no timeout. """ for pid, process in iter(self._processes_per_pid.items()): logger.debug('Waiting for process: {0:s} (PID: {1:d}).'.format( process.name, pid)) process.join(timeout=timeout) if not process.is_alive(): logger.debug('Process {0:s} (PID: {1:d}) stopped.'.format( process.name, pid))
def CompleteTask(self, task): """Completes a task. The task is complete and can be removed from the task manager. Args: task (Task): task. """ with self._lock: if task.identifier in self._tasks_merging: del self._tasks_merging[task.identifier] logger.debug('Task {0:s} is complete.'.format(task.identifier)) if task.identifier in self._tasks_pending_merge: logger.debug( 'Task {0:s} completed while pending merge.'.format( task.identifier)) return if task.identifier in self._tasks_processing: del self._tasks_processing[task.identifier] logger.debug('Task {0:s} completed from processing.'.format( task.identifier)) return if task.identifier in self._tasks_queued: del self._tasks_queued[task.identifier] logger.debug('Task {0:s} is completed from queued.'.format( task.identifier)) return
def HasPendingTasks(self): """Determines if there are tasks running or in need of retrying. Returns: bool: True if there are tasks that are active, ready to be merged or need to be retried. """ logger.debug('Checking for pending tasks') with self._lock: self._AbandonInactiveProcessingTasks() if self._tasks_processing: return True # There are no tasks being processed, but we might be # waiting for some tasks to be merged. if self._HasTasksPendingMerge(): return True # There are no tasks processing or pending merge, but there may # still be some waiting to be retried, so we check that. if self._HasTasksPendingRetry(): return True # It is possible that a worker has processed a task and the foreman has # not been informed about it, since there is no feedback from the worker # when it pops a task from the queue. # If we believe all the workers are idle for longer than the task # inactive time (timeout) abandon all queued tasks. This ensures # that processing actually stops when the foreman never gets an # update from a worker. if self._tasks_queued: inactive_time = time.time() - self._TASK_INACTIVE_TIME inactive_time = int(inactive_time * definitions.MICROSECONDS_PER_SECOND) if self._latest_task_processing_time < inactive_time: self._AbandonQueuedTasks() if self._tasks_queued: return True if self._tasks_merging: return True # There are no tasks pending any work. return False
def _StopProcessStatusRPCServer(self): """Stops the process status RPC server.""" if not self._rpc_server: return # Make sure the engine gets one more status update so it knows # the worker has completed. self._WaitForStatusNotRunning() self._rpc_server.Stop() self._rpc_server = None self.rpc_port.value = 0 logger.debug('Process: {0!s} process status RPC server stopped'.format( self._name))
def _StopProcessStatusRPCServer(self): """Stops the process status RPC server.""" if not self._rpc_server: return # Make sure the engine gets one more status update so it knows # the worker has completed. self._WaitForStatusNotRunning() self._rpc_server.Stop() self._rpc_server = None self.rpc_port.value = 0 logger.debug( 'Process: {0!s} process status RPC server stopped'.format(self._name))
def _AbandonQueuedTasks(self): """Marks queued tasks abandoned. This method does not lock the manager and should be called by a method holding the manager lock. """ # Abandon all tasks after they're identified so as not to modify the # dict while iterating over it. tasks_to_abandon = [] for task_identifier, task in iter(self._tasks_queued.items()): logger.debug('Abandoned queued task: {0:s}.'.format(task_identifier)) tasks_to_abandon.append((task_identifier, task)) for task_identifier, task in tasks_to_abandon: self._tasks_abandoned[task_identifier] = task del self._tasks_queued[task_identifier]
def UpdateTaskAsProcessingByIdentifier(self, task_identifier): """Updates the task manager to reflect the task is processing. Args: task_identifier (str): unique identifier of the task. Raises: KeyError: if the task is not known to the task manager. """ with self._lock: task_processing = self._tasks_processing.get(task_identifier, None) if task_processing: task_processing.UpdateProcessingTime() self._UpdateLatestProcessingTime(task_processing) return task_queued = self._tasks_queued.get(task_identifier, None) if task_queued: logger.debug('Task {0:s} was queued, now processing.'.format( task_identifier)) self._tasks_processing[task_identifier] = task_queued del self._tasks_queued[task_identifier] task_queued.UpdateProcessingTime() self._UpdateLatestProcessingTime(task_queued) return task_abandoned = self._tasks_abandoned.get(task_identifier, None) if task_abandoned: del self._tasks_abandoned[task_identifier] self._tasks_processing[task_identifier] = task_abandoned logger.debug( 'Task {0:s} was abandoned, but now processing.'.format( task_identifier)) task_abandoned.UpdateProcessingTime() self._UpdateLatestProcessingTime(task_abandoned) return if task_identifier in self._tasks_pending_merge: # No need to update the processing time, as this task is already # finished processing and is just waiting for merge. return # If we get here, we don't know what state the tasks is in, so raise. raise KeyError( 'Status of task {0:s} is unknown.'.format(task_identifier))
def _RunProcess(self): """Runs the process.""" # Prevent the KeyboardInterrupt being raised inside the process. # This will prevent a process from generating a traceback when interrupted. signal.signal(signal.SIGINT, signal.SIG_IGN) # A SIGTERM signal handler is necessary to make sure IPC is cleaned up # correctly on terminate. signal.signal(signal.SIGTERM, self._SigTermHandler) # A SIGSEGV signal handler is necessary to try to indicate where # worker failed. # WARNING the SIGSEGV handler will deadlock the process on a real segfault. if self._enable_sigsegv_handler: self._original_sigsegv_handler = signal.signal( signal.SIGSEGV, self._SigSegvHandler) self._pid = os.getpid() self._process_information = process_info.ProcessInfo(self._pid) # We need to set the is running status explicitly to True in case # the process completes before the engine is able to determine # the status of the process, such as in the unit tests. self._status_is_running = True # Logging needs to be configured before the first output otherwise we # mess up the logging of the parent process. loggers.ConfigureLogging(debug_output=self._debug_output, filename=self._log_filename, quiet_mode=self._quiet_mode) logger.debug('Process: {0!s} (PID: {1:d}) started'.format( self._name, self._pid)) self._StartProcessStatusRPCServer() self._Main() self._StopProcessStatusRPCServer() logger.debug('Process: {0!s} (PID: {1:d}) stopped'.format( self._name, self._pid)) # Make sure log files are cleanly closed. logging.shutdown() self._status_is_running = False
def run(self): """Runs the process.""" # Prevent the KeyboardInterrupt being raised inside the process. # This will prevent a process from generating a traceback when interrupted. signal.signal(signal.SIGINT, signal.SIG_IGN) # A SIGTERM signal handler is necessary to make sure IPC is cleaned up # correctly on terminate. signal.signal(signal.SIGTERM, self._SigTermHandler) # A SIGSEGV signal handler is necessary to try to indicate where # worker failed. # WARNING the SIGSEGV handler will deadlock the process on a real segfault. if self._enable_sigsegv_handler: self._original_sigsegv_handler = signal.signal( signal.SIGSEGV, self._SigSegvHandler) self._pid = os.getpid() self._process_information = process_info.ProcessInfo(self._pid) # We need to set the is running status explicitly to True in case # the process completes before the engine is able to determine # the status of the process, e.g. in the unit tests. self._status_is_running = True # Logging needs to be configured before the first output otherwise we # mess up the logging of the parent process. loggers.ConfigureLogging( debug_output=self._debug_output, filename=self._log_filename, quiet_mode=self._quiet_mode) logger.debug( 'Process: {0!s} (PID: {1:d}) started'.format(self._name, self._pid)) self._StartProcessStatusRPCServer() self._Main() self._StopProcessStatusRPCServer() logger.debug( 'Process: {0!s} (PID: {1:d}) stopped'.format(self._name, self._pid)) # Make sure log files are cleanly closed. logging.shutdown() self._status_is_running = False
def CreateTask(self, session_identifier): """Creates a task. Args: session_identifier (str): the identifier of the session the task is part of. Returns: Task: task attribute container. """ task = tasks.Task(session_identifier) logger.debug('Created task: {0:s}.'.format(task.identifier)) with self._lock: self._QueueTask(task) return task
def UpdateTaskAsProcessingByIdentifier(self, task_identifier): """Updates the task manager to reflect the task is processing. Args: task_identifier (str): unique identifier of the task. Raises: KeyError: if the task is not known to the task manager. """ with self._lock: task_processing = self._tasks_processing.get(task_identifier, None) if task_processing: task_processing.UpdateProcessingTime() self._UpdateLatestProcessingTime(task_processing) return task_queued = self._tasks_queued.get(task_identifier, None) if task_queued: logger.debug('Task {0:s} was queued, now processing.'.format( task_identifier)) self._tasks_processing[task_identifier] = task_queued del self._tasks_queued[task_identifier] task_queued.UpdateProcessingTime() self._UpdateLatestProcessingTime(task_queued) return task_abandoned = self._tasks_abandoned.get(task_identifier, None) if task_abandoned: del self._tasks_abandoned[task_identifier] self._tasks_processing[task_identifier] = task_abandoned logger.debug('Task {0:s} was abandoned, but now processing.'.format( task_identifier)) task_abandoned.UpdateProcessingTime() self._UpdateLatestProcessingTime(task_abandoned) return if task_identifier in self._tasks_pending_merge: # No need to update the processing time, as this task is already # finished processing and is just waiting for merge. return # If we get here, we don't know what state the tasks is in, so raise. raise KeyError('Status of task {0:s} is unknown.'.format(task_identifier))
def CreateRetryTask(self): """Creates a task that to retry a previously abandoned task. Returns: Task: a task that was abandoned but should be retried or None if there are no abandoned tasks that should be retried. """ with self._lock: abandoned_task = self._GetTaskPendingRetry() if not abandoned_task: return None retry_task = abandoned_task.CreateRetry() logger.debug('Retrying task {0:s} as {1:s}.'.format( abandoned_task.identifier, retry_task.identifier)) self._QueueTask(retry_task) return retry_task
def _AbandonInactiveProcessingTasks(self): """Marks processing tasks that exceed the inactive time as abandoned. This method does not lock the manager and should be called by a method holding the manager lock. """ if self._tasks_processing: inactive_time = time.time() - self._TASK_INACTIVE_TIME inactive_time = int(inactive_time * definitions.MICROSECONDS_PER_SECOND) for task_identifier, task in iter(self._tasks_processing.items()): if task.last_processing_time < inactive_time: logger.debug('Abandoned processing task: {0:s}.'.format( task_identifier)) self._tasks_abandoned[task_identifier] = task del self._tasks_processing[task_identifier]
def _StopAnalysisProcesses(self, abort=False): """Stops the analysis processes. Args: abort (bool): True to indicated the stop is issued on abort. """ logger.debug('Stopping analysis processes.') self._StopMonitoringProcesses() # Note that multiprocessing.Queue is very sensitive regarding # blocking on either a get or a put. So we try to prevent using # any blocking behavior. if abort: # Signal all the processes to abort. self._AbortTerminate() if not self._use_zeromq: logger.debug('Emptying queues.') for event_queue in self._event_queues.values(): event_queue.Empty() # Wake the processes to make sure that they are not blocking # waiting for the queue new items. for event_queue in self._event_queues.values(): event_queue.PushItem(plaso_queue.QueueAbort(), block=False) # Try waiting for the processes to exit normally. self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) for event_queue in self._event_queues.values(): event_queue.Close(abort=abort) if abort: # Kill any remaining processes. self._AbortKill() else: # Check if the processes are still alive and terminate them if necessary. self._AbortTerminate() self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) for event_queue in self._event_queues.values(): event_queue.Close(abort=True)
def _StopExtractionProcesses(self, abort=False): """Stops the extraction processes. Args: abort (bool): True to indicated the stop is issued on abort. """ logger.debug('Stopping extraction processes.') self._StopMonitoringProcesses() # Note that multiprocessing.Queue is very sensitive regarding # blocking on either a get or a put. So we try to prevent using # any blocking behavior. if abort: # Signal all the processes to abort. self._AbortTerminate() logger.debug('Emptying task queue.') self._task_queue.Empty() # Wake the processes to make sure that they are not blocking # waiting for the queue new items. for _ in self._processes_per_pid: try: self._task_queue.PushItem(plaso_queue.QueueAbort(), block=False) except errors.QueueFull: logger.warning( 'Task queue full, unable to push abort message.') # Try waiting for the processes to exit normally. self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) self._task_queue.Close(abort=abort) if not abort: # Check if the processes are still alive and terminate them if necessary. self._AbortTerminate() self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) self._task_queue.Close(abort=True) # Kill any lingering processes. self._AbortKill()
def _FillEventSourceHeap(self, storage_writer, event_source_heap, start_with_first=False): """Fills the event source heap with the available written event sources. Args: storage_writer (StorageWriter): storage writer for a session storage. event_source_heap (_EventSourceHeap): event source heap. start_with_first (Optional[bool]): True if the function should start with the first written event source. """ if self._processing_profiler: self._processing_profiler.StartTiming('fill_event_source_heap') if self._processing_profiler: self._processing_profiler.StartTiming('get_event_source') if start_with_first: event_source = storage_writer.GetFirstWrittenEventSource() else: event_source = storage_writer.GetNextWrittenEventSource() if self._processing_profiler: self._processing_profiler.StopTiming('get_event_source') while event_source: event_source_heap.PushEventSource(event_source) if event_source_heap.IsFull(): logger.debug('Source heap is full.') break if self._processing_profiler: self._processing_profiler.StartTiming('get_event_source') event_source = storage_writer.GetNextWrittenEventSource() if self._processing_profiler: self._processing_profiler.StopTiming('get_event_source') if self._processing_profiler: self._processing_profiler.StopTiming('fill_event_source_heap')
def CompleteTask(self, task): """Completes a task. The task is complete and can be removed from the task manager. Args: task (Task): task. Raises: KeyError: if the task was not merging. """ with self._lock: if task.identifier not in self._tasks_merging: raise KeyError('Task {0:s} was not merging.'.format(task.identifier)) self.SampleTaskStatus(task, 'completed') del self._tasks_merging[task.identifier] logger.debug('Completed task {0:s}.'.format(task.identifier))
def CreateTask(self, session_identifier): """Creates a task. Args: session_identifier (str): the identifier of the session the task is part of. Returns: Task: task attribute container. """ task = tasks.Task(session_identifier) logger.debug('Created task: {0:s}.'.format(task.identifier)) with self._lock: self._tasks_queued[task.identifier] = task self._total_number_of_tasks += 1 self.SampleTaskStatus(task, 'created') return task
def GetRetryTask(self): """Creates a task that is an attempt to retry an abandoned task. Returns: Task: a task that is a retry of an existing task or None if there are no tasks that need to be retried. """ with self._lock: for abandoned_task in self._tasks_abandoned.values(): # Only retry abandoned tasks that are yet to be retried and # are not themselves retries of another task. if self._TaskIsRetriable(abandoned_task): retry_task = abandoned_task.CreateRetry() logger.debug( 'Retrying task {0:s} as {1:s}'.format( abandoned_task.identifier, retry_task.identifier)) self._tasks_queued[retry_task.identifier] = retry_task self._total_number_of_tasks += 1 return retry_task return None
def _StopExtractionProcesses(self, abort=False): """Stops the extraction processes. Args: abort (bool): True to indicated the stop is issued on abort. """ logger.debug('Stopping extraction processes.') self._StopMonitoringProcesses() # Note that multiprocessing.Queue is very sensitive regarding # blocking on either a get or a put. So we try to prevent using # any blocking behavior. if abort: # Signal all the processes to abort. self._AbortTerminate() logger.debug('Emptying task queue.') self._task_queue.Empty() # Wake the processes to make sure that they are not blocking # waiting for the queue new items. for _ in self._processes_per_pid: try: self._task_queue.PushItem(plaso_queue.QueueAbort(), block=False) except errors.QueueFull: logger.warning('Task queue full, unable to push abort message.') # Try waiting for the processes to exit normally. self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) self._task_queue.Close(abort=abort) if not abort: # Check if the processes are still alive and terminate them if necessary. self._AbortTerminate() self._AbortJoin(timeout=self._PROCESS_JOIN_TIMEOUT) self._task_queue.Close(abort=True) # Kill any lingering processes. self._AbortKill()
def RemoveTask(self, task): """Removes an abandoned task. Args: task (Task): task. Raises: KeyError: if the task was not abandoned or the task was abandoned and was not retried. """ with self._lock: if task.identifier not in self._tasks_abandoned: raise KeyError('Task {0:s} was not abandoned.'.format(task.identifier)) if not task.has_retry: raise KeyError( 'Will not remove a task {0:s} without retry task.'.format( task.identifier)) del self._tasks_abandoned[task.identifier] logger.debug('Removed task {0:s}.'.format(task.identifier))
def UpdateTaskAsPendingMerge(self, task): """Updates the task manager to reflect the task is ready to be merged. Args: task (Task): task. Raises: KeyError: if the task was not processing or abandoned. """ with self._lock: is_processing = task.identifier in self._tasks_processing is_abandoned = task.identifier in self._tasks_abandoned is_queued = task.identifier in self._tasks_queued if not (is_queued or is_abandoned or is_processing): raise KeyError('Status of task {0:s} is unknown.'.format( task.identifier)) self._tasks_pending_merge.PushTask(task) task.UpdateProcessingTime() self._UpdateLastestProcessingTime(task) if is_queued: del self._tasks_queued[task.identifier] if is_processing: del self._tasks_processing[task.identifier] if is_abandoned: del self._tasks_abandoned[task.identifier] if is_abandoned: logger.warning( 'Previously abandoned task {0:s} is now pending merge.'.format( task.identifier)) else: logger.debug('Task {0:s} is pending merge.'.format( task.identifier))
def _StartProcessStatusRPCServer(self): """Starts the process status RPC server.""" if self._rpc_server: return self._rpc_server = plaso_xmlrpc.XMLProcessStatusRPCServer( self._GetStatus) hostname = 'localhost' # Try the PID as port number first otherwise pick something random # between 1024 and 60000. if self._pid < 1024 or self._pid > 60000: port = random.randint(1024, 60000) else: port = self._pid if not self._rpc_server.Start(hostname, port): port = 0 for _ in range(self._NUMBER_OF_RPC_SERVER_START_ATTEMPTS): port = random.randint(1024, 60000) if self._rpc_server.Start(hostname, port): break port = 0 if not port: logger.error( ('Unable to start a process status RPC server for {0!s} ' '(PID: {1:d})').format(self._name, self._pid)) self._rpc_server = None return self.rpc_port.value = port logger.debug('Process: {0!s} process status RPC server started'.format( self._name))
def _AbandonInactiveProcessingTasks(self): """Marks processing tasks that exceed the inactive time as abandoned. This method does not lock the manager and should be called by a method holding the manager lock. """ if self._tasks_processing: inactive_time = time.time() - self._TASK_INACTIVE_TIME inactive_time = int(inactive_time * definitions.MICROSECONDS_PER_SECOND) # Abandon all tasks after they're identified so as not to modify the # dict while iterating over it. tasks_to_abandon = [] for task_identifier, task in iter(self._tasks_processing.items()): if task.last_processing_time < inactive_time: logger.debug('Abandoned processing task: {0:s}.'.format( task_identifier)) self.SampleTaskStatus(task, 'abandoned_processing') tasks_to_abandon.append((task_identifier, task)) for task_identifier, task in tasks_to_abandon: self._tasks_abandoned[task_identifier] = task del self._tasks_processing[task_identifier]
def _StartProcessStatusRPCServer(self): """Starts the process status RPC server.""" if self._rpc_server: return self._rpc_server = plaso_xmlrpc.XMLProcessStatusRPCServer(self._GetStatus) hostname = 'localhost' # Try the PID as port number first otherwise pick something random # between 1024 and 60000. if self._pid < 1024 or self._pid > 60000: port = random.randint(1024, 60000) else: port = self._pid if not self._rpc_server.Start(hostname, port): port = 0 for _ in range(self._NUMBER_OF_RPC_SERVER_START_ATTEMPTS): port = random.randint(1024, 60000) if self._rpc_server.Start(hostname, port): break port = 0 if not port: logger.error(( 'Unable to start a process status RPC server for {0!s} ' '(PID: {1:d})').format(self._name, self._pid)) self._rpc_server = None return self.rpc_port.value = port logger.debug( 'Process: {0!s} process status RPC server started'.format(self._name))
def UpdateTaskAsPendingMerge(self, task): """Updates the task manager to reflect that the task is ready to be merged. Args: task (Task): task. Raises: KeyError: if the task was not queued, processing or abandoned, or the task was abandoned and has a retry task. """ with self._lock: is_abandoned = task.identifier in self._tasks_abandoned is_processing = task.identifier in self._tasks_processing is_queued = task.identifier in self._tasks_queued if not is_queued and not is_processing and not is_abandoned: raise KeyError('Status of task {0:s} is unknown.'.format( task.identifier)) if is_abandoned and task.has_retry: raise KeyError( 'Will not merge a task {0:s} with retry task.'.format( task.identifier)) if is_queued: logger.debug('Task {0:s} was queued, now merging.'.format( task.identifier)) del self._tasks_queued[task.identifier] if is_processing: logger.debug('Task {0:s} was processing, now merging.'.format( task.identifier)) del self._tasks_processing[task.identifier] if is_abandoned: logger.debug('Task {0:s} was abandoned, now merging.'.format( task.identifier)) del self._tasks_abandoned[task.identifier] self._tasks_pending_merge.PushTask(task) self.SampleTaskStatus(task, 'pending_merge') task.UpdateProcessingTime() self._UpdateLatestProcessingTime(task)
def UpdateTaskAsPendingMerge(self, task): """Updates the task manager to reflect the task is ready to be merged. Args: task (Task): task. Raises: KeyError: if the task was not queued, processing or abandoned, or the task was abandoned and has a retry task. """ with self._lock: is_abandoned = task.identifier in self._tasks_abandoned is_processing = task.identifier in self._tasks_processing is_queued = task.identifier in self._tasks_queued if not is_queued and not is_processing and not is_abandoned: raise KeyError('Status of task {0:s} is unknown.'.format( task.identifier)) if is_abandoned and task.has_retry: raise KeyError('Will not merge a task {0:s} with retry task.'.format( task.identifier)) if is_queued: logger.debug('Task {0:s} was queued, now merging.'.format( task.identifier)) del self._tasks_queued[task.identifier] if is_processing: logger.debug('Task {0:s} was processing, now merging.'.format( task.identifier)) del self._tasks_processing[task.identifier] if is_abandoned: logger.debug('Task {0:s} was abandoned, now merging.'.format( task.identifier)) del self._tasks_abandoned[task.identifier] self._tasks_pending_merge.PushTask(task) self.SampleTaskStatus(task, 'pending_merge') task.UpdateProcessingTime() self._UpdateLatestProcessingTime(task)
def _Main(self): """The main loop.""" # We need a resolver context per process to prevent multi processing # issues with file objects stored in images. resolver_context = context.Context() for credential_configuration in self._processing_configuration.credentials: resolver.Resolver.key_chain.SetCredential( credential_configuration.path_spec, credential_configuration.credential_type, credential_configuration.credential_data) self._parser_mediator = parsers_mediator.ParserMediator( None, self._knowledge_base, artifacts_filter_helper=self._artifacts_filter_helper, preferred_year=self._processing_configuration.preferred_year, resolver_context=resolver_context, temporary_directory=self._processing_configuration.temporary_directory) self._parser_mediator.SetEventExtractionConfiguration( self._processing_configuration.event_extraction) self._parser_mediator.SetInputSourceConfiguration( self._processing_configuration.input_source) # We need to initialize the parser and hasher objects after the process # has forked otherwise on Windows the "fork" will fail with # a PickleError for Python modules that cannot be pickled. self._extraction_worker = worker.EventExtractionWorker( parser_filter_expression=( self._processing_configuration.parser_filter_expression)) self._extraction_worker.SetExtractionConfiguration( self._processing_configuration.extraction) self._parser_mediator.StartProfiling( self._processing_configuration.profiling, self._name, self._process_information) self._StartProfiling(self._processing_configuration.profiling) if self._processing_profiler: self._extraction_worker.SetProcessingProfiler(self._processing_profiler) if self._serializers_profiler: self._storage_writer.SetSerializersProfiler(self._serializers_profiler) if self._storage_profiler: self._storage_writer.SetStorageProfiler(self._storage_profiler) logger.debug('Worker: {0!s} (PID: {1:d}) started.'.format( self._name, self._pid)) self._status = definitions.STATUS_INDICATOR_RUNNING try: logger.debug('{0!s} (PID: {1:d}) started monitoring task queue.'.format( self._name, self._pid)) while not self._abort: try: task = self._task_queue.PopItem() except (errors.QueueClose, errors.QueueEmpty) as exception: logger.debug('ConsumeItems exiting with exception {0:s}.'.format( type(exception))) break if isinstance(task, plaso_queue.QueueAbort): logger.debug('ConsumeItems exiting, dequeued QueueAbort object.') break self._ProcessTask(task) logger.debug('{0!s} (PID: {1:d}) stopped monitoring task queue.'.format( self._name, self._pid)) # All exceptions need to be caught here to prevent the process # from being killed by an uncaught exception. except Exception as exception: # pylint: disable=broad-except logger.warning( 'Unhandled exception in process: {0!s} (PID: {1:d}).'.format( self._name, self._pid)) logger.exception(exception) self._abort = True if self._processing_profiler: self._extraction_worker.SetProcessingProfiler(None) if self._serializers_profiler: self._storage_writer.SetSerializersProfiler(None) if self._storage_profiler: self._storage_writer.SetStorageProfiler(None) self._StopProfiling() self._parser_mediator.StopProfiling() self._extraction_worker = None self._parser_mediator = None self._storage_writer = None if self._abort: self._status = definitions.STATUS_INDICATOR_ABORTED else: self._status = definitions.STATUS_INDICATOR_COMPLETED logger.debug('Worker: {0!s} (PID: {1:d}) stopped.'.format( self._name, self._pid)) try: self._task_queue.Close(abort=self._abort) except errors.QueueAlreadyClosed: logger.error('Queue for {0:s} was already closed.'.format(self.name))
def _ScheduleTasks(self, storage_writer): """Schedules tasks. Args: storage_writer (StorageWriter): storage writer for a session storage. """ logger.debug('Task scheduler started') self._status = definitions.STATUS_INDICATOR_RUNNING # TODO: make tasks persistent. # TODO: protect task scheduler loop by catch all and # handle abort path. event_source_heap = _EventSourceHeap() self._FillEventSourceHeap( storage_writer, event_source_heap, start_with_first=True) event_source = event_source_heap.PopEventSource() task = None while event_source or self._task_manager.HasPendingTasks(): if self._abort: break try: if not task: task = self._task_manager.CreateRetryTask() if not task and event_source: task = self._task_manager.CreateTask(self._session_identifier) task.file_entry_type = event_source.file_entry_type task.path_spec = event_source.path_spec event_source = None self._number_of_consumed_sources += 1 if self._guppy_memory_profiler: self._guppy_memory_profiler.Sample() if task: if self._ScheduleTask(task): logger.debug( 'Scheduled task {0:s} for path specification {1:s}'.format( task.identifier, task.path_spec.comparable)) self._task_manager.SampleTaskStatus(task, 'scheduled') task = None else: self._task_manager.SampleTaskStatus(task, 'schedule_attempted') self._MergeTaskStorage(storage_writer) if not event_source_heap.IsFull(): self._FillEventSourceHeap(storage_writer, event_source_heap) if not task and not event_source: event_source = event_source_heap.PopEventSource() except KeyboardInterrupt: self._abort = True self._processing_status.aborted = True if self._status_update_callback: self._status_update_callback(self._processing_status) for task in self._task_manager.GetFailedTasks(): warning = warnings.ExtractionWarning( message='Worker failed to process path specification', path_spec=task.path_spec) self._storage_writer.AddWarning(warning) self._processing_status.error_path_specs.append(task.path_spec) self._status = definitions.STATUS_INDICATOR_IDLE if self._abort: logger.debug('Task scheduler aborted') else: logger.debug('Task scheduler stopped')
def _Main(self): """The main loop.""" self._StartProfiling(self._processing_configuration.profiling) if self._serializers_profiler: self._storage_writer.SetSerializersProfiler(self._serializers_profiler) if self._storage_profiler: self._storage_writer.SetStorageProfiler(self._storage_profiler) logger.debug('Analysis plugin: {0!s} (PID: {1:d}) started'.format( self._name, self._pid)) # Creating the threading event in the constructor will cause a pickle # error on Windows when an analysis process is created. self._foreman_status_wait_event = threading.Event() self._status = definitions.PROCESSING_STATUS_ANALYZING task = tasks.Task() # TODO: temporary solution. task.identifier = self._analysis_plugin.plugin_name self._task = task storage_writer = self._storage_writer.CreateTaskStorage(task) if self._serializers_profiler: storage_writer.SetSerializersProfiler(self._serializers_profiler) if self._storage_profiler: storage_writer.SetStorageProfiler(self._storage_profiler) storage_writer.Open() self._analysis_mediator = analysis_mediator.AnalysisMediator( storage_writer, self._knowledge_base, data_location=self._data_location) # TODO: set event_filter_expression in mediator. storage_writer.WriteTaskStart() try: logger.debug( '{0!s} (PID: {1:d}) started monitoring event queue.'.format( self._name, self._pid)) while not self._abort: try: event = self._event_queue.PopItem() except (errors.QueueClose, errors.QueueEmpty) as exception: logger.debug('ConsumeItems exiting with exception {0:s}.'.format( type(exception))) break if isinstance(event, plaso_queue.QueueAbort): logger.debug('ConsumeItems exiting, dequeued QueueAbort object.') break self._ProcessEvent(self._analysis_mediator, event) self._number_of_consumed_events += 1 if self._guppy_memory_profiler: self._guppy_memory_profiler.Sample() logger.debug( '{0!s} (PID: {1:d}) stopped monitoring event queue.'.format( self._name, self._pid)) if not self._abort: self._status = definitions.PROCESSING_STATUS_REPORTING self._analysis_mediator.ProduceAnalysisReport(self._analysis_plugin) # All exceptions need to be caught here to prevent the process # from being killed by an uncaught exception. except Exception as exception: # pylint: disable=broad-except logger.warning( 'Unhandled exception in process: {0!s} (PID: {1:d}).'.format( self._name, self._pid)) logger.exception(exception) self._abort = True finally: storage_writer.WriteTaskCompletion(aborted=self._abort) storage_writer.Close() if self._serializers_profiler: storage_writer.SetSerializersProfiler(None) if self._storage_profiler: storage_writer.SetStorageProfiler(None) try: self._storage_writer.FinalizeTaskStorage(task) except IOError: pass if self._abort: self._status = definitions.PROCESSING_STATUS_ABORTED else: self._status = definitions.PROCESSING_STATUS_COMPLETED self._foreman_status_wait_event.wait(self._FOREMAN_STATUS_WAIT) logger.debug('Analysis plugin: {0!s} (PID: {1:d}) stopped'.format( self._name, self._pid)) if self._serializers_profiler: self._storage_writer.SetSerializersProfiler(None) if self._storage_profiler: self._storage_writer.SetStorageProfiler(None) self._StopProfiling() self._analysis_mediator = None self._foreman_status_wait_event = None self._storage_writer = None self._task = None try: self._event_queue.Close(abort=self._abort) except errors.QueueAlreadyClosed: logger.error('Queue for {0:s} was already closed.'.format(self.name))
def _UpdateProcessingStatus(self, pid, process_status, used_memory): """Updates the processing status. Args: pid (int): process identifier (PID) of the worker process. process_status (dict[str, object]): status values received from the worker process. used_memory (int): size of used memory in bytes. Raises: KeyError: if the process is not registered with the engine. """ self._RaiseIfNotRegistered(pid) if not process_status: return process = self._processes_per_pid[pid] processing_status = process_status.get('processing_status', None) self._RaiseIfNotMonitored(pid) display_name = process_status.get('display_name', '') number_of_consumed_event_tags = process_status.get( 'number_of_consumed_event_tags', None) number_of_produced_event_tags = process_status.get( 'number_of_produced_event_tags', None) number_of_consumed_events = process_status.get( 'number_of_consumed_events', None) number_of_produced_events = process_status.get( 'number_of_produced_events', None) number_of_consumed_reports = process_status.get( 'number_of_consumed_reports', None) number_of_produced_reports = process_status.get( 'number_of_produced_reports', None) number_of_consumed_sources = process_status.get( 'number_of_consumed_sources', None) number_of_produced_sources = process_status.get( 'number_of_produced_sources', None) number_of_consumed_warnings = process_status.get( 'number_of_consumed_warnings', None) number_of_produced_warnings = process_status.get( 'number_of_produced_warnings', None) if processing_status != definitions.STATUS_INDICATOR_IDLE: last_activity_timestamp = process_status.get( 'last_activity_timestamp', 0.0) if last_activity_timestamp: last_activity_timestamp += self._PROCESS_WORKER_TIMEOUT current_timestamp = time.time() if current_timestamp > last_activity_timestamp: logger.error(( 'Process {0:s} (PID: {1:d}) has not reported activity within ' 'the timeout period.').format(process.name, pid)) processing_status = definitions.STATUS_INDICATOR_NOT_RESPONDING self._processing_status.UpdateWorkerStatus( process.name, processing_status, pid, used_memory, display_name, number_of_consumed_sources, number_of_produced_sources, number_of_consumed_events, number_of_produced_events, number_of_consumed_event_tags, number_of_produced_event_tags, number_of_consumed_reports, number_of_produced_reports, number_of_consumed_warnings, number_of_produced_warnings) task_identifier = process_status.get('task_identifier', '') if not task_identifier: return try: self._task_manager.UpdateTaskAsProcessingByIdentifier(task_identifier) return except KeyError: logger.debug( 'Worker {0:s} is processing unknown task: {1:s}.'.format( process.name, task_identifier))
def ProcessSources( self, session_identifier, source_path_specs, storage_writer, processing_configuration, enable_sigsegv_handler=False, number_of_worker_processes=0, status_update_callback=None, worker_memory_limit=None): """Processes the sources and extract events. Args: session_identifier (str): identifier of the session. source_path_specs (list[dfvfs.PathSpec]): path specifications of the sources to process. storage_writer (StorageWriter): storage writer for a session storage. processing_configuration (ProcessingConfiguration): processing configuration. enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler should be enabled. number_of_worker_processes (Optional[int]): number of worker processes. status_update_callback (Optional[function]): callback function for status updates. worker_memory_limit (Optional[int]): maximum amount of memory a worker is allowed to consume, where None represents the default memory limit and 0 represents no limit. Returns: ProcessingStatus: processing status. """ if number_of_worker_processes < 1: # One worker for each "available" CPU (minus other processes). # The number here is derived from the fact that the engine starts up: # * A main process. # # If we want to utilize all CPUs on the system we therefore need to start # up workers that amounts to the total number of CPUs - the other # processes. try: cpu_count = multiprocessing.cpu_count() - 1 if cpu_count <= self._WORKER_PROCESSES_MINIMUM: cpu_count = self._WORKER_PROCESSES_MINIMUM elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM: cpu_count = self._WORKER_PROCESSES_MAXIMUM except NotImplementedError: logger.error(( 'Unable to determine number of CPUs defaulting to {0:d} worker ' 'processes.').format(self._WORKER_PROCESSES_MINIMUM)) cpu_count = self._WORKER_PROCESSES_MINIMUM number_of_worker_processes = cpu_count self._enable_sigsegv_handler = enable_sigsegv_handler self._number_of_worker_processes = number_of_worker_processes if worker_memory_limit is None: self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT else: self._worker_memory_limit = worker_memory_limit # Keep track of certain values so we can spawn new extraction workers. self._processing_configuration = processing_configuration self._debug_output = processing_configuration.debug_output self._log_filename = processing_configuration.log_filename self._session_identifier = session_identifier self._status_update_callback = status_update_callback self._storage_writer = storage_writer # Set up the task queue. if not self._use_zeromq: self._task_queue = multi_process_queue.MultiProcessingQueue( maximum_number_of_queued_items=self._maximum_number_of_tasks) else: task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue( delay_open=True, linger_seconds=0, maximum_items=1, name='main_task_queue', timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS) self._task_queue = task_outbound_queue # The ZeroMQ backed queue must be started first, so we can save its port. # TODO: raises: attribute-defined-outside-init # self._task_queue.name = 'Task queue' self._task_queue.Open() self._task_queue_port = self._task_queue.port self._StartProfiling(self._processing_configuration.profiling) self._task_manager.StartProfiling( self._processing_configuration.profiling, self._name) if self._serializers_profiler: storage_writer.SetSerializersProfiler(self._serializers_profiler) if self._storage_profiler: storage_writer.SetStorageProfiler(self._storage_profiler) # Set up the storage writer before the worker processes. storage_writer.StartTaskStorage() for worker_number in range(number_of_worker_processes): # First argument to _StartWorkerProcess is not used. extraction_process = self._StartWorkerProcess('', storage_writer) if not extraction_process: logger.error('Unable to create worker process: {0:d}'.format( worker_number)) self._StartStatusUpdateThread() try: # Open the storage file after creating the worker processes otherwise # the ZIP storage file will remain locked as long as the worker processes # are alive. storage_writer.Open() storage_writer.WriteSessionStart() try: storage_writer.WritePreprocessingInformation(self.knowledge_base) self._ProcessSources(source_path_specs, storage_writer) finally: storage_writer.WriteSessionCompletion(aborted=self._abort) storage_writer.Close() finally: # Stop the status update thread after close of the storage writer # so we include the storage sync to disk in the status updates. self._StopStatusUpdateThread() if self._serializers_profiler: storage_writer.SetSerializersProfiler(None) if self._storage_profiler: storage_writer.SetStorageProfiler(None) self._task_manager.StopProfiling() self._StopProfiling() try: self._StopExtractionProcesses(abort=self._abort) except KeyboardInterrupt: self._AbortKill() # The abort can leave the main process unresponsive # due to incorrectly finalized IPC. self._KillProcess(os.getpid()) # The task queue should be closed by _StopExtractionProcesses, this # close is a failsafe, primarily due to MultiProcessingQueue's # blocking behavior. self._task_queue.Close(abort=True) if self._processing_status.error_path_specs: task_storage_abort = True else: task_storage_abort = self._abort try: storage_writer.StopTaskStorage(abort=task_storage_abort) except (IOError, OSError) as exception: logger.error('Unable to stop task storage with error: {0!s}'.format( exception)) if self._abort: logger.debug('Processing aborted.') self._processing_status.aborted = True else: logger.debug('Processing completed.') # Reset values. self._enable_sigsegv_handler = None self._number_of_worker_processes = None self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT self._processing_configuration = None self._session_identifier = None self._status_update_callback = None self._storage_writer = None return self._processing_status
def _Main(self): """The main loop.""" # We need a resolver context per process to prevent multi processing # issues with file objects stored in images. resolver_context = context.Context() for credential_configuration in self._processing_configuration.credentials: resolver.Resolver.key_chain.SetCredential( credential_configuration.path_spec, credential_configuration.credential_type, credential_configuration.credential_data) self._parser_mediator = parsers_mediator.ParserMediator( None, self._knowledge_base, preferred_year=self._processing_configuration.preferred_year, resolver_context=resolver_context, temporary_directory=self._processing_configuration.temporary_directory) self._parser_mediator.SetEventExtractionConfiguration( self._processing_configuration.event_extraction) self._parser_mediator.SetInputSourceConfiguration( self._processing_configuration.input_source) # We need to initialize the parser and hasher objects after the process # has forked otherwise on Windows the "fork" will fail with # a PickleError for Python modules that cannot be pickled. self._extraction_worker = worker.EventExtractionWorker( parser_filter_expression=( self._processing_configuration.parser_filter_expression)) self._extraction_worker.SetExtractionConfiguration( self._processing_configuration.extraction) self._parser_mediator.StartProfiling( self._processing_configuration.profiling, self._name, self._process_information) self._StartProfiling(self._processing_configuration.profiling) if self._processing_profiler: self._extraction_worker.SetProcessingProfiler(self._processing_profiler) if self._serializers_profiler: self._storage_writer.SetSerializersProfiler(self._serializers_profiler) if self._storage_profiler: self._storage_writer.SetStorageProfiler(self._storage_profiler) logger.debug('Worker: {0!s} (PID: {1:d}) started.'.format( self._name, self._pid)) self._status = definitions.PROCESSING_STATUS_RUNNING try: logger.debug('{0!s} (PID: {1:d}) started monitoring task queue.'.format( self._name, self._pid)) while not self._abort: try: task = self._task_queue.PopItem() except (errors.QueueClose, errors.QueueEmpty) as exception: logger.debug('ConsumeItems exiting with exception {0:s}.'.format( type(exception))) break if isinstance(task, plaso_queue.QueueAbort): logger.debug('ConsumeItems exiting, dequeued QueueAbort object.') break self._ProcessTask(task) logger.debug('{0!s} (PID: {1:d}) stopped monitoring task queue.'.format( self._name, self._pid)) # All exceptions need to be caught here to prevent the process # from being killed by an uncaught exception. except Exception as exception: # pylint: disable=broad-except logger.warning( 'Unhandled exception in process: {0!s} (PID: {1:d}).'.format( self._name, self._pid)) logger.exception(exception) self._abort = True if self._processing_profiler: self._extraction_worker.SetProcessingProfiler(None) if self._serializers_profiler: self._storage_writer.SetSerializersProfiler(None) if self._storage_profiler: self._storage_writer.SetStorageProfiler(None) self._StopProfiling() self._parser_mediator.StopProfiling() self._extraction_worker = None self._parser_mediator = None self._storage_writer = None if self._abort: self._status = definitions.PROCESSING_STATUS_ABORTED else: self._status = definitions.PROCESSING_STATUS_COMPLETED logger.debug('Worker: {0!s} (PID: {1:d}) stopped.'.format( self._name, self._pid)) try: self._task_queue.Close(abort=self._abort) except errors.QueueAlreadyClosed: logger.error('Queue for {0:s} was already closed.'.format(self.name))
def _AnalyzeEvents(self, storage_writer, analysis_plugins, event_filter=None): """Analyzes events in a plaso storage. Args: storage_writer (StorageWriter): storage writer. analysis_plugins (dict[str, AnalysisPlugin]): analysis plugins that should be run and their names. event_filter (Optional[FilterObject]): event filter. Returns: collections.Counter: counter containing information about the events processed and filtered. Raises: RuntimeError: if a non-recoverable situation is encountered. """ self._status = definitions.PROCESSING_STATUS_RUNNING self._number_of_consumed_errors = 0 self._number_of_consumed_events = 0 self._number_of_consumed_reports = 0 self._number_of_consumed_sources = 0 self._number_of_produced_errors = 0 self._number_of_produced_events = 0 self._number_of_produced_reports = 0 self._number_of_produced_sources = 0 number_of_filtered_events = 0 logger.debug('Processing events.') filter_limit = getattr(event_filter, 'limit', None) for event in storage_writer.GetSortedEvents(): event_data_identifier = event.GetEventDataIdentifier() if event_data_identifier: event_data = storage_writer.GetEventDataByIdentifier( event_data_identifier) if event_data: for attribute_name, attribute_value in event_data.GetAttributes(): setattr(event, attribute_name, attribute_value) event_identifier = event.GetIdentifier() event.tag = self._event_tag_index.GetEventTagByIdentifier( storage_writer, event_identifier) if event_filter: filter_match = event_filter.Match(event) else: filter_match = None # pylint: disable=singleton-comparison if filter_match == False: number_of_filtered_events += 1 continue for event_queue in self._event_queues.values(): # TODO: Check for premature exit of analysis plugins. event_queue.PushItem(event) self._number_of_consumed_events += 1 if (event_filter and filter_limit and filter_limit == self._number_of_consumed_events): break logger.debug('Finished pushing events to analysis plugins.') # Signal that we have finished adding events. for event_queue in self._event_queues.values(): event_queue.PushItem(plaso_queue.QueueAbort(), block=False) logger.debug('Processing analysis plugin results.') # TODO: use a task based approach. plugin_names = [plugin_name for plugin_name in analysis_plugins.keys()] while plugin_names: for plugin_name in list(plugin_names): if self._abort: break # TODO: temporary solution. task = tasks.Task() task.identifier = plugin_name merge_ready = storage_writer.CheckTaskReadyForMerge(task) if merge_ready: storage_writer.PrepareMergeTaskStorage(task) self._status = definitions.PROCESSING_STATUS_MERGING event_queue = self._event_queues[plugin_name] del self._event_queues[plugin_name] event_queue.Close() storage_merge_reader = storage_writer.StartMergeTaskStorage(task) storage_merge_reader.MergeAttributeContainers( callback=self._MergeEventTag) # TODO: temporary solution. plugin_names.remove(plugin_name) self._status = definitions.PROCESSING_STATUS_RUNNING self._number_of_produced_event_tags = ( storage_writer.number_of_event_tags) self._number_of_produced_reports = ( storage_writer.number_of_analysis_reports) try: storage_writer.StopTaskStorage(abort=self._abort) except (IOError, OSError) as exception: logger.error('Unable to stop task storage with error: {0!s}'.format( exception)) if self._abort: logger.debug('Processing aborted.') else: logger.debug('Processing completed.') events_counter = collections.Counter() events_counter['Events filtered'] = number_of_filtered_events events_counter['Events processed'] = self._number_of_consumed_events return events_counter