Example no. 1
0
  def _ProcessPathSpec(self, extraction_worker, parser_mediator, path_spec):
    """Processes a path specification.

    Args:
      extraction_worker (worker.ExtractionWorker): extraction worker.
      parser_mediator (ParserMediator): parser mediator.
      path_spec (dfvfs.PathSpec): path specification.
    """
    self._current_display_name = parser_mediator.GetDisplayNameForPathSpec(
        path_spec)

    try:
      extraction_worker.ProcessPathSpec(parser_mediator, path_spec)

    except dfvfs_errors.CacheFullError:
      # TODO: signal engine of failure.
      self._abort = True
      logger.error((
          'ABORT: detected cache full error while processing path spec: '
          '{0:s}').format(self._current_display_name))

    except Exception as exception:  # pylint: disable=broad-except
      parser_mediator.ProduceExtractionWarning((
          'unable to process path specification with error: '
          '{0!s}').format(exception), path_spec=path_spec)

      if self._processing_configuration.debug_output:
        logger.warning((
            'Unhandled exception while processing path specification: '
            '{0:s}.').format(self._current_display_name))
        logger.exception(exception)
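
The method above distinguishes a fatal cache-full condition, which aborts the whole extraction run, from arbitrary per-item exceptions, which are recorded as extraction warnings so processing can continue. A minimal standalone sketch of that pattern follows; FatalCacheError and process_items are hypothetical stand-ins for dfvfs_errors.CacheFullError and the extraction worker, not Plaso APIs.

import logging

logger = logging.getLogger(__name__)


class FatalCacheError(Exception):
  """Hypothetical stand-in for dfvfs_errors.CacheFullError."""


def process_items(items, process_item):
  """Processes items, aborting on fatal errors and logging the rest."""
  abort = False
  warnings = []
  for item in items:
    try:
      process_item(item)
    except FatalCacheError:
      # Fatal: stop scheduling further work, mirroring self._abort = True.
      abort = True
      logger.error('ABORT: fatal error while processing: %s', item)
      break
    except Exception as exception:  # pylint: disable=broad-except
      # Non-fatal: record a warning and keep going, similar to
      # parser_mediator.ProduceExtractionWarning() above.
      warnings.append((item, exception))
      logger.warning('unable to process %s with error: %s', item, exception)
  return abort, warnings


if __name__ == '__main__':
  def _example_processor(item):
    if item == 'bad':
      raise ValueError('broken input')
    if item == 'fatal':
      raise FatalCacheError()

  print(process_items(
      ['ok', 'bad', 'fatal', 'never reached'], _example_processor))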
Example no. 2
0
  def _StartWorkerProcess(self, process_name, storage_writer):
    """Creates, starts, monitors and registers a worker process.

    Args:
      process_name (str): process name.
      storage_writer (StorageWriter): storage writer for a session storage used
          to create task storage.

    Returns:
      MultiProcessWorkerProcess: extraction worker process or None if the
          process could not be started.
    """
    process_name = 'Worker_{0:02d}'.format(self._last_worker_number)
    logger.debug('Starting worker process {0:s}'.format(process_name))

    if self._use_zeromq:
      queue_name = '{0:s} task queue'.format(process_name)
      task_queue = zeromq_queue.ZeroMQRequestConnectQueue(
          delay_open=True, linger_seconds=0, name=queue_name,
          port=self._task_queue_port,
          timeout_seconds=self._TASK_QUEUE_TIMEOUT_SECONDS)
    else:
      task_queue = self._task_queue

    process = worker_process.WorkerProcess(
        task_queue, storage_writer, self.knowledge_base,
        self._session_identifier, self._processing_configuration,
        enable_sigsegv_handler=self._enable_sigsegv_handler, name=process_name)

    # Remove all possible log handlers to prevent a child process from logging
    # to the main process log file and garbling the log. The log handlers are
    # recreated after the worker process has been started.
    for handler in logging.root.handlers:
      logging.root.removeHandler(handler)
      handler.close()

    process.start()

    loggers.ConfigureLogging(
        debug_output=self._debug_output, filename=self._log_filename,
        mode='a', quiet_mode=self._quiet_mode)

    try:
      self._StartMonitoringProcess(process)

    except (IOError, KeyError) as exception:
      pid = process.pid
      logger.error((
          'Unable to monitor replacement worker process: {0:s} '
          '(PID: {1:d}) with error: {2!s}').format(
              process_name, pid, exception))

      self._TerminateProcess(process)
      return None

    self._RegisterProcess(process)

    self._last_worker_number += 1

    return process
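
Two details worth noting in the worker start-up above: the parent's log handlers are removed and closed before process.start() so a child process does not write to the main process log file, and logging is reconfigured in the parent afterwards. A small standard-library sketch of that sequence, with a hypothetical _worker target:

import logging
import multiprocessing


def _worker():
  # The child sets up its own logging instead of inheriting the parent's
  # file handler.
  logging.basicConfig(level=logging.INFO, format='worker: %(message)s')
  logging.info('started')


def start_worker_without_inherited_handlers(log_filename):
  """Starts a worker after detaching the parent's log handlers."""
  # Remove and close all root handlers; iterate over a copy so removal does
  # not skip entries.
  for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)
    handler.close()

  process = multiprocessing.Process(target=_worker)
  process.start()

  # Recreate the parent's logging configuration after the child has started.
  logging.basicConfig(filename=log_filename, level=logging.DEBUG)
  return process


if __name__ == '__main__':
  worker = start_worker_without_inherited_handlers('main.log')
  worker.join()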
Example no. 3
0
    def _StartWorkerProcess(self, process_name, storage_writer):
        """Creates, starts, monitors and registers a worker process.

        Args:
          process_name (str): process name.
          storage_writer (StorageWriter): storage writer for a session storage
              used to create task storage.

        Returns:
          MultiProcessWorkerProcess: extraction worker process or None on
              error.
        """
        analysis_plugin = self._analysis_plugins.get(process_name, None)
        if not analysis_plugin:
            logger.error('Missing analysis plugin: {0:s}'.format(process_name))
            return None

        queue_name = '{0:s} output event queue'.format(process_name)
        output_event_queue = zeromq_queue.ZeroMQPushBindQueue(
            name=queue_name, timeout_seconds=self._QUEUE_TIMEOUT)
        # Open the queue so it can bind to a random port, and we can get the
        # port number to use in the input queue.
        output_event_queue.Open()

        self._event_queues[process_name] = output_event_queue

        queue_name = '{0:s} input event queue'.format(process_name)
        input_event_queue = zeromq_queue.ZeroMQPullConnectQueue(
            name=queue_name,
            delay_open=True,
            port=output_event_queue.port,
            timeout_seconds=self._QUEUE_TIMEOUT)

        process = analysis_process.AnalysisProcess(
            input_event_queue,
            storage_writer,
            self._knowledge_base,
            analysis_plugin,
            self._processing_configuration,
            data_location=self._data_location,
            event_filter_expression=self._event_filter_expression,
            name=process_name)

        process.start()

        logger.info('Started analysis plugin: {0:s} (PID: {1:d}).'.format(
            process_name, process.pid))

        try:
            self._StartMonitoringProcess(process)
        except (IOError, KeyError) as exception:
            logger.error(
                ('Unable to monitor analysis plugin: {0:s} (PID: {1:d}) '
                 'with error: {2!s}').format(process_name, process.pid,
                                             exception))

            process.terminate()
            return None

        self._RegisterProcess(process)
        return process
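
The queue setup above binds the output (push) end first so that its randomly assigned port is known before the input (pull) end connects. A bare pyzmq sketch of the same bind-then-connect pattern, assuming pyzmq is installed; Plaso's zeromq_queue wrappers add naming, buffering and timeouts that are omitted here:

import zmq


def create_event_queue_pair():
  """Creates a PUSH (bind) / PULL (connect) socket pair on localhost."""
  context = zmq.Context.instance()

  output_queue = context.socket(zmq.PUSH)
  # Equivalent of output_event_queue.Open(): bind first so the port is known.
  port = output_queue.bind_to_random_port('tcp://127.0.0.1')

  input_queue = context.socket(zmq.PULL)
  input_queue.connect('tcp://127.0.0.1:{0:d}'.format(port))
  return output_queue, input_queue, port


if __name__ == '__main__':
  push_socket, pull_socket, port = create_event_queue_pair()
  push_socket.send_json({'event': 'example'})
  print(port, pull_socket.recv_json())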
Example no. 4
0
  def _ProcessPathSpec(self, extraction_worker, parser_mediator, path_spec):
    """Processes a path specification.

    Args:
      extraction_worker (worker.ExtractionWorker): extraction worker.
      parser_mediator (ParserMediator): parser mediator.
      path_spec (dfvfs.PathSpec): path specification.
    """
    self._current_display_name = parser_mediator.GetDisplayNameForPathSpec(
        path_spec)

    try:
      extraction_worker.ProcessPathSpec(parser_mediator, path_spec)

    except dfvfs_errors.CacheFullError:
      # TODO: signal engine of failure.
      self._abort = True
      logger.error((
          'ABORT: detected cache full error while processing path spec: '
          '{0:s}').format(self._current_display_name))

    except Exception as exception:  # pylint: disable=broad-except
      parser_mediator.ProduceExtractionError((
          'unable to process path specification with error: '
          '{0!s}').format(exception), path_spec=path_spec)

      if self._processing_configuration.debug_output:
        logger.warning((
            'Unhandled exception while processing path specification: '
            '{0:s}.').format(self._current_display_name))
        logger.exception(exception)
Example no. 5
0
  def _KillProcess(self, pid):
    """Issues a SIGKILL or equivalent to the process.

    Args:
      pid (int): process identifier (PID).
    """
    if sys.platform.startswith('win'):
      process_terminate = 1
      handle = ctypes.windll.kernel32.OpenProcess(
          process_terminate, False, pid)
      ctypes.windll.kernel32.TerminateProcess(handle, -1)
      ctypes.windll.kernel32.CloseHandle(handle)

    else:
      try:
        os.kill(pid, signal.SIGKILL)
      except OSError as exception:
        logger.error('Unable to kill process {0:d} with error: {1!s}'.format(
            pid, exception))
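
A short usage sketch for the kill helper above: spawn a child with multiprocessing, force-kill it by PID, and confirm that it terminated. The _kill_process function is a trimmed copy of the method above for a standalone script; on POSIX the child's exit code is the negative signal number.

import multiprocessing
import os
import signal
import sys
import time


def _kill_process(pid):
  """Issues a SIGKILL or equivalent to the process."""
  if sys.platform.startswith('win'):
    import ctypes
    process_terminate = 1
    handle = ctypes.windll.kernel32.OpenProcess(process_terminate, False, pid)
    ctypes.windll.kernel32.TerminateProcess(handle, -1)
    ctypes.windll.kernel32.CloseHandle(handle)
  else:
    os.kill(pid, signal.SIGKILL)


if __name__ == '__main__':
  process = multiprocessing.Process(target=time.sleep, args=(60,))
  process.start()

  _kill_process(process.pid)
  process.join()
  print('exit code:', process.exitcode)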
Example no. 6
0
  def _KillProcess(self, pid):
    """Issues a SIGKILL or equivalent to the process.

    Args:
      pid (int): process identifier (PID).
    """
    if sys.platform.startswith('win'):
      process_terminate = 1
      handle = ctypes.windll.kernel32.OpenProcess(
          process_terminate, False, pid)
      ctypes.windll.kernel32.TerminateProcess(handle, -1)
      ctypes.windll.kernel32.CloseHandle(handle)

    else:
      try:
        os.kill(pid, signal.SIGKILL)
      except OSError as exception:
        logger.error('Unable to kill process {0:d} with error: {1!s}'.format(
            pid, exception))
Example no. 7
0
  def _StartAnalysisProcesses(self, storage_writer, analysis_plugins):
    """Starts the analysis processes.

    Args:
      storage_writer (StorageWriter): storage writer.
      analysis_plugins (dict[str, AnalysisPlugin]): analysis plugins that
          should be run and their names.
    """
    logger.info('Starting analysis plugins.')

    for analysis_plugin in analysis_plugins.values():
      self._analysis_plugins[analysis_plugin.NAME] = analysis_plugin

      process = self._StartWorkerProcess(analysis_plugin.NAME, storage_writer)
      if not process:
        logger.error('Unable to create analysis process: {0:s}'.format(
            analysis_plugin.NAME))

    logger.info('Analysis plugins running')
Example no. 8
0
  def _StartAnalysisProcesses(self, storage_writer, analysis_plugins):
    """Starts the analysis processes.

    Args:
      storage_writer (StorageWriter): storage writer.
      analysis_plugins (dict[str, AnalysisPlugin]): analysis plugins that
          should be run and their names.
    """
    logger.info('Starting analysis plugins.')

    for analysis_plugin in analysis_plugins.values():
      self._analysis_plugins[analysis_plugin.NAME] = analysis_plugin

      process = self._StartWorkerProcess(analysis_plugin.NAME, storage_writer)
      if not process:
        logger.error('Unable to create analysis process: {0:s}'.format(
            analysis_plugin.NAME))

    logger.info('Analysis plugins running')
Example no. 9
0
    def _StartProcessStatusRPCServer(self):
        """Starts the process status RPC server."""
        if self._rpc_server:
            return

        self._rpc_server = plaso_xmlrpc.XMLProcessStatusRPCServer(
            self._GetStatus)

        hostname = 'localhost'

        # Try the PID as port number first, otherwise pick something random
        # between 1024 and 60000.
        if self._pid < 1024 or self._pid > 60000:
            port = random.randint(1024, 60000)
        else:
            port = self._pid

        if not self._rpc_server.Start(hostname, port):
            port = 0
            for _ in range(self._NUMBER_OF_RPC_SERVER_START_ATTEMPTS):
                port = random.randint(1024, 60000)
                if self._rpc_server.Start(hostname, port):
                    break

                port = 0

        if not port:
            logger.error(
                ('Unable to start a process status RPC server for {0!s} '
                 '(PID: {1:d})').format(self._name, self._pid))
            self._rpc_server = None
            return

        self.rpc_port.value = port

        logger.debug('Process: {0!s} process status RPC server started'.format(
            self._name))
Example no. 10
0
  def _StartProcessStatusRPCServer(self):
    """Starts the process status RPC server."""
    if self._rpc_server:
      return

    self._rpc_server = plaso_xmlrpc.XMLProcessStatusRPCServer(self._GetStatus)

    hostname = 'localhost'

    # Try the PID as port number first, otherwise pick something random
    # between 1024 and 60000.
    if self._pid < 1024 or self._pid > 60000:
      port = random.randint(1024, 60000)
    else:
      port = self._pid

    if not self._rpc_server.Start(hostname, port):
      port = 0
      for _ in range(self._NUMBER_OF_RPC_SERVER_START_ATTEMPTS):
        port = random.randint(1024, 60000)
        if self._rpc_server.Start(hostname, port):
          break

        port = 0

    if not port:
      logger.error((
          'Unable to start a process status RPC server for {0!s} '
          '(PID: {1:d})').format(self._name, self._pid))
      self._rpc_server = None
      return

    self.rpc_port.value = port

    logger.debug(
        'Process: {0!s} process status RPC server started'.format(self._name))
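
Both variants above implement the same port-selection strategy: try the worker's PID as the port if it falls in the usable range, otherwise retry a few random ports. A standard-library sketch with xmlrpc.server, where _NUMBER_OF_ATTEMPTS and the 'status' method name are assumptions of this sketch rather than Plaso constants:

import os
import random
from xmlrpc.server import SimpleXMLRPCServer

_NUMBER_OF_ATTEMPTS = 3  # assumption of this sketch


def start_status_rpc_server(get_status_callback):
  """Starts an XML-RPC status server, preferring the PID as port number."""
  pid = os.getpid()
  candidate_ports = [pid] if 1024 <= pid <= 60000 else []
  candidate_ports.extend(
      random.randint(1024, 60000) for _ in range(_NUMBER_OF_ATTEMPTS))

  for port in candidate_ports:
    try:
      server = SimpleXMLRPCServer(
          ('localhost', port), allow_none=True, logRequests=False)
    except OSError:
      # Port already in use (or otherwise unavailable), try the next one.
      continue
    server.register_function(get_status_callback, 'status')
    return server, port

  return None, 0


if __name__ == '__main__':
  server, port = start_status_rpc_server(lambda: {'status': 'running'})
  print('RPC server port:', port if server else 'unavailable')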
Example no. 11
0
    def ProcessSources(self,
                       session_identifier,
                       source_path_specs,
                       storage_writer,
                       processing_configuration,
                       enable_sigsegv_handler=False,
                       number_of_worker_processes=0,
                       status_update_callback=None,
                       worker_memory_limit=None):
        """Processes the sources and extract events.

    Args:
      session_identifier (str): identifier of the session.
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
      processing_configuration (ProcessingConfiguration): processing
          configuration.
      enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
          should be enabled.
      number_of_worker_processes (Optional[int]): number of worker processes.
      status_update_callback (Optional[function]): callback function for status
          updates.
      worker_memory_limit (Optional[int]): maximum amount of memory a worker is
          allowed to consume, where None represents the default memory limit
          and 0 represents no limit.

    Returns:
      ProcessingStatus: processing status.
    """
        if number_of_worker_processes < 1:
            # One worker for each "available" CPU (minus other processes).
            # The number here is derived from the fact that the engine starts up:
            # * A main process.
            #
            # If we want to utilize all CPUs on the system, we therefore need
            # to start up a number of workers that amounts to the total number
            # of CPUs minus the other processes.
            try:
                cpu_count = multiprocessing.cpu_count() - 1

                if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
                    cpu_count = self._WORKER_PROCESSES_MINIMUM

                elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
                    cpu_count = self._WORKER_PROCESSES_MAXIMUM

            except NotImplementedError:
                logger.error((
                    'Unable to determine number of CPUs defaulting to {0:d} worker '
                    'processes.').format(self._WORKER_PROCESSES_MINIMUM))
                cpu_count = self._WORKER_PROCESSES_MINIMUM

            number_of_worker_processes = cpu_count

        self._enable_sigsegv_handler = enable_sigsegv_handler
        self._number_of_worker_processes = number_of_worker_processes

        if worker_memory_limit is None:
            self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT
        else:
            self._worker_memory_limit = worker_memory_limit

        # Keep track of certain values so we can spawn new extraction workers.
        self._processing_configuration = processing_configuration

        self._debug_output = processing_configuration.debug_output
        self._log_filename = processing_configuration.log_filename
        self._session_identifier = session_identifier
        self._status_update_callback = status_update_callback
        self._storage_writer = storage_writer

        # Set up the task queue.
        if not self._use_zeromq:
            self._task_queue = multi_process_queue.MultiProcessingQueue(
                maximum_number_of_queued_items=self._maximum_number_of_tasks)

        else:
            task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
                delay_open=True,
                linger_seconds=0,
                maximum_items=1,
                name='main_task_queue',
                timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
            self._task_queue = task_outbound_queue

            # The ZeroMQ backed queue must be started first, so we can save its port.
            # TODO: raises: attribute-defined-outside-init
            # self._task_queue.name = 'Task queue'
            self._task_queue.Open()
            self._task_queue_port = self._task_queue.port

        self._StartProfiling(self._processing_configuration.profiling)
        self._task_manager.StartProfiling(
            self._processing_configuration.profiling, self._name)

        if self._serializers_profiler:
            storage_writer.SetSerializersProfiler(self._serializers_profiler)

        if self._storage_profiler:
            storage_writer.SetStorageProfiler(self._storage_profiler)

        # Set up the storage writer before the worker processes.
        storage_writer.StartTaskStorage()

        for worker_number in range(number_of_worker_processes):
            # First argument to _StartWorkerProcess is not used.
            extraction_process = self._StartWorkerProcess('', storage_writer)
            if not extraction_process:
                logger.error('Unable to create worker process: {0:d}'.format(
                    worker_number))

        self._StartStatusUpdateThread()

        try:
            # Open the storage file after creating the worker processes otherwise
            # the ZIP storage file will remain locked as long as the worker processes
            # are alive.
            storage_writer.Open()
            storage_writer.WriteSessionStart()

            try:
                storage_writer.WritePreprocessingInformation(
                    self.knowledge_base)

                self._ProcessSources(source_path_specs, storage_writer)

            finally:
                storage_writer.WriteSessionCompletion(aborted=self._abort)

                storage_writer.Close()

        finally:
            # Stop the status update thread after close of the storage writer
            # so we include the storage sync to disk in the status updates.
            self._StopStatusUpdateThread()

            if self._serializers_profiler:
                storage_writer.SetSerializersProfiler(None)

            if self._storage_profiler:
                storage_writer.SetStorageProfiler(None)

            self._task_manager.StopProfiling()
            self._StopProfiling()

        try:
            self._StopExtractionProcesses(abort=self._abort)

        except KeyboardInterrupt:
            self._AbortKill()

            # The abort can leave the main process unresponsive
            # due to incorrectly finalized IPC.
            self._KillProcess(os.getpid())

        # The task queue should be closed by _StopExtractionProcesses; this
        # close is a failsafe, primarily due to MultiProcessingQueue's
        # blocking behavior.
        self._task_queue.Close(abort=True)

        if self._processing_status.error_path_specs:
            task_storage_abort = True
        else:
            task_storage_abort = self._abort

        try:
            storage_writer.StopTaskStorage(abort=task_storage_abort)
        except (IOError, OSError) as exception:
            logger.error(
                'Unable to stop task storage with error: {0!s}'.format(
                    exception))

        if self._abort:
            logger.debug('Processing aborted.')
            self._processing_status.aborted = True
        else:
            logger.debug('Processing completed.')

        # Reset values.
        self._enable_sigsegv_handler = None
        self._number_of_worker_processes = None
        self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT

        self._processing_configuration = None

        self._session_identifier = None
        self._status_update_callback = None
        self._storage_writer = None

        return self._processing_status
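
The worker-count logic at the top of ProcessSources reserves one CPU for the main (foreman) process and clamps the result between a minimum and a maximum. A compact sketch of that calculation; the two bounds are illustrative values, not Plaso's actual class constants:

import multiprocessing

# Illustrative bounds; Plaso defines its own _WORKER_PROCESSES_MINIMUM and
# _WORKER_PROCESSES_MAXIMUM class constants.
_WORKER_PROCESSES_MINIMUM = 2
_WORKER_PROCESSES_MAXIMUM = 15


def determine_number_of_workers(requested=0):
  """Derives a worker count: one per CPU minus the main process, clamped."""
  if requested >= 1:
    return requested
  try:
    cpu_count = multiprocessing.cpu_count() - 1
  except NotImplementedError:
    # Fall back to the minimum if the CPU count cannot be determined.
    return _WORKER_PROCESSES_MINIMUM
  return max(_WORKER_PROCESSES_MINIMUM,
             min(cpu_count, _WORKER_PROCESSES_MAXIMUM))


if __name__ == '__main__':
  print(determine_number_of_workers())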
Example no. 12
0
    def _UpdateProcessingStatus(self, pid, process_status, used_memory):
        """Updates the processing status.

        Args:
          pid (int): process identifier (PID) of the worker process.
          process_status (dict[str, object]): status values received from
              the worker process.
          used_memory (int): size of used memory in bytes.

        Raises:
          KeyError: if the process is not registered with the engine.
        """
        self._RaiseIfNotRegistered(pid)

        if not process_status:
            return

        process = self._processes_per_pid[pid]

        processing_status = process_status.get('processing_status', None)

        self._RaiseIfNotMonitored(pid)

        display_name = process_status.get('display_name', '')

        number_of_consumed_event_tags = process_status.get(
            'number_of_consumed_event_tags', None)
        number_of_produced_event_tags = process_status.get(
            'number_of_produced_event_tags', None)

        number_of_consumed_events = process_status.get(
            'number_of_consumed_events', None)
        number_of_produced_events = process_status.get(
            'number_of_produced_events', None)

        number_of_consumed_reports = process_status.get(
            'number_of_consumed_reports', None)
        number_of_produced_reports = process_status.get(
            'number_of_produced_reports', None)

        number_of_consumed_sources = process_status.get(
            'number_of_consumed_sources', None)
        number_of_produced_sources = process_status.get(
            'number_of_produced_sources', None)

        number_of_consumed_warnings = process_status.get(
            'number_of_consumed_warnings', None)
        number_of_produced_warnings = process_status.get(
            'number_of_produced_warnings', None)

        if processing_status != definitions.STATUS_INDICATOR_IDLE:
            last_activity_timestamp = process_status.get(
                'last_activity_timestamp', 0.0)

            if last_activity_timestamp:
                last_activity_timestamp += self._PROCESS_WORKER_TIMEOUT

                current_timestamp = time.time()
                if current_timestamp > last_activity_timestamp:
                    logger.error((
                        'Process {0:s} (PID: {1:d}) has not reported activity within '
                        'the timeout period.').format(process.name, pid))
                    processing_status = definitions.STATUS_INDICATOR_NOT_RESPONDING

        self._processing_status.UpdateWorkerStatus(
            process.name, processing_status, pid, used_memory, display_name,
            number_of_consumed_sources, number_of_produced_sources,
            number_of_consumed_events, number_of_produced_events,
            number_of_consumed_event_tags, number_of_produced_event_tags,
            number_of_consumed_reports, number_of_produced_reports,
            number_of_consumed_warnings, number_of_produced_warnings)

        task_identifier = process_status.get('task_identifier', '')
        if not task_identifier:
            return

        try:
            self._task_manager.UpdateTaskAsProcessingByIdentifier(
                task_identifier)
            return
        except KeyError:
            logger.debug(
                'Worker {0:s} is processing unknown task: {1:s}.'.format(
                    process.name, task_identifier))
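
The idle check above flags a worker as not responding when its last reported activity, plus the worker timeout, lies in the past. A small sketch of that test in isolation, with an assumed timeout value in place of self._PROCESS_WORKER_TIMEOUT:

import time

_WORKER_TIMEOUT_SECONDS = 15.0 * 60.0  # assumed value for this sketch


def is_worker_responsive(last_activity_timestamp, status):
  """Checks whether a worker reported activity within the timeout window."""
  if status == 'idle' or not last_activity_timestamp:
    # Idle workers and workers without a reported timestamp are not flagged.
    return True
  deadline = last_activity_timestamp + _WORKER_TIMEOUT_SECONDS
  return time.time() <= deadline


if __name__ == '__main__':
  print(is_worker_responsive(time.time() - 3600.0, 'running'))  # False
  print(is_worker_responsive(time.time(), 'running'))  # True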
Example no. 13
0
    def _MergeTaskStorage(self, storage_writer):
        """Merges a task storage with the session storage.

        This function checks all task stores that are ready to merge and
        updates the scheduled tasks. Note that, to prevent this function from
        holding up the task scheduling loop, only the first available task
        storage is merged.

        Args:
          storage_writer (StorageWriter): storage writer for a session
              storage used to merge task storage.
        """
        if self._processing_profiler:
            self._processing_profiler.StartTiming('merge_check')

        for task_identifier in storage_writer.GetProcessedTaskIdentifiers():
            try:
                task = self._task_manager.GetProcessedTaskByIdentifier(
                    task_identifier)

                self._task_manager.SampleTaskStatus(task, 'processed')

                to_merge = self._task_manager.CheckTaskToMerge(task)
                if not to_merge:
                    storage_writer.RemoveProcessedTaskStorage(task)

                    self._task_manager.RemoveTask(task)
                    self._task_manager.SampleTaskStatus(
                        task, 'removed_processed')

                else:
                    storage_writer.PrepareMergeTaskStorage(task)
                    self._task_manager.UpdateTaskAsPendingMerge(task)

            except KeyError:
                logger.error(
                    'Unable to retrieve task: {0:s} to prepare it to be merged.'
                    .format(task_identifier))
                continue

        if self._processing_profiler:
            self._processing_profiler.StopTiming('merge_check')

        task = None
        if not self._storage_merge_reader_on_hold:
            task = self._task_manager.GetTaskPendingMerge(self._merge_task)

        # Limit the number of attribute containers from a single task-based
        # storage file that are merged per loop to keep tasks flowing.
        if task or self._storage_merge_reader:
            self._status = definitions.STATUS_INDICATOR_MERGING

            if self._processing_profiler:
                self._processing_profiler.StartTiming('merge')

            if task:
                if self._storage_merge_reader:
                    self._merge_task_on_hold = self._merge_task
                    self._storage_merge_reader_on_hold = self._storage_merge_reader

                    self._task_manager.SampleTaskStatus(
                        self._merge_task_on_hold, 'merge_on_hold')

                self._merge_task = task
                try:
                    self._storage_merge_reader = storage_writer.StartMergeTaskStorage(
                        task)

                    self._task_manager.SampleTaskStatus(task, 'merge_started')

                except IOError as exception:
                    logger.error(
                        ('Unable to merge results of task: {0:s} '
                         'with error: {1!s}').format(task.identifier,
                                                     exception))
                    self._storage_merge_reader = None

            if self._storage_merge_reader:
                fully_merged = self._storage_merge_reader.MergeAttributeContainers(
                    maximum_number_of_containers=self.
                    _MAXIMUM_NUMBER_OF_CONTAINERS)
            else:
                # TODO: Do something more sensible when this happens, perhaps
                # retrying the task once that is implemented. For now, we mark the task
                # as fully merged because we can't continue with it.
                fully_merged = True

            if self._processing_profiler:
                self._processing_profiler.StopTiming('merge')

            if fully_merged:
                try:
                    self._task_manager.CompleteTask(self._merge_task)

                except KeyError as exception:
                    logger.error(
                        'Unable to complete task: {0:s} with error: {1!s}'.
                        format(self._merge_task.identifier, exception))

                if not self._storage_merge_reader_on_hold:
                    self._merge_task = None
                    self._storage_merge_reader = None
                else:
                    self._merge_task = self._merge_task_on_hold
                    self._storage_merge_reader = self._storage_merge_reader_on_hold

                    self._merge_task_on_hold = None
                    self._storage_merge_reader_on_hold = None

                    self._task_manager.SampleTaskStatus(
                        self._merge_task, 'merge_resumed')

            self._status = definitions.STATUS_INDICATOR_RUNNING
            self._number_of_produced_events = storage_writer.number_of_events
            self._number_of_produced_sources = storage_writer.number_of_event_sources
            self._number_of_produced_warnings = storage_writer.number_of_warnings
Example no. 14
0
    def _GetEventIdentifiers(self, event):
        """Retrieves different identifiers of the event.

        Every event contains event data, which consists of attributes and
        values. These attributes and values can be represented as a string
        and used for sorting and uniquely identifying events. This function
        determines multiple identifiers:
        * an identifier of the attributes and values without the timestamp
          description (or usage). This is referred to as the MACB group
          identifier.
        * an identifier of the attributes and values including the timestamp
          description (or usage). This is referred to as the event content
          identifier.

        The identifier without the timestamp description can be used to group
        events that have the same MACB (modification, access, change, birth)
        timestamps. The PsortEventHeap will store these events individually
        and relies on PsortMultiProcessEngine to do the actual grouping of
        events.

        Args:
          event (EventObject): event.

        Returns:
          tuple: contains:

            str: identifier of the event MACB group or None if the event
                cannot be grouped.
            str: identifier of the event content.
        """
        attributes = []

        attribute_string = 'data_type: {0:s}'.format(event.data_type)
        attributes.append(attribute_string)

        for attribute_name, attribute_value in sorted(event.GetAttributes()):
            if attribute_name in self._IDENTIFIER_EXCLUDED_ATTRIBUTES:
                continue

            if not attribute_value:
                continue

            if attribute_name == 'pathspec':
                attribute_value = attribute_value.comparable

            elif isinstance(attribute_value, dict):
                attribute_value = sorted(attribute_value.items())

            elif isinstance(attribute_value, set):
                attribute_value = sorted(list(attribute_value))

            elif isinstance(attribute_value, py2to3.BYTES_TYPE):
                attribute_value = repr(attribute_value)

            try:
                attribute_string = '{0:s}: {1!s}'.format(
                    attribute_name, attribute_value)
            except UnicodeDecodeError:
                logger.error(
                    'Failed to decode attribute {0:s}'.format(attribute_name))
            attributes.append(attribute_string)

        # The 'atime', 'ctime', 'crtime', 'mtime' are included for backwards
        # compatibility with the filestat parser.
        if event.timestamp_desc in ('atime', 'ctime', 'crtime', 'mtime',
                                    definitions.TIME_DESCRIPTION_LAST_ACCESS,
                                    definitions.TIME_DESCRIPTION_CHANGE,
                                    definitions.TIME_DESCRIPTION_CREATION,
                                    definitions.TIME_DESCRIPTION_MODIFICATION):
            macb_group_identifier = ', '.join(attributes)
        else:
            macb_group_identifier = None

        attributes.insert(0, event.timestamp_desc)
        content_identifier = ', '.join(attributes)

        return macb_group_identifier, content_identifier
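
Conceptually, the content identifier is a deterministic serialization of the event's data type, timestamp description and sorted attribute values. The following simplified sketch reproduces that construction for a plain dictionary of attributes instead of an EventObject; the exclusion set and example values are illustrative only:

def build_content_identifier(
    data_type, timestamp_desc, attributes,
    excluded_attributes=frozenset(['timestamp'])):
  """Builds a deterministic string identifier from event attribute values."""
  parts = ['data_type: {0:s}'.format(data_type)]
  for name, value in sorted(attributes.items()):
    if name in excluded_attributes or not value:
      continue
    if isinstance(value, dict):
      value = sorted(value.items())
    elif isinstance(value, set):
      value = sorted(value)
    elif isinstance(value, bytes):
      value = repr(value)
    parts.append('{0:s}: {1!s}'.format(name, value))
  # The timestamp description is prepended, as in the method above.
  return ', '.join([timestamp_desc] + parts)


if __name__ == '__main__':
  print(build_content_identifier(
      'example:event', 'Content Modification Time',
      {'filename': '/tmp/example.txt', 'inode': 15, 'timestamp': 1234}))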
Example no. 15
0
  def _CheckStatusAnalysisProcess(self, pid):
    """Checks the status of an analysis process.

    Args:
      pid (int): process ID (PID) of a registered analysis process.

    Raises:
      KeyError: if the process is not registered with the engine.
    """
    # TODO: Refactor this method, simplify and separate concerns (monitoring
    # vs management).
    self._RaiseIfNotRegistered(pid)

    if pid in self._completed_analysis_processes:
      status_indicator = definitions.PROCESSING_STATUS_COMPLETED
      process_status = {
          'processing_status': status_indicator}
      used_memory = 0

    else:
      process = self._processes_per_pid[pid]

      process_status = self._QueryProcessStatus(process)
      if process_status is None:
        process_is_alive = False
      else:
        process_is_alive = True

      process_information = self._process_information_per_pid[pid]
      used_memory = process_information.GetUsedMemory() or 0

      if self._worker_memory_limit and used_memory > self._worker_memory_limit:
        logger.warning((
            'Process: {0:s} (PID: {1:d}) killed because it exceeded the '
            'memory limit: {2:d}.').format(
                process.name, pid, self._worker_memory_limit))
        self._KillProcess(pid)

      if isinstance(process_status, dict):
        self._rpc_errors_per_pid[pid] = 0
        status_indicator = process_status.get('processing_status', None)

        if status_indicator == definitions.PROCESSING_STATUS_COMPLETED:
          self._completed_analysis_processes.add(pid)

      else:
        rpc_errors = self._rpc_errors_per_pid.get(pid, 0) + 1
        self._rpc_errors_per_pid[pid] = rpc_errors

        if rpc_errors > self._MAXIMUM_RPC_ERRORS:
          process_is_alive = False

        if process_is_alive:
          rpc_port = process.rpc_port.value
          logger.warning((
              'Unable to retrieve process: {0:s} (PID: {1:d}) status via '
              'RPC socket: http://localhost:{2:d}').format(
                  process.name, pid, rpc_port))

          processing_status_string = 'RPC error'
          status_indicator = definitions.PROCESSING_STATUS_RUNNING
        else:
          processing_status_string = 'killed'
          status_indicator = definitions.PROCESSING_STATUS_KILLED

        process_status = {
            'processing_status': processing_status_string}

    self._UpdateProcessingStatus(pid, process_status, used_memory)

    if status_indicator in definitions.PROCESSING_ERROR_STATUS:
      logger.error((
          'Process {0:s} (PID: {1:d}) is not functioning correctly. '
          'Status code: {2!s}.').format(
              process.name, pid, status_indicator))

      self._TerminateProcessByPid(pid)
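
The memory check above relies on the engine's process_information helper; outside Plaso, the resident set size of another process can be read with the third-party psutil package (an assumption of this sketch):

import os

import psutil  # third-party dependency, assumed available for this sketch


def exceeds_memory_limit(pid, memory_limit_bytes):
  """Returns True if the process RSS exceeds the limit (falsy limit: no limit)."""
  if not memory_limit_bytes:
    return False
  try:
    used_memory = psutil.Process(pid).memory_info().rss
  except psutil.NoSuchProcess:
    return False
  return used_memory > memory_limit_bytes


if __name__ == '__main__':
  # Check this process against a 2 GiB limit.
  print(exceeds_memory_limit(os.getpid(), 2 * 1024 * 1024 * 1024))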
Example no. 16
0
  def _UpdateProcessingStatus(self, pid, process_status, used_memory):
    """Updates the processing status.

    Args:
      pid (int): process identifier (PID) of the worker process.
      process_status (dict[str, object]): status values received from
          the worker process.
      used_memory (int): size of used memory in bytes.

    Raises:
      KeyError: if the process is not registered with the engine.
    """
    self._RaiseIfNotRegistered(pid)

    if not process_status:
      return

    process = self._processes_per_pid[pid]

    status_indicator = process_status.get('processing_status', None)

    self._RaiseIfNotMonitored(pid)

    display_name = process_status.get('display_name', '')

    number_of_consumed_event_tags = process_status.get(
        'number_of_consumed_event_tags', None)
    number_of_produced_event_tags = process_status.get(
        'number_of_produced_event_tags', None)

    number_of_consumed_events = process_status.get(
        'number_of_consumed_events', None)
    number_of_produced_events = process_status.get(
        'number_of_produced_events', None)

    number_of_consumed_reports = process_status.get(
        'number_of_consumed_reports', None)
    number_of_produced_reports = process_status.get(
        'number_of_produced_reports', None)

    number_of_consumed_sources = process_status.get(
        'number_of_consumed_sources', None)
    number_of_produced_sources = process_status.get(
        'number_of_produced_sources', None)

    number_of_consumed_warnings = process_status.get(
        'number_of_consumed_warnings', None)
    number_of_produced_warnings = process_status.get(
        'number_of_produced_warnings', None)

    if status_indicator != definitions.STATUS_INDICATOR_IDLE:
      last_activity_timestamp = process_status.get(
          'last_activity_timestamp', 0.0)

      if last_activity_timestamp:
        last_activity_timestamp += self._worker_timeout

        current_timestamp = time.time()
        if current_timestamp > last_activity_timestamp:
          logger.error((
              'Process {0:s} (PID: {1:d}) has not reported activity within '
              'the timeout period.').format(process.name, pid))
          status_indicator = definitions.STATUS_INDICATOR_NOT_RESPONDING

    self._processing_status.UpdateWorkerStatus(
        process.name, status_indicator, pid, used_memory, display_name,
        number_of_consumed_sources, number_of_produced_sources,
        number_of_consumed_events, number_of_produced_events,
        number_of_consumed_event_tags, number_of_produced_event_tags,
        number_of_consumed_reports, number_of_produced_reports,
        number_of_consumed_warnings, number_of_produced_warnings)
Example no. 17
0
  def _GetEventIdentifiers(self, event, event_data, event_data_stream):
    """Retrieves different identifiers of the event.

    The event data attributes and values can be represented as a string and used
    for sorting and uniquely identifying events. This function determines
    multiple identifiers:
    * an identifier of the attributes and values without the timestamp
      description (or usage). This is referred to as the MACB group
      identifier.
    * an identifier of the attributes and values including the timestamp
      description (or usage). This is referred to as the event content
      identifier.

    The identifier without the timestamp description can be used to group
    events that have the same MACB (modification, access, change, birth)
    timestamps. The PsortEventHeap will store these events individually and
    relies on PsortMultiProcessEngine to do the actual grouping of events.

    Args:
      event (EventObject): event.
      event_data (EventData): event data.
      event_data_stream (EventDataStream): event data stream.

    Returns:
      tuple: containing:

        str: identifier of the event MACB group or None if the event cannot
            be grouped.
        str: identifier of the event content.
    """
    attributes = []

    attribute_string = 'data_type: {0:s}'.format(event_data.data_type)
    attributes.append(attribute_string)

    event_attributes = list(event_data.GetAttributes())
    if event_data_stream:
      event_data_stream_attributes = event_data_stream.GetAttributes()
      event_attributes.extend(event_data_stream_attributes)

    for attribute_name, attribute_value in sorted(event_attributes):
      # The filestat parser operates on file entry level and has no event data
      # stream with a path specification. Therefore we need filename and inode
      # to make sure events of different file entries are considered unique.
      if event_data.data_type == 'fs:stat' and attribute_name in (
          'filename', 'inode'):
        pass

      elif attribute_name in self._IDENTIFIER_EXCLUDED_ATTRIBUTES:
        continue

      if not attribute_value:
        continue

      # Note that support for event_data.pathspec is kept for backwards
      # compatibility. The current value is event_data_stream.path_spec.
      if attribute_name in ('path_spec', 'pathspec'):
        attribute_value = attribute_value.comparable

      elif isinstance(attribute_value, dict):
        attribute_value = sorted(attribute_value.items())

      elif isinstance(attribute_value, set):
        attribute_value = sorted(list(attribute_value))

      elif isinstance(attribute_value, bytes):
        attribute_value = repr(attribute_value)

      try:
        attribute_string = '{0:s}: {1!s}'.format(
            attribute_name, attribute_value)
      except UnicodeDecodeError:
        logger.error('Failed to decode attribute {0:s}'.format(
            attribute_name))
      attributes.append(attribute_string)

    # The 'atime', 'ctime', 'crtime', 'mtime' are included for backwards
    # compatibility with the filestat parser.
    if event.timestamp_desc in (
        'atime', 'ctime', 'crtime', 'mtime',
        definitions.TIME_DESCRIPTION_LAST_ACCESS,
        definitions.TIME_DESCRIPTION_CHANGE,
        definitions.TIME_DESCRIPTION_CREATION,
        definitions.TIME_DESCRIPTION_MODIFICATION):
      macb_group_identifier = ', '.join(attributes)
    else:
      macb_group_identifier = None

    timestamp_desc = event.timestamp_desc
    if timestamp_desc is None:
      logger.warning('Missing timestamp_desc attribute')
      timestamp_desc = definitions.TIME_DESCRIPTION_UNKNOWN

    attributes.insert(0, timestamp_desc)
    content_identifier = ', '.join(attributes)

    return macb_group_identifier, content_identifier
Example no. 18
0
  def _AnalyzeEvents(self, storage_writer, analysis_plugins, event_filter=None):
    """Analyzes events in a plaso storage.

    Args:
      storage_writer (StorageWriter): storage writer.
      analysis_plugins (dict[str, AnalysisPlugin]): analysis plugins that
          should be run and their names.
      event_filter (Optional[EventObjectFilter]): event filter.

    Returns:
      collections.Counter: counter containing information about the events
          processed and filtered.

    Raises:
      RuntimeError: if a non-recoverable situation is encountered.
    """
    self._status = definitions.STATUS_INDICATOR_RUNNING
    self._number_of_consumed_events = 0
    self._number_of_consumed_reports = 0
    self._number_of_consumed_sources = 0
    self._number_of_consumed_warnings = 0
    self._number_of_produced_events = 0
    self._number_of_produced_reports = 0
    self._number_of_produced_sources = 0
    self._number_of_produced_warnings = 0

    number_of_filtered_events = 0

    logger.debug('Processing events.')

    filter_limit = getattr(event_filter, 'limit', None)

    for event in storage_writer.GetSortedEvents():
      event_data_identifier = event.GetEventDataIdentifier()
      event_data = storage_writer.GetEventDataByIdentifier(
          event_data_identifier)

      event_data_stream_identifier = event_data.GetEventDataStreamIdentifier()
      if event_data_stream_identifier:
        event_data_stream = storage_writer.GetEventDataStreamByIdentifier(
            event_data_stream_identifier)
      else:
        event_data_stream = None

      event_identifier = event.GetIdentifier()
      event_tag = self._event_tag_index.GetEventTagByIdentifier(
          storage_writer, event_identifier)

      if event_filter:
        filter_match = event_filter.Match(
            event, event_data, event_data_stream, event_tag)
      else:
        filter_match = None

      # pylint: disable=singleton-comparison
      if filter_match == False:
        number_of_filtered_events += 1
        continue

      for event_queue in self._event_queues.values():
        # TODO: Check for premature exit of analysis plugins.
        event_queue.PushItem((event, event_data, event_data_stream))

      self._number_of_consumed_events += 1

      if (event_filter and filter_limit and
          filter_limit == self._number_of_consumed_events):
        break

    logger.debug('Finished pushing events to analysis plugins.')
    # Signal that we have finished adding events.
    for event_queue in self._event_queues.values():
      event_queue.PushItem(plaso_queue.QueueAbort(), block=False)

    logger.debug('Processing analysis plugin results.')

    # TODO: use a task based approach.
    plugin_names = list(analysis_plugins.keys())
    while plugin_names:
      for plugin_name in list(plugin_names):
        if self._abort:
          break

        # TODO: temporary solution.
        task = tasks.Task()
        task.storage_format = definitions.STORAGE_FORMAT_SQLITE
        task.identifier = plugin_name

        merge_ready = storage_writer.CheckTaskReadyForMerge(task)
        if merge_ready:
          storage_writer.PrepareMergeTaskStorage(task)
          self._status = definitions.STATUS_INDICATOR_MERGING

          event_queue = self._event_queues[plugin_name]
          del self._event_queues[plugin_name]

          event_queue.Close()

          storage_merge_reader = storage_writer.StartMergeTaskStorage(task)

          storage_merge_reader.MergeAttributeContainers(
              callback=self._MergeEventTag)
          # TODO: temporary solution.
          plugin_names.remove(plugin_name)

          self._status = definitions.STATUS_INDICATOR_RUNNING

          self._number_of_produced_event_tags = (
              storage_writer.number_of_event_tags)
          self._number_of_produced_reports = (
              storage_writer.number_of_analysis_reports)

    try:
      storage_writer.StopTaskStorage(abort=self._abort)
    except (IOError, OSError) as exception:
      logger.error('Unable to stop task storage with error: {0!s}'.format(
          exception))

    if self._abort:
      logger.debug('Processing aborted.')
    else:
      logger.debug('Processing completed.')

    events_counter = collections.Counter()
    events_counter['Events filtered'] = number_of_filtered_events
    events_counter['Events processed'] = self._number_of_consumed_events

    return events_counter
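
The event fan-out above pushes every (event, event_data, event_data_stream) tuple to each plugin's queue and then terminates the streams with a QueueAbort sentinel. A self-contained sketch of that producer/consumer shape using multiprocessing.Queue instead of the ZeroMQ-backed queues; the QueueAbort class and plugin names here are stand-ins, not imports from plaso:

import multiprocessing


class QueueAbort(object):
  """Sentinel marking the end of the event stream."""


def _analysis_worker(name, event_queue, result_queue):
  """Consumes events until the abort sentinel arrives."""
  consumed = 0
  while True:
    item = event_queue.get()
    if isinstance(item, QueueAbort):
      break
    consumed += 1
  result_queue.put((name, consumed))


if __name__ == '__main__':
  plugin_names = ['tagging', 'unique_domains']
  event_queues = {name: multiprocessing.Queue() for name in plugin_names}
  results = multiprocessing.Queue()

  workers = [
      multiprocessing.Process(
          target=_analysis_worker, args=(name, queue, results))
      for name, queue in event_queues.items()]
  for worker in workers:
    worker.start()

  # Push events to every plugin queue, then signal that the stream is done.
  for event in range(10):
    for queue in event_queues.values():
      queue.put(event)
  for queue in event_queues.values():
    queue.put(QueueAbort())

  results_list = [results.get() for _ in workers]
  for worker in workers:
    worker.join()
  print(results_list)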
Example no. 19
0
  def ProcessSources(
      self, session, source_path_specs, storage_writer,
      processing_configuration, enable_sigsegv_handler=False,
      status_update_callback=None):
    """Processes the sources and extract events.

    Args:
      session (Session): session in which the sources are processed.
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
      processing_configuration (ProcessingConfiguration): processing
          configuration.
      enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
          should be enabled.
      status_update_callback (Optional[function]): callback function for status
          updates.

    Returns:
      ProcessingStatus: processing status.
    """
    self._enable_sigsegv_handler = enable_sigsegv_handler

    # Keep track of certain values so we can spawn new extraction workers.
    self._processing_configuration = processing_configuration

    self._debug_output = processing_configuration.debug_output
    self._log_filename = processing_configuration.log_filename
    self._session_identifier = session.identifier
    self._status_update_callback = status_update_callback
    self._storage_writer = storage_writer

    # Set up the task queue.
    task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
        delay_open=True, linger_seconds=0, maximum_items=1,
        name='main_task_queue',
        timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
    self._task_queue = task_outbound_queue

    # The ZeroMQ backed queue must be started first, so we can save its port.
    # TODO: raises: attribute-defined-outside-init
    # self._task_queue.name = 'Task queue'
    self._task_queue.Open()
    self._task_queue_port = self._task_queue.port

    self._StartProfiling(self._processing_configuration.profiling)
    self._task_manager.StartProfiling(
        self._processing_configuration.profiling, self._name)

    if self._serializers_profiler:
      storage_writer.SetSerializersProfiler(self._serializers_profiler)

    if self._storage_profiler:
      storage_writer.SetStorageProfiler(self._storage_profiler)

    # Set up the storage writer before the worker processes.
    storage_writer.StartTaskStorage()

    for worker_number in range(self._number_of_worker_processes):
      # First argument to _StartWorkerProcess is not used.
      extraction_process = self._StartWorkerProcess('', storage_writer)
      if not extraction_process:
        logger.error('Unable to create worker process: {0:d}'.format(
            worker_number))

    self._StartStatusUpdateThread()

    # TODO: decouple session and storage writer?
    session.source_configurations = (
        self.knowledge_base.GetSourceConfigurationArtifacts())

    try:
      # Open the storage file after creating the worker processes otherwise
      # the ZIP storage file will remain locked as long as the worker processes
      # are alive.
      storage_writer.Open()
      storage_writer.WriteSessionStart()

      try:
        storage_writer.WriteSessionConfiguration()

        self._ProcessSources(source_path_specs, storage_writer)

      finally:
        storage_writer.WriteSessionCompletion(aborted=self._abort)

        storage_writer.Close()

    finally:
      # Stop the status update thread after close of the storage writer
      # so we include the storage sync to disk in the status updates.
      self._StopStatusUpdateThread()

      if self._serializers_profiler:
        storage_writer.SetSerializersProfiler(None)

      if self._storage_profiler:
        storage_writer.SetStorageProfiler(None)

      self._task_manager.StopProfiling()
      self._StopProfiling()

    try:
      self._StopExtractionProcesses(abort=self._abort)

    except KeyboardInterrupt:
      self._AbortKill()

      # The abort can leave the main process unresponsive
      # due to incorrectly finalized IPC.
      self._KillProcess(os.getpid())

    # The task queue should be closed by _StopExtractionProcesses; this
    # close is a failsafe.
    self._task_queue.Close(abort=True)

    if self._processing_status.error_path_specs:
      task_storage_abort = True
    else:
      task_storage_abort = self._abort

    try:
      storage_writer.StopTaskStorage(abort=task_storage_abort)
    except (IOError, OSError) as exception:
      logger.error('Unable to stop task storage with error: {0!s}'.format(
          exception))

    if self._abort:
      logger.debug('Processing aborted.')
      self._processing_status.aborted = True
    else:
      logger.debug('Processing completed.')

    # Reset values.
    self._enable_sigsegv_handler = None

    self._processing_configuration = None

    self._session_identifier = None
    self._status_update_callback = None
    self._storage_writer = None

    return self._processing_status
Example no. 20
0
  def __init__(
      self, maximum_number_of_tasks=None, number_of_worker_processes=0,
      worker_memory_limit=None, worker_timeout=None):
    """Initializes an engine.

    Args:
      maximum_number_of_tasks (Optional[int]): maximum number of concurrent
          tasks, where 0 represents no limit.
      number_of_worker_processes (Optional[int]): number of worker processes.
      worker_memory_limit (Optional[int]): maximum amount of memory a worker is
          allowed to consume, where None represents the default memory limit
          and 0 represents no limit.
      worker_timeout (Optional[float]): number of minutes before a worker
          process that is not providing status updates is considered inactive,
          where None or 0.0 represents the default timeout.
    """
    if maximum_number_of_tasks is None:
      maximum_number_of_tasks = self._MAXIMUM_NUMBER_OF_TASKS

    if number_of_worker_processes < 1:
      # One worker for each "available" CPU (minus other processes).
      # The number here is derived from the fact that the engine starts up:
      # * A main process.
      #
      # If we want to utilize all CPUs on the system, we therefore need to
      # start up a number of workers that amounts to the total number of CPUs
      # minus the other processes.
      try:
        cpu_count = multiprocessing.cpu_count() - 1

        if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
          cpu_count = self._WORKER_PROCESSES_MINIMUM

        elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
          cpu_count = self._WORKER_PROCESSES_MAXIMUM

      except NotImplementedError:
        logger.error((
            'Unable to determine number of CPUs defaulting to {0:d} worker '
            'processes.').format(self._WORKER_PROCESSES_MINIMUM))
        cpu_count = self._WORKER_PROCESSES_MINIMUM

      number_of_worker_processes = cpu_count

    if worker_memory_limit is None:
      worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT

    if not worker_timeout:
      worker_timeout = definitions.DEFAULT_WORKER_TIMEOUT

    super(TaskMultiProcessEngine, self).__init__()
    self._enable_sigsegv_handler = False
    self._last_worker_number = 0
    self._maximum_number_of_tasks = maximum_number_of_tasks
    self._merge_task = None
    self._merge_task_on_hold = None
    self._number_of_consumed_event_tags = 0
    self._number_of_consumed_events = 0
    self._number_of_consumed_reports = 0
    self._number_of_consumed_sources = 0
    self._number_of_consumed_warnings = 0
    self._number_of_produced_event_tags = 0
    self._number_of_produced_events = 0
    self._number_of_produced_reports = 0
    self._number_of_produced_sources = 0
    self._number_of_produced_warnings = 0
    self._number_of_worker_processes = number_of_worker_processes
    self._path_spec_extractor = extractors.PathSpecExtractor()
    self._processing_configuration = None
    self._redis_client = None
    self._resolver_context = context.Context()
    self._session_identifier = None
    self._status = definitions.STATUS_INDICATOR_IDLE
    self._storage_merge_reader = None
    self._storage_merge_reader_on_hold = None
    self._task_queue = None
    self._task_queue_port = None
    self._task_manager = task_manager.TaskManager()
    self._worker_memory_limit = worker_memory_limit
    self._worker_timeout = worker_timeout
Example no. 21
0
  def _StartWorkerProcess(self, process_name, storage_writer):
    """Creates, starts, monitors and registers a worker process.

    Args:
      process_name (str): process name.
      storage_writer (StorageWriter): storage writer for a session storage used
          to create task storage.

    Returns:
      MultiProcessWorkerProcess: extraction worker process or None on error.
    """
    analysis_plugin = self._analysis_plugins.get(process_name, None)
    if not analysis_plugin:
      logger.error('Missing analysis plugin: {0:s}'.format(process_name))
      return None

    if self._use_zeromq:
      queue_name = '{0:s} output event queue'.format(process_name)
      output_event_queue = zeromq_queue.ZeroMQPushBindQueue(
          name=queue_name, timeout_seconds=self._QUEUE_TIMEOUT)
      # Open the queue so it can bind to a random port, and we can get the
      # port number to use in the input queue.
      output_event_queue.Open()

    else:
      output_event_queue = multi_process_queue.MultiProcessingQueue(
          timeout=self._QUEUE_TIMEOUT)

    self._event_queues[process_name] = output_event_queue

    if self._use_zeromq:
      queue_name = '{0:s} input event queue'.format(process_name)
      input_event_queue = zeromq_queue.ZeroMQPullConnectQueue(
          name=queue_name, delay_open=True, port=output_event_queue.port,
          timeout_seconds=self._QUEUE_TIMEOUT)

    else:
      input_event_queue = output_event_queue

    process = analysis_process.AnalysisProcess(
        input_event_queue, storage_writer, self._knowledge_base,
        analysis_plugin, self._processing_configuration,
        data_location=self._data_location,
        event_filter_expression=self._event_filter_expression,
        name=process_name)

    process.start()

    logger.info('Started analysis plugin: {0:s} (PID: {1:d}).'.format(
        process_name, process.pid))

    try:
      self._StartMonitoringProcess(process)
    except (IOError, KeyError) as exception:
      logger.error((
          'Unable to monitor analysis plugin: {0:s} (PID: {1:d}) '
          'with error: {2!s}').format(process_name, process.pid, exception))

      process.terminate()
      return None

    self._RegisterProcess(process)
    return process
Example no. 22
0
  def _UpdateProcessingStatus(self, pid, process_status, used_memory):
    """Updates the processing status.

    Args:
      pid (int): process identifier (PID) of the worker process.
      process_status (dict[str, object]): status values received from
          the worker process.
      used_memory (int): size of used memory in bytes.

    Raises:
      KeyError: if the process is not registered with the engine.
    """
    self._RaiseIfNotRegistered(pid)

    if not process_status:
      return

    process = self._processes_per_pid[pid]

    processing_status = process_status.get('processing_status', None)

    self._RaiseIfNotMonitored(pid)

    display_name = process_status.get('display_name', '')
    number_of_consumed_errors = process_status.get(
        'number_of_consumed_errors', None)
    number_of_produced_errors = process_status.get(
        'number_of_produced_errors', None)

    number_of_consumed_event_tags = process_status.get(
        'number_of_consumed_event_tags', None)
    number_of_produced_event_tags = process_status.get(
        'number_of_produced_event_tags', None)

    number_of_consumed_events = process_status.get(
        'number_of_consumed_events', None)
    number_of_produced_events = process_status.get(
        'number_of_produced_events', None)

    number_of_consumed_reports = process_status.get(
        'number_of_consumed_reports', None)
    number_of_produced_reports = process_status.get(
        'number_of_produced_reports', None)

    number_of_consumed_sources = process_status.get(
        'number_of_consumed_sources', None)
    number_of_produced_sources = process_status.get(
        'number_of_produced_sources', None)

    if processing_status != definitions.PROCESSING_STATUS_IDLE:
      last_activity_timestamp = process_status.get(
          'last_activity_timestamp', 0.0)

      if last_activity_timestamp:
        last_activity_timestamp += self._PROCESS_WORKER_TIMEOUT

        current_timestamp = time.time()
        if current_timestamp > last_activity_timestamp:
          logger.error((
              'Process {0:s} (PID: {1:d}) has not reported activity within '
              'the timeout period.').format(process.name, pid))
          processing_status = definitions.PROCESSING_STATUS_NOT_RESPONDING

    self._processing_status.UpdateWorkerStatus(
        process.name, processing_status, pid, used_memory, display_name,
        number_of_consumed_sources, number_of_produced_sources,
        number_of_consumed_events, number_of_produced_events,
        number_of_consumed_event_tags, number_of_produced_event_tags,
        number_of_consumed_errors, number_of_produced_errors,
        number_of_consumed_reports, number_of_produced_reports)
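
The staleness check above boils down to comparing the worker's last reported activity timestamp, padded by the worker timeout, against the current time. A self-contained sketch of that check; the 10 second timeout is an assumed stand-in for _PROCESS_WORKER_TIMEOUT.

import time

PROCESS_WORKER_TIMEOUT = 10.0

def IsNotResponding(last_activity_timestamp, current_timestamp=None):
  """Returns True if the worker has been silent longer than the timeout."""
  if not last_activity_timestamp:
    return False
  if current_timestamp is None:
    current_timestamp = time.time()
  return current_timestamp > last_activity_timestamp + PROCESS_WORKER_TIMEOUT

print(IsNotResponding(time.time() - 30.0))  # True: silent for 30 seconds.
print(IsNotResponding(time.time() - 1.0))   # False: recently active.
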
Example no. 23
  def _GetEventIdentifiers(self, event):
    """Retrieves different identifiers of the event.

    Every event contains event data, which consists of attributes and values.
    These attributes and values can be represented as a string and used for
    sorting and uniquely identifying events. This function determines multiple
    identifiers:
    * an identifier of the attributes and values without the timestamp
      description (or usage). This is referred to as the MACB group
      identifier.
    * an identifier of the attributes and values including the timestamp
      description (or usage). This is referred to as the event content
      identifier.

    The identifier without the timestamp description can be used to group
    events that have the same MACB (modification, access, change, birth)
    timestamps. The PsortEventHeap will store these events individually and
    relies on PsortMultiProcessEngine to do the actual grouping of events.

    Args:
      event (EventObject): event.

    Returns:
      tuple: containing:

        str: identifier of the event MACB group or None if the event cannot
            be grouped.
        str: identifier of the event content.
    """
    attributes = []

    attribute_string = 'data_type: {0:s}'.format(event.data_type)
    attributes.append(attribute_string)

    for attribute_name, attribute_value in sorted(event.GetAttributes()):
      if attribute_name in self._IDENTIFIER_EXCLUDED_ATTRIBUTES:
        continue

      if not attribute_value:
        continue

      if attribute_name == 'pathspec':
        attribute_value = attribute_value.comparable

      elif isinstance(attribute_value, dict):
        attribute_value = sorted(attribute_value.items())

      elif isinstance(attribute_value, set):
        attribute_value = sorted(list(attribute_value))

      elif isinstance(attribute_value, py2to3.BYTES_TYPE):
        attribute_value = repr(attribute_value)

      try:
        attribute_string = '{0:s}: {1!s}'.format(
            attribute_name, attribute_value)
      except UnicodeDecodeError:
        logger.error('Failed to decode attribute {0:s}'.format(
            attribute_name))
        # Skip this attribute; without this the previously formatted
        # attribute string would be appended a second time.
        continue

      attributes.append(attribute_string)

    # The 'atime', 'ctime', 'crtime', 'mtime' are included for backwards
    # compatibility with the filestat parser.
    if event.timestamp_desc in (
        'atime', 'ctime', 'crtime', 'mtime',
        definitions.TIME_DESCRIPTION_LAST_ACCESS,
        definitions.TIME_DESCRIPTION_CHANGE,
        definitions.TIME_DESCRIPTION_CREATION,
        definitions.TIME_DESCRIPTION_MODIFICATION):
      macb_group_identifier = ', '.join(attributes)
    else:
      macb_group_identifier = None

    attributes.insert(0, event.timestamp_desc)
    content_identifier = ', '.join(attributes)

    return macb_group_identifier, content_identifier
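
To make the two identifiers concrete, here is a toy illustration with a hypothetical set of attribute strings; it is not the plaso EventObject, only the string-joining step at the end of the method.

attributes = [
    'data_type: fs:stat',
    'filename: /tmp/example.txt',
    'inode: 16']

timestamp_desc = 'mtime'

# Without the timestamp description: events sharing this string, for example
# the atime, ctime, crtime and mtime of the same file, form one MACB group.
macb_group_identifier = ', '.join(attributes)

# With the timestamp description prepended: identifies the event content.
content_identifier = ', '.join([timestamp_desc] + attributes)

print(macb_group_identifier)
print(content_identifier)
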
Example no. 24
  def ProcessSources(
      self, session_identifier, source_path_specs, storage_writer,
      processing_configuration, enable_sigsegv_handler=False,
      number_of_worker_processes=0, status_update_callback=None,
      worker_memory_limit=None):
    """Processes the sources and extract events.

    Args:
      session_identifier (str): identifier of the session.
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
      processing_configuration (ProcessingConfiguration): processing
          configuration.
      enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
          should be enabled.
      number_of_worker_processes (Optional[int]): number of worker processes.
      status_update_callback (Optional[function]): callback function for status
          updates.
      worker_memory_limit (Optional[int]): maximum amount of memory a worker is
          allowed to consume, where None represents the default memory limit
          and 0 represents no limit.

    Returns:
      ProcessingStatus: processing status.
    """
    if number_of_worker_processes < 1:
      # One worker for each "available" CPU (minus other processes).
      # The number here is derived from the fact that the engine starts up:
      # * A main process.
      #
      # If we want to utilize all CPUs on the system we therefore need to
      # start a number of workers that amounts to the total number of CPUs
      # minus the other processes.
      try:
        cpu_count = multiprocessing.cpu_count() - 1

        if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
          cpu_count = self._WORKER_PROCESSES_MINIMUM

        elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
          cpu_count = self._WORKER_PROCESSES_MAXIMUM

      except NotImplementedError:
        logger.error((
            'Unable to determine number of CPUs defaulting to {0:d} worker '
            'processes.').format(self._WORKER_PROCESSES_MINIMUM))
        cpu_count = self._WORKER_PROCESSES_MINIMUM

      number_of_worker_processes = cpu_count

    self._enable_sigsegv_handler = enable_sigsegv_handler
    self._number_of_worker_processes = number_of_worker_processes

    if worker_memory_limit is None:
      self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT
    else:
      self._worker_memory_limit = worker_memory_limit

    # Keep track of certain values so we can spawn new extraction workers.
    self._processing_configuration = processing_configuration

    self._debug_output = processing_configuration.debug_output
    self._log_filename = processing_configuration.log_filename
    self._session_identifier = session_identifier
    self._status_update_callback = status_update_callback
    self._storage_writer = storage_writer

    # Set up the task queue.
    if not self._use_zeromq:
      self._task_queue = multi_process_queue.MultiProcessingQueue(
          maximum_number_of_queued_items=self._maximum_number_of_tasks)

    else:
      task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
          delay_open=True, linger_seconds=0, maximum_items=1,
          name='main_task_queue',
          timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
      self._task_queue = task_outbound_queue

      # The ZeroMQ backed queue must be started first, so we can save its port.
      # TODO: raises: attribute-defined-outside-init
      # self._task_queue.name = 'Task queue'
      self._task_queue.Open()
      self._task_queue_port = self._task_queue.port

    self._StartProfiling(self._processing_configuration.profiling)
    self._task_manager.StartProfiling(
        self._processing_configuration.profiling, self._name)

    if self._serializers_profiler:
      storage_writer.SetSerializersProfiler(self._serializers_profiler)

    if self._storage_profiler:
      storage_writer.SetStorageProfiler(self._storage_profiler)

    # Set up the storage writer before the worker processes.
    storage_writer.StartTaskStorage()

    for worker_number in range(number_of_worker_processes):
      # First argument to _StartWorkerProcess is not used.
      extraction_process = self._StartWorkerProcess('', storage_writer)
      if not extraction_process:
        logger.error('Unable to create worker process: {0:d}'.format(
            worker_number))

    self._StartStatusUpdateThread()

    try:
      # Open the storage file after creating the worker processes otherwise
      # the ZIP storage file will remain locked as long as the worker processes
      # are alive.
      storage_writer.Open()
      storage_writer.WriteSessionStart()

      try:
        storage_writer.WritePreprocessingInformation(self.knowledge_base)

        self._ProcessSources(source_path_specs, storage_writer)

      finally:
        storage_writer.WriteSessionCompletion(aborted=self._abort)

        storage_writer.Close()

    finally:
      # Stop the status update thread after close of the storage writer
      # so we include the storage sync to disk in the status updates.
      self._StopStatusUpdateThread()

      if self._serializers_profiler:
        storage_writer.SetSerializersProfiler(None)

      if self._storage_profiler:
        storage_writer.SetStorageProfiler(None)

      self._task_manager.StopProfiling()
      self._StopProfiling()

    try:
      self._StopExtractionProcesses(abort=self._abort)

    except KeyboardInterrupt:
      self._AbortKill()

      # The abort can leave the main process unresponsive
      # due to incorrectly finalized IPC.
      self._KillProcess(os.getpid())

    # The task queue should be closed by _StopExtractionProcesses; this
    # close is a failsafe, primarily due to MultiProcessingQueue's
    # blocking behavior.
    self._task_queue.Close(abort=True)

    if self._processing_status.error_path_specs:
      task_storage_abort = True
    else:
      task_storage_abort = self._abort

    try:
      storage_writer.StopTaskStorage(abort=task_storage_abort)
    except (IOError, OSError) as exception:
      logger.error('Unable to stop task storage with error: {0!s}'.format(
          exception))

    if self._abort:
      logger.debug('Processing aborted.')
      self._processing_status.aborted = True
    else:
      logger.debug('Processing completed.')

    # Reset values.
    self._enable_sigsegv_handler = None
    self._number_of_worker_processes = None
    self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT

    self._processing_configuration = None

    self._session_identifier = None
    self._status_update_callback = None
    self._storage_writer = None

    return self._processing_status
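
The worker count heuristic at the top of ProcessSources can be summarized as: use the requested number if given, otherwise take the CPU count minus one and clamp it between a minimum and a maximum. A standalone sketch with assumed values for the two limits:

import multiprocessing

WORKER_PROCESSES_MINIMUM = 2
WORKER_PROCESSES_MAXIMUM = 15

def DetermineNumberOfWorkers(requested=0):
  """Returns the requested worker count or a clamped CPU-based default."""
  if requested >= 1:
    return requested

  try:
    # Reserve one CPU for the main (foreman) process.
    cpu_count = multiprocessing.cpu_count() - 1
  except NotImplementedError:
    return WORKER_PROCESSES_MINIMUM

  return max(WORKER_PROCESSES_MINIMUM,
             min(cpu_count, WORKER_PROCESSES_MAXIMUM))

print(DetermineNumberOfWorkers())
print(DetermineNumberOfWorkers(requested=4))
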
Example no. 25
  def _CheckStatusWorkerProcess(self, pid):
    """Checks the status of a worker process.

    If a worker process is not responding the process is terminated and
    a replacement process is started.

    Args:
      pid (int): process ID (PID) of a registered worker process.

    Raises:
      KeyError: if the process is not registered with the engine.
    """
    # TODO: Refactor this method, simplify and separate concerns (monitoring
    # vs management).
    self._RaiseIfNotRegistered(pid)

    process = self._processes_per_pid[pid]

    process_status = self._QueryProcessStatus(process)
    if process_status is None:
      process_is_alive = False
    else:
      process_is_alive = True

    process_information = self._process_information_per_pid[pid]
    used_memory = process_information.GetUsedMemory() or 0

    if self._worker_memory_limit and used_memory > self._worker_memory_limit:
      logger.warning((
          'Process: {0:s} (PID: {1:d}) killed because it exceeded the '
          'memory limit: {2:d}.').format(
              process.name, pid, self._worker_memory_limit))
      self._KillProcess(pid)

    if isinstance(process_status, dict):
      self._rpc_errors_per_pid[pid] = 0
      status_indicator = process_status.get('processing_status', None)

    else:
      rpc_errors = self._rpc_errors_per_pid.get(pid, 0) + 1
      self._rpc_errors_per_pid[pid] = rpc_errors

      if rpc_errors > self._MAXIMUM_RPC_ERRORS:
        process_is_alive = False

      if process_is_alive:
        rpc_port = process.rpc_port.value
        logger.warning((
            'Unable to retrieve process: {0:s} (PID: {1:d}) status via '
            'RPC socket: http://localhost:{2:d}').format(
                process.name, pid, rpc_port))

        processing_status_string = 'RPC error'
        status_indicator = definitions.PROCESSING_STATUS_RUNNING
      else:
        processing_status_string = 'killed'
        status_indicator = definitions.PROCESSING_STATUS_KILLED

      process_status = {
          'processing_status': processing_status_string}

    self._UpdateProcessingStatus(pid, process_status, used_memory)

    # _UpdateProcessingStatus can also change the status of the worker,
    # so refresh the status if applicable.
    for worker_status in self._processing_status.workers_status:
      if worker_status.pid == pid:
        status_indicator = worker_status.status
        break

    if status_indicator in definitions.PROCESSING_ERROR_STATUS:
      logger.error((
          'Process {0:s} (PID: {1:d}) is not functioning correctly. '
          'Status code: {2!s}.').format(process.name, pid, status_indicator))

      self._TerminateProcessByPid(pid)

      logger.info('Starting replacement worker process for {0:s}'.format(
          process.name))
      replacement_process_attempts = 0
      replacement_process = None
      while replacement_process_attempts < self._MAXIMUM_REPLACEMENT_RETRIES:
        replacement_process_attempts += 1
        replacement_process = self._StartWorkerProcess(
            process.name, self._storage_writer)
        if replacement_process:
          break

        # Wait before retrying so a transient failure has time to clear.
        time.sleep(self._REPLACEMENT_WORKER_RETRY_DELAY)
      if not replacement_process:
        logger.error(
            'Unable to create replacement worker process for: {0:s}'.format(
                process.name))
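
The replacement logic is a retry-with-delay loop: keep calling the start routine until it returns a process or the retry budget is exhausted, sleeping between attempts. A generic sketch; start_worker and the two constants are hypothetical placeholders, not the engine's attributes.

import time

MAXIMUM_REPLACEMENT_RETRIES = 3
REPLACEMENT_WORKER_RETRY_DELAY = 1.0

def StartWithRetries(start_worker):
  """Calls start_worker until it returns a process or retries run out."""
  process = None
  for _ in range(MAXIMUM_REPLACEMENT_RETRIES):
    process = start_worker()
    if process:
      break
    time.sleep(REPLACEMENT_WORKER_RETRY_DELAY)
  return process

# Example: the first two attempts fail, the third succeeds.
attempts = iter([None, None, 'worker-3'])
print(StartWithRetries(lambda: next(attempts)))
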
Example no. 26
  def _CheckStatusAnalysisProcess(self, pid):
    """Checks the status of an analysis process.

    Args:
      pid (int): process ID (PID) of a registered analysis process.

    Raises:
      KeyError: if the process is not registered with the engine.
    """
    # TODO: Refactor this method, simplify and separate concerns (monitoring
    # vs management).
    self._RaiseIfNotRegistered(pid)

    if pid in self._completed_analysis_processes:
      status_indicator = definitions.STATUS_INDICATOR_COMPLETED
      process_status = {
          'processing_status': status_indicator}
      used_memory = 0

    else:
      process = self._processes_per_pid[pid]

      process_status = self._QueryProcessStatus(process)
      if process_status is None:
        process_is_alive = False
      else:
        process_is_alive = True

      process_information = self._process_information_per_pid[pid]
      used_memory = process_information.GetUsedMemory() or 0

      if self._worker_memory_limit and used_memory > self._worker_memory_limit:
        logger.warning((
            'Process: {0:s} (PID: {1:d}) killed because it exceeded the '
            'memory limit: {2:d}.').format(
                process.name, pid, self._worker_memory_limit))
        self._KillProcess(pid)

      if isinstance(process_status, dict):
        self._rpc_errors_per_pid[pid] = 0
        status_indicator = process_status.get('processing_status', None)

        if status_indicator == definitions.STATUS_INDICATOR_COMPLETED:
          self._completed_analysis_processes.add(pid)

      else:
        rpc_errors = self._rpc_errors_per_pid.get(pid, 0) + 1
        self._rpc_errors_per_pid[pid] = rpc_errors

        if rpc_errors > self._MAXIMUM_RPC_ERRORS:
          process_is_alive = False

        if process_is_alive:
          rpc_port = process.rpc_port.value
          logger.warning((
              'Unable to retrieve process: {0:s} (PID: {1:d}) status via '
              'RPC socket: http://localhost:{2:d}').format(
                  process.name, pid, rpc_port))

          processing_status_string = 'RPC error'
          status_indicator = definitions.STATUS_INDICATOR_RUNNING
        else:
          processing_status_string = 'killed'
          status_indicator = definitions.STATUS_INDICATOR_KILLED

        process_status = {
            'processing_status': processing_status_string}

    self._UpdateProcessingStatus(pid, process_status, used_memory)

    if status_indicator in definitions.ERROR_STATUS_INDICATORS:
      logger.error((
          'Process {0:s} (PID: {1:d}) is not functioning correctly. '
          'Status code: {2!s}.').format(
              process.name, pid, status_indicator))

      self._TerminateProcessByPid(pid)
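
The RPC error bookkeeping shared by both status checks can be reduced to: reset the per-PID counter on every successful status query, increment it on every failure, and treat the process as dead once the counter passes a maximum. A small sketch; the maximum of 10 is an assumed stand-in for _MAXIMUM_RPC_ERRORS.

MAXIMUM_RPC_ERRORS = 10

rpc_errors_per_pid = {}

def RecordStatusQuery(pid, process_status):
  """Returns False once a process has failed too many status queries."""
  if isinstance(process_status, dict):
    rpc_errors_per_pid[pid] = 0
    return True

  rpc_errors = rpc_errors_per_pid.get(pid, 0) + 1
  rpc_errors_per_pid[pid] = rpc_errors
  return rpc_errors <= MAXIMUM_RPC_ERRORS

print(RecordStatusQuery(1234, {'processing_status': 'running'}))  # True
for _ in range(11):
  alive = RecordStatusQuery(1234, None)
print(alive)  # False: more than 10 consecutive failed queries.
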
Example no. 27
  def _Main(self):
    """The main loop."""
    # We need a resolver context per process to prevent multi processing
    # issues with file objects stored in images.
    resolver_context = context.Context()

    for credential_configuration in self._processing_configuration.credentials:
      resolver.Resolver.key_chain.SetCredential(
          credential_configuration.path_spec,
          credential_configuration.credential_type,
          credential_configuration.credential_data)

    self._parser_mediator = parsers_mediator.ParserMediator(
        None, self._knowledge_base,
        artifacts_filter_helper=self._artifacts_filter_helper,
        preferred_year=self._processing_configuration.preferred_year,
        resolver_context=resolver_context,
        temporary_directory=self._processing_configuration.temporary_directory)

    self._parser_mediator.SetEventExtractionConfiguration(
        self._processing_configuration.event_extraction)

    self._parser_mediator.SetInputSourceConfiguration(
        self._processing_configuration.input_source)

    # We need to initialize the parser and hasher objects after the process
    # has forked otherwise on Windows the "fork" will fail with
    # a PickleError for Python modules that cannot be pickled.
    self._extraction_worker = worker.EventExtractionWorker(
        parser_filter_expression=(
            self._processing_configuration.parser_filter_expression))

    self._extraction_worker.SetExtractionConfiguration(
        self._processing_configuration.extraction)

    self._parser_mediator.StartProfiling(
        self._processing_configuration.profiling, self._name,
        self._process_information)
    self._StartProfiling(self._processing_configuration.profiling)

    if self._processing_profiler:
      self._extraction_worker.SetProcessingProfiler(self._processing_profiler)

    if self._serializers_profiler:
      self._storage_writer.SetSerializersProfiler(self._serializers_profiler)

    if self._storage_profiler:
      self._storage_writer.SetStorageProfiler(self._storage_profiler)

    logger.debug('Worker: {0!s} (PID: {1:d}) started.'.format(
        self._name, self._pid))

    self._status = definitions.STATUS_INDICATOR_RUNNING

    try:
      logger.debug('{0!s} (PID: {1:d}) started monitoring task queue.'.format(
          self._name, self._pid))

      while not self._abort:
        try:
          task = self._task_queue.PopItem()
        except (errors.QueueClose, errors.QueueEmpty) as exception:
          logger.debug('ConsumeItems exiting with exception {0:s}.'.format(
              type(exception)))
          break

        if isinstance(task, plaso_queue.QueueAbort):
          logger.debug('ConsumeItems exiting, dequeued QueueAbort object.')
          break

        self._ProcessTask(task)

      logger.debug('{0!s} (PID: {1:d}) stopped monitoring task queue.'.format(
          self._name, self._pid))

    # All exceptions need to be caught here to prevent the process
    # from being killed by an uncaught exception.
    except Exception as exception:  # pylint: disable=broad-except
      logger.warning(
          'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
              self._name, self._pid))
      logger.exception(exception)

      self._abort = True

    if self._processing_profiler:
      self._extraction_worker.SetProcessingProfiler(None)

    if self._serializers_profiler:
      self._storage_writer.SetSerializersProfiler(None)

    if self._storage_profiler:
      self._storage_writer.SetStorageProfiler(None)

    self._StopProfiling()
    self._parser_mediator.StopProfiling()

    self._extraction_worker = None
    self._parser_mediator = None
    self._storage_writer = None

    if self._abort:
      self._status = definitions.STATUS_INDICATOR_ABORTED
    else:
      self._status = definitions.STATUS_INDICATOR_COMPLETED

    logger.debug('Worker: {0!s} (PID: {1:d}) stopped.'.format(
        self._name, self._pid))

    try:
      self._task_queue.Close(abort=self._abort)
    except errors.QueueAlreadyClosed:
      logger.error('Queue for {0:s} was already closed.'.format(self.name))
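
The main loop is a consume-until-sentinel pattern: pop items until the queue closes, stays empty, or an abort object is dequeued. A minimal Python 3 sketch of the same pattern with the standard library queue and a simple sentinel class in place of plaso_queue.QueueAbort:

import queue

class QueueAbort(object):
  """Sentinel that tells the consumer to stop."""

task_queue = queue.Queue()
for task in ('task-1', 'task-2'):
  task_queue.put(task)
task_queue.put(QueueAbort())

while True:
  try:
    task = task_queue.get(timeout=1.0)
  except queue.Empty:
    break

  if isinstance(task, QueueAbort):
    print('Dequeued abort sentinel, stopping.')
    break

  print('Processing {0:s}'.format(task))
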
Example no. 28
  def _MergeTaskStorage(self, storage_writer):
    """Merges a task storage with the session storage.

    This function checks all task stores that are ready to merge and updates
    the scheduled tasks. Note that to prevent this function from holding up
    the task scheduling loop, only the first available task storage is merged.

    Args:
      storage_writer (StorageWriter): storage writer for a session storage used
          to merge task storage.
    """
    if self._processing_profiler:
      self._processing_profiler.StartTiming('merge_check')

    for task_identifier in storage_writer.GetProcessedTaskIdentifiers():
      try:
        task = self._task_manager.GetProcessedTaskByIdentifier(task_identifier)

        self._task_manager.SampleTaskStatus(task, 'processed')

        to_merge = self._task_manager.CheckTaskToMerge(task)
        if not to_merge:
          storage_writer.RemoveProcessedTaskStorage(task)

          self._task_manager.RemoveTask(task)
          self._task_manager.SampleTaskStatus(task, 'removed_processed')

        else:
          storage_writer.PrepareMergeTaskStorage(task)
          self._task_manager.UpdateTaskAsPendingMerge(task)

      except KeyError:
        logger.error(
            'Unable to retrieve task: {0:s} to prepare it to be merged.'.format(
                task_identifier))
        continue

    if self._processing_profiler:
      self._processing_profiler.StopTiming('merge_check')

    task = None
    if not self._storage_merge_reader_on_hold:
      task = self._task_manager.GetTaskPendingMerge(self._merge_task)

    # Limit the number of attribute containers from a single task-based
    # storage file that are merged per loop to keep tasks flowing.
    if task or self._storage_merge_reader:
      self._status = definitions.STATUS_INDICATOR_MERGING

      if self._processing_profiler:
        self._processing_profiler.StartTiming('merge')

      if task:
        if self._storage_merge_reader:
          self._merge_task_on_hold = self._merge_task
          self._storage_merge_reader_on_hold = self._storage_merge_reader

          self._task_manager.SampleTaskStatus(
              self._merge_task_on_hold, 'merge_on_hold')

        self._merge_task = task
        try:
          self._storage_merge_reader = storage_writer.StartMergeTaskStorage(
              task)

          self._task_manager.SampleTaskStatus(task, 'merge_started')

        except IOError as exception:
          logger.error((
              'Unable to merge results of task: {0:s} '
              'with error: {1!s}').format(task.identifier, exception))
          self._storage_merge_reader = None

      if self._storage_merge_reader:
        fully_merged = self._storage_merge_reader.MergeAttributeContainers(
            maximum_number_of_containers=self._MAXIMUM_NUMBER_OF_CONTAINERS)
      else:
        # TODO: Do something more sensible when this happens, perhaps
        # retrying the task once that is implemented. For now, we mark the task
        # as fully merged because we can't continue with it.
        fully_merged = True

      if self._processing_profiler:
        self._processing_profiler.StopTiming('merge')

      if fully_merged:
        try:
          self._task_manager.CompleteTask(self._merge_task)

        except KeyError as exception:
          logger.error(
              'Unable to complete task: {0:s} with error: {1!s}'.format(
                  self._merge_task.identifier, exception))

        if not self._storage_merge_reader_on_hold:
          self._merge_task = None
          self._storage_merge_reader = None
        else:
          self._merge_task = self._merge_task_on_hold
          self._storage_merge_reader = self._storage_merge_reader_on_hold

          self._merge_task_on_hold = None
          self._storage_merge_reader_on_hold = None

          self._task_manager.SampleTaskStatus(
              self._merge_task, 'merge_resumed')

      self._status = definitions.STATUS_INDICATOR_RUNNING
      self._number_of_produced_events = storage_writer.number_of_events
      self._number_of_produced_sources = storage_writer.number_of_event_sources
      self._number_of_produced_warnings = storage_writer.number_of_warnings
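
The bounded merge above keeps tasks flowing by merging only a limited number of attribute containers per pass and reporting whether the task store is fully merged. A loose, self-contained sketch of that idea with made-up containers and chunk size; it does not use the plaso storage API.

MAXIMUM_NUMBER_OF_CONTAINERS = 3

def MergeAttributeContainers(containers, maximum_number_of_containers):
  """Merges a bounded chunk and returns True once everything is merged."""
  chunk = containers[:maximum_number_of_containers]
  del containers[:maximum_number_of_containers]
  for container in chunk:
    pass  # Here the container would be written to the session storage.
  return not containers

pending = ['event-{0:d}'.format(index) for index in range(7)]
while not MergeAttributeContainers(pending, MAXIMUM_NUMBER_OF_CONTAINERS):
  pass  # Between passes the scheduling loop would do other work.
print('Merged everything in bounded chunks.')
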
Example no. 29
  def _Main(self):
    """The main loop."""
    self._StartProfiling(self._processing_configuration.profiling)

    if self._serializers_profiler:
      self._storage_writer.SetSerializersProfiler(self._serializers_profiler)

    if self._storage_profiler:
      self._storage_writer.SetStorageProfiler(self._storage_profiler)

    logger.debug('Analysis plugin: {0!s} (PID: {1:d}) started'.format(
        self._name, self._pid))

    # Creating the threading event in the constructor will cause a pickle
    # error on Windows when an analysis process is created.
    self._foreman_status_wait_event = threading.Event()
    self._status = definitions.PROCESSING_STATUS_ANALYZING

    task = tasks.Task()
    # TODO: temporary solution.
    task.identifier = self._analysis_plugin.plugin_name

    self._task = task

    storage_writer = self._storage_writer.CreateTaskStorage(task)

    if self._serializers_profiler:
      storage_writer.SetSerializersProfiler(self._serializers_profiler)

    if self._storage_profiler:
      storage_writer.SetStorageProfiler(self._storage_profiler)

    storage_writer.Open()

    self._analysis_mediator = analysis_mediator.AnalysisMediator(
        storage_writer, self._knowledge_base, data_location=self._data_location)

    # TODO: set event_filter_expression in mediator.

    storage_writer.WriteTaskStart()

    try:
      logger.debug(
          '{0!s} (PID: {1:d}) started monitoring event queue.'.format(
              self._name, self._pid))

      while not self._abort:
        try:
          event = self._event_queue.PopItem()

        except (errors.QueueClose, errors.QueueEmpty) as exception:
          logger.debug('ConsumeItems exiting with exception {0:s}.'.format(
              type(exception)))
          break

        if isinstance(event, plaso_queue.QueueAbort):
          logger.debug('ConsumeItems exiting, dequeued QueueAbort object.')
          break

        self._ProcessEvent(self._analysis_mediator, event)

        self._number_of_consumed_events += 1

        if self._guppy_memory_profiler:
          self._guppy_memory_profiler.Sample()

      logger.debug(
          '{0!s} (PID: {1:d}) stopped monitoring event queue.'.format(
              self._name, self._pid))

      if not self._abort:
        self._status = definitions.PROCESSING_STATUS_REPORTING

        self._analysis_mediator.ProduceAnalysisReport(self._analysis_plugin)

    # All exceptions need to be caught here to prevent the process
    # from being killed by an uncaught exception.
    except Exception as exception:  # pylint: disable=broad-except
      logger.warning(
          'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
              self._name, self._pid))
      logger.exception(exception)

      self._abort = True

    finally:
      storage_writer.WriteTaskCompletion(aborted=self._abort)

      storage_writer.Close()

      if self._serializers_profiler:
        storage_writer.SetSerializersProfiler(None)

      if self._storage_profiler:
        storage_writer.SetStorageProfiler(None)

    try:
      self._storage_writer.FinalizeTaskStorage(task)
    except IOError:
      pass

    if self._abort:
      self._status = definitions.PROCESSING_STATUS_ABORTED
    else:
      self._status = definitions.PROCESSING_STATUS_COMPLETED

    self._foreman_status_wait_event.wait(self._FOREMAN_STATUS_WAIT)

    logger.debug('Analysis plugin: {0!s} (PID: {1:d}) stopped'.format(
        self._name, self._pid))

    if self._serializers_profiler:
      self._storage_writer.SetSerializersProfiler(None)

    if self._storage_profiler:
      self._storage_writer.SetStorageProfiler(None)

    self._StopProfiling()

    self._analysis_mediator = None
    self._foreman_status_wait_event = None
    self._storage_writer = None
    self._task = None

    try:
      self._event_queue.Close(abort=self._abort)
    except errors.QueueAlreadyClosed:
      logger.error('Queue for {0:s} was already closed.'.format(self.name))
Example no. 30
    def _Main(self):
        """The main loop."""
        self._StartProfiling(self._processing_configuration.profiling)

        if self._serializers_profiler:
            self._storage_writer.SetSerializersProfiler(
                self._serializers_profiler)

        if self._storage_profiler:
            self._storage_writer.SetStorageProfiler(self._storage_profiler)

        logger.debug('Analysis plugin: {0!s} (PID: {1:d}) started'.format(
            self._name, self._pid))

        # Creating the threading event in the constructor will cause a pickle
        # error on Windows when an analysis process is created.
        self._foreman_status_wait_event = threading.Event()
        self._status = definitions.PROCESSING_STATUS_ANALYZING

        task = tasks.Task()
        # TODO: temporary solution.
        task.identifier = self._analysis_plugin.plugin_name

        self._task = task

        storage_writer = self._storage_writer.CreateTaskStorage(task)

        if self._serializers_profiler:
            storage_writer.SetSerializersProfiler(self._serializers_profiler)

        if self._storage_profiler:
            storage_writer.SetStorageProfiler(self._storage_profiler)

        storage_writer.Open()

        self._analysis_mediator = analysis_mediator.AnalysisMediator(
            storage_writer,
            self._knowledge_base,
            data_location=self._data_location)

        # TODO: set event_filter_expression in mediator.

        storage_writer.WriteTaskStart()

        try:
            logger.debug(
                '{0!s} (PID: {1:d}) started monitoring event queue.'.format(
                    self._name, self._pid))

            while not self._abort:
                try:
                    event = self._event_queue.PopItem()

                except (errors.QueueClose, errors.QueueEmpty) as exception:
                    logger.debug(
                        'ConsumeItems exiting with exception {0:s}.'.format(
                            type(exception)))
                    break

                if isinstance(event, plaso_queue.QueueAbort):
                    logger.debug(
                        'ConsumeItems exiting, dequeued QueueAbort object.')
                    break

                self._ProcessEvent(self._analysis_mediator, event)

                self._number_of_consumed_events += 1

                if self._guppy_memory_profiler:
                    self._guppy_memory_profiler.Sample()

            logger.debug(
                '{0!s} (PID: {1:d}) stopped monitoring event queue.'.format(
                    self._name, self._pid))

            if not self._abort:
                self._status = definitions.PROCESSING_STATUS_REPORTING

                self._analysis_mediator.ProduceAnalysisReport(
                    self._analysis_plugin)

        # All exceptions need to be caught here to prevent the process
        # from being killed by an uncaught exception.
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
                    self._name, self._pid))
            logger.exception(exception)

            self._abort = True

        finally:
            storage_writer.WriteTaskCompletion(aborted=self._abort)

            storage_writer.Close()

            if self._serializers_profiler:
                storage_writer.SetSerializersProfiler(None)

            if self._storage_profiler:
                storage_writer.SetStorageProfiler(None)

        try:
            self._storage_writer.FinalizeTaskStorage(task)
        except IOError:
            pass

        if self._abort:
            self._status = definitions.PROCESSING_STATUS_ABORTED
        else:
            self._status = definitions.PROCESSING_STATUS_COMPLETED

        self._foreman_status_wait_event.wait(self._FOREMAN_STATUS_WAIT)

        logger.debug('Analysis plugin: {0!s} (PID: {1:d}) stopped'.format(
            self._name, self._pid))

        if self._serializers_profiler:
            self._storage_writer.SetSerializersProfiler(None)

        if self._storage_profiler:
            self._storage_writer.SetStorageProfiler(None)

        self._StopProfiling()

        self._analysis_mediator = None
        self._foreman_status_wait_event = None
        self._storage_writer = None
        self._task = None

        try:
            self._event_queue.Close(abort=self._abort)
        except errors.QueueAlreadyClosed:
            logger.error('Queue for {0:s} was already closed.'.format(
                self.name))
Example no. 31
  def _AnalyzeEvents(self, storage_writer, analysis_plugins, event_filter=None):
    """Analyzes events in a plaso storage.

    Args:
      storage_writer (StorageWriter): storage writer.
      analysis_plugins (dict[str, AnalysisPlugin]): analysis plugins that
          should be run and their names.
      event_filter (Optional[FilterObject]): event filter.

    Returns:
      collections.Counter: counter containing information about the events
          processed and filtered.

    Raises:
      RuntimeError: if a non-recoverable situation is encountered.
    """
    self._status = definitions.PROCESSING_STATUS_RUNNING
    self._number_of_consumed_errors = 0
    self._number_of_consumed_events = 0
    self._number_of_consumed_reports = 0
    self._number_of_consumed_sources = 0
    self._number_of_produced_errors = 0
    self._number_of_produced_events = 0
    self._number_of_produced_reports = 0
    self._number_of_produced_sources = 0

    number_of_filtered_events = 0

    logger.debug('Processing events.')

    filter_limit = getattr(event_filter, 'limit', None)

    for event in storage_writer.GetSortedEvents():
      event_data_identifier = event.GetEventDataIdentifier()
      if event_data_identifier:
        event_data = storage_writer.GetEventDataByIdentifier(
            event_data_identifier)
        if event_data:
          for attribute_name, attribute_value in event_data.GetAttributes():
            setattr(event, attribute_name, attribute_value)

      event_identifier = event.GetIdentifier()
      event.tag = self._event_tag_index.GetEventTagByIdentifier(
          storage_writer, event_identifier)

      if event_filter:
        filter_match = event_filter.Match(event)
      else:
        filter_match = None

      # pylint: disable=singleton-comparison
      if filter_match == False:
        number_of_filtered_events += 1
        continue

      for event_queue in self._event_queues.values():
        # TODO: Check for premature exit of analysis plugins.
        event_queue.PushItem(event)

      self._number_of_consumed_events += 1

      if (event_filter and filter_limit and
          filter_limit == self._number_of_consumed_events):
        break

    logger.debug('Finished pushing events to analysis plugins.')
    # Signal that we have finished adding events.
    for event_queue in self._event_queues.values():
      event_queue.PushItem(plaso_queue.QueueAbort(), block=False)

    logger.debug('Processing analysis plugin results.')

    # TODO: use a task based approach.
    plugin_names = [plugin_name for plugin_name in analysis_plugins.keys()]
    while plugin_names:
      for plugin_name in list(plugin_names):
        if self._abort:
          break

        # TODO: temporary solution.
        task = tasks.Task()
        task.identifier = plugin_name

        merge_ready = storage_writer.CheckTaskReadyForMerge(task)
        if merge_ready:
          storage_writer.PrepareMergeTaskStorage(task)
          self._status = definitions.PROCESSING_STATUS_MERGING

          event_queue = self._event_queues[plugin_name]
          del self._event_queues[plugin_name]

          event_queue.Close()

          storage_merge_reader = storage_writer.StartMergeTaskStorage(task)

          storage_merge_reader.MergeAttributeContainers(
              callback=self._MergeEventTag)
          # TODO: temporary solution.
          plugin_names.remove(plugin_name)

          self._status = definitions.PROCESSING_STATUS_RUNNING

          self._number_of_produced_event_tags = (
              storage_writer.number_of_event_tags)
          self._number_of_produced_reports = (
              storage_writer.number_of_analysis_reports)

    try:
      storage_writer.StopTaskStorage(abort=self._abort)
    except (IOError, OSError) as exception:
      logger.error('Unable to stop task storage with error: {0!s}'.format(
          exception))

    if self._abort:
      logger.debug('Processing aborted.')
    else:
      logger.debug('Processing completed.')

    events_counter = collections.Counter()
    events_counter['Events filtered'] = number_of_filtered_events
    events_counter['Events processed'] = self._number_of_consumed_events

    return events_counter
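
The fan-out in _AnalyzeEvents pushes every event that passes the filter to each plugin queue and then pushes an abort sentinel to signal the end of the stream. A compact Python 3 sketch with made-up plugin names and events, using the standard library queue instead of the engine's queue classes:

import queue

class QueueAbort(object):
  """Sentinel marking the end of the event stream."""

event_queues = {'tagging': queue.Queue(), 'sessionize': queue.Queue()}
events = [{'timestamp': 1}, {'timestamp': 2}, {'timestamp': 3}]

number_of_consumed_events = 0
for event in events:
  for event_queue in event_queues.values():
    event_queue.put(event)
  number_of_consumed_events += 1

# Signal that no more events will arrive.
for event_queue in event_queues.values():
  event_queue.put(QueueAbort())

print('Pushed {0:d} events to {1:d} plugin queues.'.format(
    number_of_consumed_events, len(event_queues)))
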
Example no. 32
  def _Main(self):
    """The main loop."""
    # We need a resolver context per process to prevent multi processing
    # issues with file objects stored in images.
    resolver_context = context.Context()

    for credential_configuration in self._processing_configuration.credentials:
      resolver.Resolver.key_chain.SetCredential(
          credential_configuration.path_spec,
          credential_configuration.credential_type,
          credential_configuration.credential_data)

    self._parser_mediator = parsers_mediator.ParserMediator(
        None, self._knowledge_base,
        preferred_year=self._processing_configuration.preferred_year,
        resolver_context=resolver_context,
        temporary_directory=self._processing_configuration.temporary_directory)

    self._parser_mediator.SetEventExtractionConfiguration(
        self._processing_configuration.event_extraction)

    self._parser_mediator.SetInputSourceConfiguration(
        self._processing_configuration.input_source)

    # We need to initialize the parser and hasher objects after the process
    # has forked otherwise on Windows the "fork" will fail with
    # a PickleError for Python modules that cannot be pickled.
    self._extraction_worker = worker.EventExtractionWorker(
        parser_filter_expression=(
            self._processing_configuration.parser_filter_expression))

    self._extraction_worker.SetExtractionConfiguration(
        self._processing_configuration.extraction)

    self._parser_mediator.StartProfiling(
        self._processing_configuration.profiling, self._name,
        self._process_information)
    self._StartProfiling(self._processing_configuration.profiling)

    if self._processing_profiler:
      self._extraction_worker.SetProcessingProfiler(self._processing_profiler)

    if self._serializers_profiler:
      self._storage_writer.SetSerializersProfiler(self._serializers_profiler)

    if self._storage_profiler:
      self._storage_writer.SetStorageProfiler(self._storage_profiler)

    logger.debug('Worker: {0!s} (PID: {1:d}) started.'.format(
        self._name, self._pid))

    self._status = definitions.PROCESSING_STATUS_RUNNING

    try:
      logger.debug('{0!s} (PID: {1:d}) started monitoring task queue.'.format(
          self._name, self._pid))

      while not self._abort:
        try:
          task = self._task_queue.PopItem()
        except (errors.QueueClose, errors.QueueEmpty) as exception:
          logger.debug('ConsumeItems exiting with exception {0:s}.'.format(
              type(exception)))
          break

        if isinstance(task, plaso_queue.QueueAbort):
          logger.debug('ConsumeItems exiting, dequeued QueueAbort object.')
          break

        self._ProcessTask(task)

      logger.debug('{0!s} (PID: {1:d}) stopped monitoring task queue.'.format(
          self._name, self._pid))

    # All exceptions need to be caught here to prevent the process
    # from being killed by an uncaught exception.
    except Exception as exception:  # pylint: disable=broad-except
      logger.warning(
          'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
              self._name, self._pid))
      logger.exception(exception)

      self._abort = True

    if self._processing_profiler:
      self._extraction_worker.SetProcessingProfiler(None)

    if self._serializers_profiler:
      self._storage_writer.SetSerializersProfiler(None)

    if self._storage_profiler:
      self._storage_writer.SetStorageProfiler(None)

    self._StopProfiling()
    self._parser_mediator.StopProfiling()

    self._extraction_worker = None
    self._parser_mediator = None
    self._storage_writer = None

    if self._abort:
      self._status = definitions.PROCESSING_STATUS_ABORTED
    else:
      self._status = definitions.PROCESSING_STATUS_COMPLETED

    logger.debug('Worker: {0!s} (PID: {1:d}) stopped.'.format(
        self._name, self._pid))

    try:
      self._task_queue.Close(abort=self._abort)
    except errors.QueueAlreadyClosed:
      logger.error('Queue for {0:s} was already closed.'.format(self.name))