Example #1
  def _StartAnalysisProcesses(
      self, knowledge_base_object, storage_writer, analysis_plugins,
      data_location, event_filter_expression=None):
    """Starts the analysis processes.

    Args:
      knowledge_base_object (KnowledgeBase): contains information from
          the source data needed for processing.
      storage_writer (StorageWriter): storage writer.
      analysis_plugins (list[AnalysisPlugin]): analysis plugins that should
          be run.
      data_location (str): path to the location that data files should
          be loaded from.
      event_filter_expression (Optional[str]): event filter expression.
    """
    logging.info(u'Starting analysis plugins.')

    for analysis_plugin in analysis_plugins:
      if self._use_zeromq:
        queue_name = u'{0:s} output event queue'.format(analysis_plugin.NAME)
        output_event_queue = zeromq_queue.ZeroMQPushBindQueue(
            name=queue_name, timeout_seconds=self._QUEUE_TIMEOUT)
        # Open the queue so it can bind to a random port, and we can get the
        # port number to use in the input queue.
        output_event_queue.Open()

      else:
        output_event_queue = multi_process_queue.MultiProcessingQueue(
            timeout=self._QUEUE_TIMEOUT)

      self._event_queues[analysis_plugin.NAME] = output_event_queue

      if self._use_zeromq:
        queue_name = u'{0:s} input event queue'.format(analysis_plugin.NAME)
        input_event_queue = zeromq_queue.ZeroMQPullConnectQueue(
            name=queue_name, delay_open=True, port=output_event_queue.port,
            timeout_seconds=self._QUEUE_TIMEOUT)

      else:
        input_event_queue = output_event_queue

      process = analysis_process.AnalysisProcess(
          input_event_queue, storage_writer, knowledge_base_object,
          analysis_plugin, data_location=data_location,
          event_filter_expression=event_filter_expression,
          name=analysis_plugin.plugin_name)

      process.start()

      logging.info(u'Started analysis plugin: {0:s} (PID: {1:d}).'.format(
          analysis_plugin.plugin_name, process.pid))

      self._RegisterProcess(process)
      self._StartMonitoringProcess(process.pid)

    logging.info(u'Analysis plugins running.')
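The queue pairing above is the core of the fan-out: each plugin gets a PUSH queue that binds to a random port in the foreman, and a PULL queue in the plugin process that connects to that port. A minimal sketch of the same pairing with raw pyzmq, assuming a local TCP demo rather than plaso's zeromq_queue wrappers:

import zmq

context = zmq.Context()

# Bind the PUSH socket to a random port, as ZeroMQPushBindQueue.Open() does
# above; bind_to_random_port() returns the port the OS picked.
push_socket = context.socket(zmq.PUSH)
port = push_socket.bind_to_random_port('tcp://127.0.0.1')

# The consuming side connects to that port, mirroring ZeroMQPullConnectQueue.
pull_socket = context.socket(zmq.PULL)
pull_socket.connect('tcp://127.0.0.1:{0:d}'.format(port))

push_socket.send_pyobj({'event': 'example'})
print(pull_socket.recv_pyobj())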
Example #2
  def testMain(self):
    """Tests the _Main function."""
    task_queue = multi_process_queue.MultiProcessingQueue(timeout=1)

    configuration = configurations.ProcessingConfiguration()

    test_process = worker_process.WorkerProcess(
        task_queue, None, None, None, None, configuration, name='TestWorker')
    test_process._abort = True
    test_process._pid = 0

    test_process._Main()
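This test runs WorkerProcess._Main in the test process itself, with _abort preset so the main loop exits on its first check instead of consuming tasks. A sketch of that loop shape using only the standard library; the _Main signature and names below are illustrative assumptions, not plaso's actual implementation:

import multiprocessing
import queue

def _Main(task_queue, abort_event):
  # Consume tasks until the abort flag is set; the timeout keeps get()
  # from blocking forever once the producer has stopped.
  while not abort_event.is_set():
    try:
      task = task_queue.get(timeout=1)
    except queue.Empty:
      continue
    print('processing task: {0!s}'.format(task))

if __name__ == '__main__':
  abort_event = multiprocessing.Event()
  abort_event.set()  # mirrors test_process._abort = True: exit immediately.
  _Main(multiprocessing.Queue(), abort_event)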
Example #3
  def testPushPopItem(self):
    """Tests the PushItem and PopItem functions."""
    # A timeout is used so that the multi-processing queue does not block
    # the current process indefinitely when it is closed.
    test_queue = multi_process_queue.MultiProcessingQueue(timeout=0.1)

    for item in self._ITEMS:
      test_queue.PushItem(item)

    test_queue_consumer = TestQueueConsumer(test_queue)
    test_queue_consumer.ConsumeItems()

    self.assertEqual(test_queue_consumer.number_of_items, len(self._ITEMS))
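The timeout is load-bearing here: PopItem blocks on the underlying multiprocessing queue, so a consumer without a timeout would hang once the queue is drained. The same behavior can be reproduced with the standard library alone, with multiprocessing.Queue standing in for plaso's wrapper:

import multiprocessing
import queue

test_queue = multiprocessing.Queue()
for item in ('a', 'b', 'c'):
  test_queue.put(item)

number_of_items = 0
while True:
  try:
    # Without the timeout this get() would block indefinitely once the
    # queue is empty; the 0.1 second timeout above serves the same purpose.
    test_queue.get(timeout=0.1)
    number_of_items += 1
  except queue.Empty:
    break

assert number_of_items == 3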
Example #4
  def testMain(self):
    """Tests the _Main function."""
    event_queue = multi_process_queue.MultiProcessingQueue(timeout=1)

    session = sessions.Session()
    storage_writer = self._CreateStorageWriter(session)
    analysis_plugin = TestAnalysisPlugin()

    configuration = configurations.ProcessingConfiguration()

    test_process = analysis_process.AnalysisProcess(
        event_queue, storage_writer, None, analysis_plugin, configuration,
        name='TestAnalysis')
    test_process._abort = True
    test_process._FOREMAN_STATUS_WAIT = 1
    test_process._pid = 0

    test_process._Main()
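Note the test_process._FOREMAN_STATUS_WAIT = 1 line: the wait is a class-level constant, and assigning it on the instance shadows the class value so only this test runs with the short interval. The pattern in isolation, with made-up class and constant values:

import time

class ExampleProcess(object):
  # Class-level default; production code waits this long for the foreman
  # to collect a final status update.
  _FOREMAN_STATUS_WAIT = 30

  def _Main(self):
    time.sleep(self._FOREMAN_STATUS_WAIT)

test_process = ExampleProcess()
# The instance attribute shadows the class constant, keeping the test fast
# without changing the behavior of other instances.
test_process._FOREMAN_STATUS_WAIT = 1
test_process._Main()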
Example #5
    def ProcessSources(self,
                       session_identifier,
                       source_path_specs,
                       storage_writer,
                       processing_configuration,
                       enable_sigsegv_handler=False,
                       number_of_worker_processes=0,
                       status_update_callback=None,
                       worker_memory_limit=None):
        """Processes the sources and extract events.

    Args:
      session_identifier (str): identifier of the session.
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
      processing_configuration (ProcessingConfiguration): processing
          configuration.
      enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
          should be enabled.
      number_of_worker_processes (Optional[int]): number of worker processes.
      status_update_callback (Optional[function]): callback function for status
          updates.
      worker_memory_limit (Optional[int]): maximum amount of memory a worker is
          allowed to consume, where None represents the default memory limit
          and 0 represents no limit.

    Returns:
      ProcessingStatus: processing status.
    """
        if number_of_worker_processes < 1:
            # One worker for each "available" CPU (minus other processes).
            # The number here is derived from the fact that the engine starts
            # up:
            # * A main process.
            #
            # If we want to utilize all CPUs on the system we therefore need
            # to start up a number of workers that amounts to the total number
            # of CPUs minus the other processes.
            try:
                cpu_count = multiprocessing.cpu_count() - 1

                if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
                    cpu_count = self._WORKER_PROCESSES_MINIMUM

                elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
                    cpu_count = self._WORKER_PROCESSES_MAXIMUM

            except NotImplementedError:
                logger.error((
                    'Unable to determine number of CPUs, defaulting to {0:d} '
                    'worker processes.').format(self._WORKER_PROCESSES_MINIMUM))
                cpu_count = self._WORKER_PROCESSES_MINIMUM

            number_of_worker_processes = cpu_count

        self._enable_sigsegv_handler = enable_sigsegv_handler
        self._number_of_worker_processes = number_of_worker_processes

        if worker_memory_limit is None:
            self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT
        else:
            self._worker_memory_limit = worker_memory_limit

        # Keep track of certain values so we can spawn new extraction workers.
        self._processing_configuration = processing_configuration

        self._debug_output = processing_configuration.debug_output
        self._log_filename = processing_configuration.log_filename
        self._session_identifier = session_identifier
        self._status_update_callback = status_update_callback
        self._storage_writer = storage_writer

        # Set up the task queue.
        if not self._use_zeromq:
            self._task_queue = multi_process_queue.MultiProcessingQueue(
                maximum_number_of_queued_items=self._maximum_number_of_tasks)

        else:
            task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
                delay_open=True,
                linger_seconds=0,
                maximum_items=1,
                name='main_task_queue',
                timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
            self._task_queue = task_outbound_queue

            # The ZeroMQ-backed queue must be started first, so we can save
            # its port.
            # TODO: raises: attribute-defined-outside-init
            # self._task_queue.name = 'Task queue'
            self._task_queue.Open()
            self._task_queue_port = self._task_queue.port

        self._StartProfiling(self._processing_configuration.profiling)
        self._task_manager.StartProfiling(
            self._processing_configuration.profiling, self._name)

        if self._serializers_profiler:
            storage_writer.SetSerializersProfiler(self._serializers_profiler)

        if self._storage_profiler:
            storage_writer.SetStorageProfiler(self._storage_profiler)

        # Set up the storage writer before the worker processes.
        storage_writer.StartTaskStorage()

        for worker_number in range(number_of_worker_processes):
            # First argument to _StartWorkerProcess is not used.
            extraction_process = self._StartWorkerProcess('', storage_writer)
            if not extraction_process:
                logger.error('Unable to create worker process: {0:d}'.format(
                    worker_number))

        self._StartStatusUpdateThread()

        try:
            # Open the storage file after creating the worker processes;
            # otherwise the ZIP storage file will remain locked as long as
            # the worker processes are alive.
            storage_writer.Open()
            storage_writer.WriteSessionStart()

            try:
                storage_writer.WritePreprocessingInformation(
                    self.knowledge_base)

                self._ProcessSources(source_path_specs, storage_writer)

            finally:
                storage_writer.WriteSessionCompletion(aborted=self._abort)

                storage_writer.Close()

        finally:
            # Stop the status update thread after closing the storage writer
            # so that the sync of the storage to disk is included in the
            # status updates.
            self._StopStatusUpdateThread()

            if self._serializers_profiler:
                storage_writer.SetSerializersProfiler(None)

            if self._storage_profiler:
                storage_writer.SetStorageProfiler(None)

            self._task_manager.StopProfiling()
            self._StopProfiling()

        try:
            self._StopExtractionProcesses(abort=self._abort)

        except KeyboardInterrupt:
            self._AbortKill()

            # The abort can leave the main process unresponsive
            # due to incorrectly finalized IPC.
            self._KillProcess(os.getpid())

        # The task queue should be closed by _StopExtractionProcesses; this
        # close is a failsafe, primarily due to MultiProcessingQueue's
        # blocking behavior.
        self._task_queue.Close(abort=True)

        if self._processing_status.error_path_specs:
            task_storage_abort = True
        else:
            task_storage_abort = self._abort

        try:
            storage_writer.StopTaskStorage(abort=task_storage_abort)
        except (IOError, OSError) as exception:
            logger.error(
                'Unable to stop task storage with error: {0!s}'.format(
                    exception))

        if self._abort:
            logger.debug('Processing aborted.')
            self._processing_status.aborted = True
        else:
            logger.debug('Processing completed.')

        # Reset values.
        self._enable_sigsegv_handler = None
        self._number_of_worker_processes = None
        self._worker_memory_limit = definitions.DEFAULT_WORKER_MEMORY_LIMIT

        self._processing_configuration = None

        self._session_identifier = None
        self._status_update_callback = None
        self._storage_writer = None

        return self._processing_status
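The worker-count logic at the top of this example reduces to clamping cpu_count() - 1 between a lower and an upper bound. A standalone sketch; the bound values are assumptions for the illustration, plaso defines its own _WORKER_PROCESSES_MINIMUM and _WORKER_PROCESSES_MAXIMUM:

import multiprocessing

_WORKER_PROCESSES_MINIMUM = 2   # assumed value
_WORKER_PROCESSES_MAXIMUM = 15  # assumed value

def DetermineNumberOfWorkers():
  # Reserve one CPU for the main (foreman) process, then clamp the result
  # so both small and very large machines get a sane worker count.
  try:
    cpu_count = multiprocessing.cpu_count() - 1
  except NotImplementedError:
    return _WORKER_PROCESSES_MINIMUM
  return max(_WORKER_PROCESSES_MINIMUM,
             min(cpu_count, _WORKER_PROCESSES_MAXIMUM))

print(DetermineNumberOfWorkers())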
Example #6
    def _StartWorkerProcess(self, process_name, storage_writer):
        """Creates, starts, monitors and registers a worker process.

    Args:
      process_name (str): process name.
      storage_writer (StorageWriter): storage writer for a session storage used
          to create task storage.

    Returns:
      MultiProcessWorkerProcess: extraction worker process or None on error.
    """
        analysis_plugin = self._analysis_plugins.get(process_name, None)
        if not analysis_plugin:
            logger.error('Missing analysis plugin: {0:s}'.format(process_name))
            return None

        if self._use_zeromq:
            queue_name = '{0:s} output event queue'.format(process_name)
            output_event_queue = zeromq_queue.ZeroMQPushBindQueue(
                name=queue_name, timeout_seconds=self._QUEUE_TIMEOUT)
            # Open the queue so it can bind to a random port, and we can get the
            # port number to use in the input queue.
            output_event_queue.Open()

        else:
            output_event_queue = multi_process_queue.MultiProcessingQueue(
                timeout=self._QUEUE_TIMEOUT)

        self._event_queues[process_name] = output_event_queue

        if self._use_zeromq:
            queue_name = '{0:s} input event queue'.format(process_name)
            input_event_queue = zeromq_queue.ZeroMQPullConnectQueue(
                name=queue_name,
                delay_open=True,
                port=output_event_queue.port,
                timeout_seconds=self._QUEUE_TIMEOUT)

        else:
            input_event_queue = output_event_queue

        process = analysis_process.AnalysisProcess(
            input_event_queue,
            storage_writer,
            self._knowledge_base,
            analysis_plugin,
            self._processing_configuration,
            data_location=self._data_location,
            event_filter_expression=self._event_filter_expression,
            name=process_name)

        process.start()

        logger.info('Started analysis plugin: {0:s} (PID: {1:d}).'.format(
            process_name, process.pid))

        try:
            self._StartMonitoringProcess(process)
        except (IOError, KeyError) as exception:
            logger.error(
                ('Unable to monitor analysis plugin: {0:s} (PID: {1:d}) '
                 'with error: {2!s}').format(process_name, process.pid,
                                             exception))

            process.terminate()
            return None

        self._RegisterProcess(process)
        return process
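The try/except around _StartMonitoringProcess encodes a policy worth calling out: a worker that cannot be monitored should not be left running. Stripped of the plaso specifics, the shape is the following, where AttachMonitor is a hypothetical stand-in for _StartMonitoringProcess:

import multiprocessing

def AttachMonitor(process):
  """Hypothetical monitoring hook; may raise IOError or KeyError."""

def StartAndMonitorProcess(target):
  process = multiprocessing.Process(target=target)
  process.start()
  try:
    AttachMonitor(process)
  except (IOError, KeyError):
    # An unmonitored worker would run unsupervised, so terminate it and
    # report the failure to the caller instead.
    process.terminate()
    return None
  return process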
Example #7
  def ProcessSources(
      self, session_identifier, source_path_specs, storage_writer,
      enable_sigsegv_handler=False, filter_find_specs=None,
      filter_object=None, hasher_names_string=None, mount_path=None,
      number_of_worker_processes=0, parser_filter_expression=None,
      preferred_year=None, process_archives=False,
      process_compressed_streams=True, status_update_callback=None,
      show_memory_usage=False, temporary_directory=None, text_prepend=None,
      yara_rules_string=None):
    """Processes the sources and extract event objects.

    Args:
      session_identifier (str): identifier of the session.
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
      enable_sigsegv_handler (Optional[bool]): True if the SIGSEGV handler
          should be enabled.
      filter_find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
          used in path specification extraction.
      filter_object (Optional[objectfilter.Filter]): filter object.
      hasher_names_string (Optional[str]): comma separated string of names
          of hashers to use during processing.
      mount_path (Optional[str]): mount path.
      number_of_worker_processes (Optional[int]): number of worker processes.
      parser_filter_expression (Optional[str]): parser filter expression,
          where None represents all parsers and plugins.
      preferred_year (Optional[int]): preferred year.
      process_archives (Optional[bool]): True if archive files should be
          scanned for file entries.
      process_compressed_streams (Optional[bool]): True if file content in
          compressed streams should be processed.
      show_memory_usage (Optional[bool]): True if memory information should be
          included in status updates.
      status_update_callback (Optional[function]): callback function for status
          updates.
      temporary_directory (Optional[str]): path of the directory for temporary
          files.
      text_prepend (Optional[str]): text to prepend to every event.
      yara_rules_string (Optional[str]): unparsed yara rule definitions.

    Returns:
      ProcessingStatus: processing status.
    """
    if number_of_worker_processes < 1:
      # One worker for each "available" CPU (minus other processes).
      # The number here is derived from the fact that the engine starts up:
      # * A main process.
      #
      # If we want to utilize all CPUs on the system we therefore need to
      # start up a number of workers that amounts to the total number of
      # CPUs minus the other processes.
      try:
        cpu_count = multiprocessing.cpu_count() - 1

        if cpu_count <= self._WORKER_PROCESSES_MINIMUM:
          cpu_count = self._WORKER_PROCESSES_MINIMUM

        elif cpu_count >= self._WORKER_PROCESSES_MAXIMUM:
          cpu_count = self._WORKER_PROCESSES_MAXIMUM

      except NotImplementedError:
        logging.error((
            u'Unable to determine number of CPUs, defaulting to {0:d} worker '
            u'processes.').format(self._WORKER_PROCESSES_MINIMUM))
        cpu_count = self._WORKER_PROCESSES_MINIMUM

      number_of_worker_processes = cpu_count

    self._enable_sigsegv_handler = enable_sigsegv_handler
    self._number_of_worker_processes = number_of_worker_processes
    self._show_memory_usage = show_memory_usage

    # Keep track of certain values so we can spawn new extraction workers.
    self._filter_find_specs = filter_find_specs
    self._filter_object = filter_object
    self._hasher_names_string = hasher_names_string
    self._mount_path = mount_path
    self._parser_filter_expression = parser_filter_expression
    self._preferred_year = preferred_year
    self._process_archives = process_archives
    self._process_compressed_streams = process_compressed_streams
    self._session_identifier = session_identifier
    self._status_update_callback = status_update_callback
    self._storage_writer = storage_writer
    self._temporary_directory = temporary_directory
    self._text_prepend = text_prepend
    self._yara_rules_string = yara_rules_string

    # Set up the task queue.
    if not self._use_zeromq:
      self._task_queue = multi_process_queue.MultiProcessingQueue(
          maximum_number_of_queued_items=self._maximum_number_of_tasks)

    else:
      task_outbound_queue = zeromq_queue.ZeroMQBufferedReplyBindQueue(
          delay_open=True, linger_seconds=0, maximum_items=1,
          name=u'main_task_queue',
          timeout_seconds=self._ZEROMQ_NO_WORKER_REQUEST_TIME_SECONDS)
      self._task_queue = task_outbound_queue

      # The ZeroMQ-backed queue must be started first, so we can save its
      # port.
      # TODO: raises: attribute-defined-outside-init
      # self._task_queue.name = u'Task queue'
      self._task_queue.Open()
      self._task_queue_port = self._task_queue.port

    self._StartProfiling()

    if self._serializers_profiler:
      storage_writer.SetSerializersProfiler(self._serializers_profiler)

    # Set up the storage writer before the worker processes.
    storage_writer.StartTaskStorage()

    for _ in range(number_of_worker_processes):
      extraction_process = self._StartExtractionWorkerProcess(storage_writer)
      self._StartMonitoringProcess(extraction_process.pid)

    self._StartStatusUpdateThread()

    try:
      # Open the storage file after creating the worker processes; otherwise
      # the ZIP storage file will remain locked as long as the worker
      # processes are alive.
      storage_writer.Open()
      storage_writer.WriteSessionStart()

      try:
        storage_writer.WritePreprocessingInformation(self.knowledge_base)

        self._ProcessSources(
            source_path_specs, storage_writer,
            filter_find_specs=filter_find_specs)

      finally:
        storage_writer.WriteSessionCompletion(aborted=self._abort)

        storage_writer.Close()

    finally:
      # Stop the status update thread after closing the storage writer so
      # that the sync of the storage to disk is included in the status
      # updates.
      self._StopStatusUpdateThread()

      if self._serializers_profiler:
        storage_writer.SetSerializersProfiler(None)

      self._StopProfiling()

    try:
      self._StopExtractionProcesses(abort=self._abort)

    except KeyboardInterrupt:
      self._AbortKill()

      # The abort can leave the main process unresponsive
      # due to incorrectly finalized IPC.
      self._KillProcess(os.getpid())

      # The task queue should be closed by _StopExtractionProcesses; this
      # close is a failsafe, primarily due to MultiProcessingQueue's
      # blocking behaviour.
    self._task_queue.Close(abort=True)

    if self._processing_status.error_path_specs:
      task_storage_abort = True
    else:
      task_storage_abort = self._abort

    try:
      storage_writer.StopTaskStorage(abort=task_storage_abort)
    except (IOError, OSError) as exception:
      logging.error(u'Unable to stop task storage with error: {0!s}'.format(
          exception))

    if self._abort:
      logging.debug(u'Processing aborted.')
      self._processing_status.aborted = True
    else:
      logging.debug(u'Processing completed.')

    # Reset values.
    self._enable_sigsegv_handler = None
    self._number_of_worker_processes = None
    self._show_memory_usage = None

    self._filter_find_specs = None
    self._filter_object = None
    self._hasher_names_string = None
    self._mount_path = None
    self._parser_filter_expression = None
    self._preferred_year = None
    self._process_archives = None
    self._process_compressed_streams = None
    self._session_identifier = None
    self._status_update_callback = None
    self._storage_writer = None
    self._text_prepend = None

    return self._processing_status
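Both ProcessSources variants hinge on the same nesting: the session completion record and the storage close must happen even when processing raises, and the status update thread is stopped only after the storage writer closes so the final sync to disk shows up in the status updates. The skeleton of the inner nesting, with a stub standing in for plaso's StorageWriter:

class FakeStorageWriter(object):
  """Stub with only the StorageWriter methods used in the sketch."""

  def Open(self):
    print('storage opened')

  def WriteSessionStart(self):
    print('session start written')

  def WriteSessionCompletion(self, aborted=False):
    print('session completion written (aborted: {0!s})'.format(aborted))

  def Close(self):
    print('storage closed')

def ProcessWithSessionMarkers(storage_writer, process_callback, abort=False):
  storage_writer.Open()
  storage_writer.WriteSessionStart()
  try:
    process_callback()
  finally:
    # Runs on success, on abort and on error, so the session storage always
    # ends with a completion record and a clean close.
    storage_writer.WriteSessionCompletion(aborted=abort)
    storage_writer.Close()

ProcessWithSessionMarkers(FakeStorageWriter(), lambda: print('processing'))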