  def _StartSingleThread(
      self, pre_obj, filter_find_specs=None, include_directory_stat=True,
      parser_filter_string=None, hasher_names_string=None,
      storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF):
    """Starts everything up in a single process.

    This should not normally be used, since running the tool in a single
    process buffers everything in memory until the storage writer drains it.

    To be clear: this runs the collection to completion before invoking the
    worker that extracts all EventObjects and stores them in memory. When
    that is done, the storage function is called to drain the buffer. Hence
    the tool's excessive use of memory in this mode, and the reason it is
    not recommended except for debugging (and mostly to get into the
    debugger).

    This is therefore mostly useful during debugging sessions for some
    limited parsing.

    Args:
      pre_obj: the preprocess object (instance of PreprocessObject).
      filter_find_specs: optional list of filter find specifications (instances
                         of dfvfs.FindSpec). The default is None.
      include_directory_stat: Boolean value to indicate whether directory
                              stat information should be collected. The default
                              is True.
      parser_filter_string: optional parser filter string. The default is None.
      hasher_names_string: optional comma separated string of names of
                           hashers to enable. The default is None.
      storage_serializer_format: optional storage serializer format.
                                 The default is protobuf.
    """
    self._collector = self._engine.CreateCollector(
        include_directory_stat, vss_stores=self.vss_stores,
        filter_find_specs=filter_find_specs,
        resolver_context=self._resolver_context)

    if self._output_module:
      storage_writer = storage.BypassStorageWriter(
          self._engine.storage_queue, self._storage_file_path,
          output_module_string=self._output_module, pre_obj=pre_obj)
    else:
      storage_writer = storage.StorageFileWriter(
          self._engine.storage_queue, self._storage_file_path,
          buffer_size=self._buffer_size, pre_obj=pre_obj,
          serializer_format=storage_serializer_format)

      storage_writer.SetEnableProfiling(
          self._enable_profiling,
          profiling_type=self._profiling_type)

    try:
      self._engine.ProcessSource(
          self._collector, storage_writer,
          parser_filter_string=parser_filter_string,
          hasher_names_string=hasher_names_string)

    except KeyboardInterrupt:
      self._CleanUpAfterAbort()
      raise errors.UserAbort(u'Process source aborted.')

  def _ProcessSourceMultiProcessMode(
      self, pre_obj, filter_find_specs=None, include_directory_stat=True,
      number_of_worker_processes=0, parser_filter_string=None,
      hasher_names_string=None,
      storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF):
    """Processes the source with multiple processes.

    Args:
      pre_obj: the preprocess object (instance of PreprocessObject).
      filter_find_specs: optional list of filter find specifications (instances
                         of dfvfs.FindSpec). The default is None.
      include_directory_stat: Boolean value to indicate whether directory
                              stat information should be collected. The default
                              is True.
      number_of_worker_processes: optional number of worker processes.
                                   The default is 0, which means the number
                                   of workers is determined automatically.
      parser_filter_string: optional parser filter string. The default is None.
      hasher_names_string: optional comma separated string of names of
                           hashers to enable. The default is None.
      storage_serializer_format: optional storage serializer format.
                                 The default is protobuf.
    """
    logging.info(u'Starting extraction in multi process mode.')

    resolver_context = context.Context()

    # TODO: create multi process collector.
    self._collector = self._engine.CreateCollector(
        include_directory_stat, vss_stores=self.vss_stores,
        filter_find_specs=filter_find_specs, resolver_context=resolver_context)

    if self._output_module:
      storage_writer = storage.BypassStorageWriter(
          self._engine.storage_queue, self._storage_file_path,
          output_module_string=self._output_module, pre_obj=pre_obj)
    else:
      storage_writer = storage.StorageFileWriter(
          self._engine.storage_queue, self._storage_file_path,
          buffer_size=self._buffer_size, pre_obj=pre_obj,
          serializer_format=storage_serializer_format)

      storage_writer.SetEnableProfiling(
          self._enable_profiling,
          profiling_type=self._profiling_type)

    try:
      self._engine.ProcessSource(
          self._collector, storage_writer,
          parser_filter_string=parser_filter_string,
          hasher_names_string=hasher_names_string,
          number_of_extraction_workers=number_of_worker_processes,
          show_memory_usage=self._show_worker_memory_information)

    except KeyboardInterrupt:
      self._CleanUpAfterAbort()
      raise errors.UserAbort(u'Process source aborted.')
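
  # Hypothetical usage sketch, kept as a comment so the class body above and
  # below still reads as one block. The keyword names come from the signature
  # of _ProcessSourceMultiProcessMode above; the values and the availability
  # of a preprocess object (pre_obj) are assumptions for illustration only.
  #
  #   self._ProcessSourceMultiProcessMode(
  #       pre_obj, number_of_worker_processes=4,
  #       parser_filter_string=u'winreg,filestat',
  #       hasher_names_string=u'md5,sha256',
  #       storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF)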

    def ProcessSources(
            self,
            source_path_specs,
            source_type,
            enable_sigsegv_handler=False,
            filter_file=None,
            hasher_names_string=None,
            parser_filter_string=None,
            preferred_encoding=u'utf-8',
            single_process_mode=False,
            status_update_callback=None,
            storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF,
            timezone=pytz.UTC):
        """Processes the sources.

    Args:
      source_path_specs: list of path specifications (instances of
                         dfvfs.PathSpec) to process.
      source_type: the dfVFS source type definition.
      enable_sigsegv_handler: optional boolean value to indicate the SIGSEGV
                              handler should be enabled. The default is False.
      filter_file: optional path to a file that contains find specifications.
                   The default is None.
      hasher_names_string: optional comma separated string of names of
                           hashers to enable. The default is None.
      parser_filter_string: optional parser filter string. The default is None.
      preferred_encoding: optional preferred encoding. The default is UTF-8.
      single_process_mode: optional boolean value to indicate if the front-end
                           should run in single process mode. The default is
                           False.
      status_update_callback: optional callback function for status updates.
                              The default is None.
      storage_serializer_format: optional storage serializer format.
                                 The default is protobuf.
      timezone: optional preferred timezone. The default is UTC.

    Returns:
      The processing status (instance of ProcessingStatus) or None.

    Raises:
      SourceScannerError: if the source scanner could not find a supported
                          file system.
      UserAbort: if the user initiated an abort.
    """
        # If the source is a directory or a storage media image
        # run pre-processing.
        # TODO: move source_scanner.SourceScannerContext.SOURCE_TYPE_
        # to definitions.SOURCE_TYPE_.
        if source_type in [
                source_scanner.SourceScannerContext.SOURCE_TYPE_DIRECTORY,
                source_scanner.SourceScannerContext.SOURCE_TYPE_STORAGE_MEDIA_DEVICE,
                source_scanner.SourceScannerContext.SOURCE_TYPE_STORAGE_MEDIA_IMAGE]:
            self.SetEnablePreprocessing(True)
        else:
            self.SetEnablePreprocessing(False)

        self._CheckStorageFile(self._storage_file_path)

        self._single_process_mode = single_process_mode
        # TODO: move source_scanner.SourceScannerContext.SOURCE_TYPE_
        # to definitions.SOURCE_TYPE_.
        if source_type == source_scanner.SourceScannerContext.SOURCE_TYPE_FILE:
            # No need to multi-process a single file source.
            self._single_process_mode = True

        if self._single_process_mode:
            self._engine = single_process.SingleProcessEngine(self._queue_size)
        else:
            self._engine = multi_process.MultiProcessEngine(
                maximum_number_of_queued_items=self._queue_size)

        self._engine.SetEnableDebugOutput(self._debug_mode)
        self._engine.SetEnableProfiling(
            self._enable_profiling,
            profiling_sample_rate=self._profiling_sample_rate,
            profiling_type=self._profiling_type)

        pre_obj = self._PreprocessSource(source_path_specs, source_type)

        self._operating_system = getattr(pre_obj, u'guessed_os', None)

        if not parser_filter_string:
            guessed_os = self._operating_system
            os_version = getattr(pre_obj, u'osversion', u'')
            parser_filter_string = self._GetParserFilterPreset(
                os_guess=guessed_os, os_version=os_version)

            if parser_filter_string:
                logging.info(
                    u'Parser filter expression changed to: {0:s}'.format(
                        parser_filter_string))

        self._parser_names = []
        for _, parser_class in parsers_manager.ParsersManager.GetParsers(
                parser_filter_string=parser_filter_string):
            self._parser_names.append(parser_class.NAME)

        if u'filestat' in self._parser_names:
            include_directory_stat = True
        else:
            include_directory_stat = False

        self._hasher_names = []
        hasher_manager = hashers_manager.HashersManager
        for hasher_name in hasher_manager.GetHasherNamesFromString(
                hasher_names_string=hasher_names_string):
            self._hasher_names.append(hasher_name)

        self._PreprocessSetTimezone(pre_obj, timezone=timezone)

        if filter_file:
            filter_find_specs = engine_utils.BuildFindSpecsFromFile(
                filter_file, pre_obj=pre_obj)
        else:
            filter_find_specs = None

        self._PreprocessSetCollectionInformation(
            pre_obj,
            source_type,
            self._engine,
            filter_file=filter_file,
            parser_filter_string=parser_filter_string,
            preferred_encoding=preferred_encoding)

        if self._output_module:
            storage_writer = storage.BypassStorageWriter(
                self._engine.event_object_queue,
                self._storage_file_path,
                output_module_string=self._output_module,
                pre_obj=pre_obj)
        else:
            storage_writer = storage.FileStorageWriter(
                self._engine.event_object_queue,
                self._storage_file_path,
                buffer_size=self._buffer_size,
                pre_obj=pre_obj,
                serializer_format=storage_serializer_format)

            storage_writer.SetEnableProfiling(
                self._enable_profiling, profiling_type=self._profiling_type)

        processing_status = None
        try:
            if self._single_process_mode:
                logging.debug(u'Starting extraction in single process mode.')

                processing_status = self._engine.ProcessSources(
                    source_path_specs,
                    storage_writer,
                    filter_find_specs=filter_find_specs,
                    filter_object=self._filter_object,
                    hasher_names_string=hasher_names_string,
                    include_directory_stat=include_directory_stat,
                    mount_path=self._mount_path,
                    parser_filter_string=parser_filter_string,
                    process_archive_files=self._process_archive_files,
                    resolver_context=self._resolver_context,
                    status_update_callback=status_update_callback,
                    text_prepend=self._text_prepend)

            else:
                logging.debug(u'Starting extraction in multi process mode.')

                # TODO: pass number_of_extraction_workers.
                processing_status = self._engine.ProcessSources(
                    source_path_specs,
                    storage_writer,
                    enable_sigsegv_handler=enable_sigsegv_handler,
                    filter_find_specs=filter_find_specs,
                    filter_object=self._filter_object,
                    hasher_names_string=hasher_names_string,
                    include_directory_stat=include_directory_stat,
                    mount_path=self._mount_path,
                    parser_filter_string=parser_filter_string,
                    process_archive_files=self._process_archive_files,
                    status_update_callback=status_update_callback,
                    show_memory_usage=self._show_worker_memory_information,
                    text_prepend=self._text_prepend)

        except KeyboardInterrupt:
            self._CleanUpAfterAbort()
            raise errors.UserAbort(u'Process source aborted.')

        # TODO: check if this still works and if still needed.
        except Exception as exception:
            if not self._single_process_mode:
                raise

            # The tool should generally not be run in single process mode
            # for reasons other than debugging. Hence the general error
            # catching.
            logging.error(
                u'An uncaught exception occurred: {0:s}.\n{1:s}'.format(
                    exception, traceback.format_exc()))
            if self._debug_mode:
                pdb.post_mortem()

        return processing_status
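
A minimal driver sketch for the ProcessSources API shown above. Only the call
signature comes from the code; the front-end instance (extraction_frontend),
the source location and the parser/hasher selections are assumptions for
illustration.

import pytz

# dfvfs imports as assumed to be used by the surrounding module.
from dfvfs.helpers import source_scanner
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.path import factory as path_spec_factory

# Path specification for a local directory (assumed location).
path_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_OS, location=u'/cases/export')

# extraction_frontend is assumed to be a configured instance of the
# front-end class these methods belong to.
processing_status = extraction_frontend.ProcessSources(
    [path_spec],
    source_scanner.SourceScannerContext.SOURCE_TYPE_DIRECTORY,
    single_process_mode=True,
    parser_filter_string=u'winreg,filestat',
    hasher_names_string=u'md5,sha256',
    timezone=pytz.UTC)
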
Example #4
  def _StartSingleThread(self, options):
    """Starts everything up in a single process.

    This should not normally be used, since running the tool in a single
    process buffers everything in memory until the storage writer drains it.

    To be clear: this runs the collection to completion before invoking the
    worker that extracts all EventObjects and stores them in memory. When
    that is done, the storage function is called to drain the buffer. Hence
    the tool's excessive use of memory in this mode, and the reason it is
    not recommended except for debugging (and mostly to get into the
    debugger).

    This is therefore mostly useful during debugging sessions for some
    limited parsing.

    Args:
      options: the command line arguments (instance of argparse.Namespace).
    """
    self._engine = single_process.SingleProcessEngine(self._queue_size)
    self._engine.SetEnableDebugOutput(self._debug_mode)
    self._engine.SetEnableProfiling(
        self._enable_profiling,
        profiling_sample_rate=self._profiling_sample_rate)
    self._engine.SetProcessArchiveFiles(self._process_archive_files)

    if self._filter_object:
      self._engine.SetFilterObject(self._filter_object)

    if self._mount_path:
      self._engine.SetMountPath(self._mount_path)

    if self._text_prepend:
      self._engine.SetTextPrepend(self._text_prepend)

    # TODO: add support to handle multiple partitions.
    self._engine.SetSource(
        self.GetSourcePathSpec(), resolver_context=self._resolver_context)

    logging.debug(u'Starting preprocessing.')
    pre_obj = self.PreprocessSource(options)

    logging.debug(u'Preprocessing done.')

    # TODO: make sure parsers option is not set by preprocessing.
    parser_filter_string = getattr(options, 'parsers', '')

    self._parser_names = []
    for _, parser_class in parsers_manager.ParsersManager.GetParsers(
        parser_filter_string=parser_filter_string):
      self._parser_names.append(parser_class.NAME)

    self._PreprocessSetCollectionInformation(options, pre_obj)

    if 'filestat' in self._parser_names:
      include_directory_stat = True
    else:
      include_directory_stat = False

    filter_file = getattr(options, 'file_filter', None)
    if filter_file:
      filter_find_specs = engine_utils.BuildFindSpecsFromFile(
          filter_file, pre_obj=pre_obj)
    else:
      filter_find_specs = None

    self._collector = self._engine.CreateCollector(
        include_directory_stat, vss_stores=self._vss_stores,
        filter_find_specs=filter_find_specs,
        resolver_context=self._resolver_context)

    self._DebugPrintCollector(options)

    if self._output_module:
      storage_writer = storage.BypassStorageWriter(
          self._engine.storage_queue, self._storage_file_path,
          output_module_string=self._output_module, pre_obj=pre_obj)
    else:
      storage_writer = storage.StorageFileWriter(
          self._engine.storage_queue, self._storage_file_path,
          buffer_size=self._buffer_size, pre_obj=pre_obj,
          serializer_format=self._storage_serializer_format)

    hasher_names_string = getattr(options, u'hashers', u'')

    try:
      self._engine.ProcessSource(
          self._collector, storage_writer,
          parser_filter_string=parser_filter_string,
          hasher_names_string=hasher_names_string)

    except KeyboardInterrupt:
      self._CleanUpAfterAbort()
      raise errors.UserAbort(u'Process source aborted.')

    finally:
      self._resolver_context.Empty()
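
This variant is driven by an argparse.Namespace rather than explicit keyword
arguments. Below is a small, hypothetical sketch of the options object it
reads; the attribute names ('parsers', 'file_filter', 'hashers') come from
the getattr() calls above, the values are illustrative only.

import argparse

options = argparse.Namespace(
    parsers=u'winreg,filestat',  # parser filter expression (illustrative)
    file_filter=None,            # optional path to a file with find specs
    hashers=u'sha256')           # comma separated hasher names

# A configured front-end instance is assumed to exist:
# frontend._StartSingleThread(options)
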
Example #5
  def _ProcessSourceMultiProcessMode(self, options):
    """Processes the source in a multiple process.

    Multiprocessing is used to start up separate processes.

    Args:
      options: the command line arguments (instance of argparse.Namespace).
    """
    # TODO: replace by an option.
    start_collection_process = True

    self._number_of_worker_processes = getattr(options, 'workers', 0)

    logging.info(u'Starting extraction in multi process mode.')

    self._engine = multi_process.MultiProcessEngine(
        maximum_number_of_queued_items=self._queue_size)

    self._engine.SetEnableDebugOutput(self._debug_mode)
    self._engine.SetEnableProfiling(
        self._enable_profiling,
        profiling_sample_rate=self._profiling_sample_rate)
    self._engine.SetProcessArchiveFiles(self._process_archive_files)

    if self._filter_object:
      self._engine.SetFilterObject(self._filter_object)

    if self._mount_path:
      self._engine.SetMountPath(self._mount_path)

    if self._text_prepend:
      self._engine.SetTextPrepend(self._text_prepend)

    # TODO: add support to handle multiple partitions.
    self._engine.SetSource(
        self.GetSourcePathSpec(), resolver_context=self._resolver_context)

    logging.debug(u'Starting preprocessing.')
    pre_obj = self.PreprocessSource(options)
    logging.debug(u'Preprocessing done.')

    # TODO: make sure parsers option is not set by preprocessing.
    parser_filter_string = getattr(options, 'parsers', '')

    self._parser_names = []
    for _, parser_class in parsers_manager.ParsersManager.GetParsers(
        parser_filter_string=parser_filter_string):
      self._parser_names.append(parser_class.NAME)

    hasher_names_string = getattr(options, u'hashers', u'')

    self._hasher_names = []
    hasher_manager = hashers_manager.HashersManager
    for hasher_name in hasher_manager.GetHasherNamesFromString(
        hasher_names_string=hasher_names_string):
      self._hasher_names.append(hasher_name)

    self._PreprocessSetCollectionInformation(options, pre_obj)

    if 'filestat' in self._parser_names:
      include_directory_stat = True
    else:
      include_directory_stat = False

    filter_file = getattr(options, 'file_filter', None)
    if filter_file:
      filter_find_specs = engine_utils.BuildFindSpecsFromFile(
          filter_file, pre_obj=pre_obj)
    else:
      filter_find_specs = None

    if start_collection_process:
      resolver_context = context.Context()
    else:
      resolver_context = self._resolver_context

    # TODO: create multi process collector.
    self._collector = self._engine.CreateCollector(
        include_directory_stat, vss_stores=self._vss_stores,
        filter_find_specs=filter_find_specs, resolver_context=resolver_context)

    self._DebugPrintCollector(options)

    if self._output_module:
      storage_writer = storage.BypassStorageWriter(
          self._engine.storage_queue, self._storage_file_path,
          output_module_string=self._output_module, pre_obj=pre_obj)
    else:
      storage_writer = storage.StorageFileWriter(
          self._engine.storage_queue, self._storage_file_path,
          buffer_size=self._buffer_size, pre_obj=pre_obj,
          serializer_format=self._storage_serializer_format)

    try:
      self._engine.ProcessSource(
          self._collector, storage_writer,
          parser_filter_string=parser_filter_string,
          hasher_names_string=hasher_names_string,
          number_of_extraction_workers=self._number_of_worker_processes,
          have_collection_process=start_collection_process,
          have_foreman_process=self._run_foreman,
          show_memory_usage=self._show_worker_memory_information)

    except KeyboardInterrupt:
      self._CleanUpAfterAbort()
      raise errors.UserAbort(u'Process source aborted.')
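
The 'workers' option above defaults to 0, which the engine treats as
"determine automatically". Below is a hedged sketch of what such a heuristic
commonly looks like; the actual logic lives inside MultiProcessEngine and may
differ.

import multiprocessing

number_of_worker_processes = 0  # value read from the 'workers' option

if number_of_worker_processes <= 0:
  # Leave one core for the collection/storage side of the pipeline
  # (illustrative heuristic only, not the engine's actual behaviour).
  number_of_worker_processes = max(1, multiprocessing.cpu_count() - 1)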