def _StartSingleThread(
    self, pre_obj, filter_find_specs=None, include_directory_stat=True,
    parser_filter_string=None, hasher_names_string=None,
    storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF):
  """Starts everything up in a single process.

  This should not normally be used, since running the tool in a single
  process buffers up everything into memory until the storage is called.

  Just to make it clear, this starts up the collection and completes it
  before calling the worker that extracts all EventObjects and stores
  them in memory. When that is all done, the storage function is called
  to drain the buffer. Hence the tool's excessive use of memory in this
  mode and the reason why it is not suggested to be used except for
  debugging reasons (and mostly to get into the debugger).

  This is therefore mostly useful during debugging sessions for some
  limited parsing.

  Args:
    pre_obj: the preprocess object (instance of PreprocessObject).
    filter_find_specs: optional list of filter find specifications
                       (instances of dfvfs.FindSpec). The default is None.
    include_directory_stat: Boolean value to indicate whether directory
                            stat information should be collected. The
                            default is True.
    parser_filter_string: optional parser filter string. The default is
                          None.
    hasher_names_string: optional comma separated string of names of
                         hashers to enable. The default is None.
    storage_serializer_format: optional storage serializer format.
                               The default is protobuf.
  """
  self._collector = self._engine.CreateCollector(
      include_directory_stat, vss_stores=self.vss_stores,
      filter_find_specs=filter_find_specs,
      resolver_context=self._resolver_context)

  if self._output_module:
    storage_writer = storage.BypassStorageWriter(
        self._engine.storage_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)
  else:
    storage_writer = storage.StorageFileWriter(
        self._engine.storage_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=storage_serializer_format)

  storage_writer.SetEnableProfiling(
      self._enable_profiling, profiling_type=self._profiling_type)

  try:
    self._engine.ProcessSource(
        self._collector, storage_writer,
        parser_filter_string=parser_filter_string,
        hasher_names_string=hasher_names_string)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort(u'Process source aborted.')

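# Illustrative usage only, not part of the production code path: a caller
# inside this front-end class might build filter_find_specs from a filter
# file (the same way the other entry points in this module do) and hand
# them to _StartSingleThread. The filter_file value and the parser and
# hasher strings below are placeholders.
#
#   filter_find_specs = None
#   if filter_file:
#     filter_find_specs = engine_utils.BuildFindSpecsFromFile(
#         filter_file, pre_obj=pre_obj)
#
#   self._StartSingleThread(
#       pre_obj, filter_find_specs=filter_find_specs,
#       parser_filter_string=u'filestat',
#       hasher_names_string=u'md5')
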
def _ProcessSourceMultiProcessMode(
    self, pre_obj, filter_find_specs=None, include_directory_stat=True,
    number_of_worker_processes=0, parser_filter_string=None,
    hasher_names_string=None,
    storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF):
  """Processes the source with multiple processes.

  Args:
    pre_obj: the preprocess object (instance of PreprocessObject).
    filter_find_specs: optional list of filter find specifications
                       (instances of dfvfs.FindSpec). The default is None.
    include_directory_stat: Boolean value to indicate whether directory
                            stat information should be collected. The
                            default is True.
    number_of_worker_processes: optional number of worker processes.
                                The default is 0, which means the number
                                is determined automatically.
    parser_filter_string: optional parser filter string. The default is
                          None.
    hasher_names_string: optional comma separated string of names of
                         hashers to enable. The default is None.
    storage_serializer_format: optional storage serializer format.
                               The default is protobuf.
  """
  logging.info(u'Starting extraction in multi process mode.')

  resolver_context = context.Context()

  # TODO: create multi process collector.
  self._collector = self._engine.CreateCollector(
      include_directory_stat, vss_stores=self.vss_stores,
      filter_find_specs=filter_find_specs,
      resolver_context=resolver_context)

  if self._output_module:
    storage_writer = storage.BypassStorageWriter(
        self._engine.storage_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)
  else:
    storage_writer = storage.StorageFileWriter(
        self._engine.storage_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=storage_serializer_format)

  storage_writer.SetEnableProfiling(
      self._enable_profiling, profiling_type=self._profiling_type)

  try:
    self._engine.ProcessSource(
        self._collector, storage_writer,
        parser_filter_string=parser_filter_string,
        hasher_names_string=hasher_names_string,
        number_of_extraction_workers=number_of_worker_processes,
        show_memory_usage=self._show_worker_memory_information)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort(u'Process source aborted.')

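# Illustrative call, with placeholder values: passing a non-zero
# number_of_worker_processes overrides the default of 0, which lets the
# engine determine the worker count automatically.
#
#   self._ProcessSourceMultiProcessMode(
#       pre_obj, number_of_worker_processes=4,
#       hasher_names_string=u'sha256')
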
def ProcessSources(
    self, source_path_specs, source_type, enable_sigsegv_handler=False,
    filter_file=None, hasher_names_string=None, parser_filter_string=None,
    preferred_encoding=u'utf-8', single_process_mode=False,
    status_update_callback=None,
    storage_serializer_format=definitions.SERIALIZER_FORMAT_PROTOBUF,
    timezone=pytz.UTC):
  """Processes the sources.

  Args:
    source_path_specs: list of path specifications (instances of
                       dfvfs.PathSpec) to process.
    source_type: the dfVFS source type definition.
    enable_sigsegv_handler: optional boolean value to indicate the SIGSEGV
                            handler should be enabled. The default is
                            False.
    filter_file: optional path to a file that contains find
                 specifications. The default is None.
    hasher_names_string: optional comma separated string of names of
                         hashers to enable. The default is None.
    parser_filter_string: optional parser filter string. The default is
                          None.
    preferred_encoding: optional preferred encoding. The default is UTF-8.
    single_process_mode: optional boolean value to indicate if the
                         front-end should run in single process mode.
                         The default is False.
    status_update_callback: optional callback function for status updates.
                            The default is None.
    storage_serializer_format: optional storage serializer format.
                               The default is protobuf.
    timezone: optional preferred timezone. The default is UTC.

  Returns:
    The processing status (instance of ProcessingStatus) or None.

  Raises:
    SourceScannerError: if the source scanner could not find a supported
                        file system.
    UserAbort: if the user initiated an abort.
  """
  # If the source is a directory or a storage media image
  # run pre-processing.
  # TODO: move source_scanner.SourceScannerContext.SOURCE_TYPE_
  # to definitions.SOURCE_TYPE_.
  if source_type in [
      source_scanner.SourceScannerContext.SOURCE_TYPE_DIRECTORY,
      source_scanner.SourceScannerContext.SOURCE_TYPE_STORAGE_MEDIA_DEVICE,
      source_scanner.SourceScannerContext.SOURCE_TYPE_STORAGE_MEDIA_IMAGE]:
    self.SetEnablePreprocessing(True)
  else:
    self.SetEnablePreprocessing(False)

  self._CheckStorageFile(self._storage_file_path)

  self._single_process_mode = single_process_mode
  # TODO: move source_scanner.SourceScannerContext.SOURCE_TYPE_
  # to definitions.SOURCE_TYPE_.
  if source_type == source_scanner.SourceScannerContext.SOURCE_TYPE_FILE:
    # No need to multi process a single file source.
    self._single_process_mode = True

  if self._single_process_mode:
    self._engine = single_process.SingleProcessEngine(self._queue_size)
  else:
    self._engine = multi_process.MultiProcessEngine(
        maximum_number_of_queued_items=self._queue_size)

  self._engine.SetEnableDebugOutput(self._debug_mode)
  self._engine.SetEnableProfiling(
      self._enable_profiling,
      profiling_sample_rate=self._profiling_sample_rate,
      profiling_type=self._profiling_type)

  pre_obj = self._PreprocessSource(source_path_specs, source_type)

  self._operating_system = getattr(pre_obj, u'guessed_os', None)

  if not parser_filter_string:
    guessed_os = self._operating_system
    os_version = getattr(pre_obj, u'osversion', u'')
    parser_filter_string = self._GetParserFilterPreset(
        os_guess=guessed_os, os_version=os_version)

    if parser_filter_string:
      logging.info(
          u'Parser filter expression changed to: {0:s}'.format(
              parser_filter_string))

  self._parser_names = []
  for _, parser_class in parsers_manager.ParsersManager.GetParsers(
      parser_filter_string=parser_filter_string):
    self._parser_names.append(parser_class.NAME)

  if u'filestat' in self._parser_names:
    include_directory_stat = True
  else:
    include_directory_stat = False

  self._hasher_names = []
  hasher_manager = hashers_manager.HashersManager
  for hasher_name in hasher_manager.GetHasherNamesFromString(
      hasher_names_string=hasher_names_string):
    self._hasher_names.append(hasher_name)

  self._PreprocessSetTimezone(pre_obj, timezone=timezone)

  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)
  else:
    filter_find_specs = None

  self._PreprocessSetCollectionInformation(
      pre_obj, source_type, self._engine, filter_file=filter_file,
      parser_filter_string=parser_filter_string,
      preferred_encoding=preferred_encoding)

  if self._output_module:
    storage_writer = storage.BypassStorageWriter(
        self._engine.event_object_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)
  else:
    storage_writer = storage.FileStorageWriter(
        self._engine.event_object_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=storage_serializer_format)

  storage_writer.SetEnableProfiling(
      self._enable_profiling, profiling_type=self._profiling_type)

  processing_status = None
  try:
    if self._single_process_mode:
      logging.debug(u'Starting extraction in single process mode.')

      processing_status = self._engine.ProcessSources(
          source_path_specs, storage_writer,
          filter_find_specs=filter_find_specs,
          filter_object=self._filter_object,
          hasher_names_string=hasher_names_string,
          include_directory_stat=include_directory_stat,
          mount_path=self._mount_path,
          parser_filter_string=parser_filter_string,
          process_archive_files=self._process_archive_files,
          resolver_context=self._resolver_context,
          status_update_callback=status_update_callback,
          text_prepend=self._text_prepend)

    else:
      logging.debug(u'Starting extraction in multi process mode.')

      # TODO: pass number_of_extraction_workers.
      processing_status = self._engine.ProcessSources(
          source_path_specs, storage_writer,
          enable_sigsegv_handler=enable_sigsegv_handler,
          filter_find_specs=filter_find_specs,
          filter_object=self._filter_object,
          hasher_names_string=hasher_names_string,
          include_directory_stat=include_directory_stat,
          mount_path=self._mount_path,
          parser_filter_string=parser_filter_string,
          process_archive_files=self._process_archive_files,
          status_update_callback=status_update_callback,
          show_memory_usage=self._show_worker_memory_information,
          text_prepend=self._text_prepend)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort

  # TODO: check if this still works and if still needed.
  except Exception as exception:
    if not self._single_process_mode:
      raise

    # The tool should generally not be run in single process mode
    # for other reasons than to debug. Hence the general error
    # catching.
    logging.error(
        u'An uncaught exception occurred: {0:s}.\n{1:s}'.format(
            exception, traceback.format_exc()))
    if self._debug_mode:
      pdb.post_mortem()

  return processing_status

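# Illustrative caller sketch, with assumptions: the tool has already scanned
# the source into source_path_specs and source_type, front_end stands for an
# instance of this front-end class, and the callback name is hypothetical.
# ProcessSources returns the ProcessingStatus instance, or None if
# processing did not complete.
#
#   def _PrintStatusUpdate(processing_status):
#     # Inspect the ProcessingStatus instance, e.g. for progress output.
#     pass
#
#   processing_status = front_end.ProcessSources(
#       source_path_specs, source_type,
#       parser_filter_string=u'filestat',
#       status_update_callback=_PrintStatusUpdate,
#       timezone=pytz.UTC)
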
def _StartSingleThread(self, options):
  """Starts everything up in a single process.

  This should not normally be used, since running the tool in a single
  process buffers up everything into memory until the storage is called.

  Just to make it clear, this starts up the collection and completes it
  before calling the worker that extracts all EventObjects and stores
  them in memory. When that is all done, the storage function is called
  to drain the buffer. Hence the tool's excessive use of memory in this
  mode and the reason why it is not suggested to be used except for
  debugging reasons (and mostly to get into the debugger).

  This is therefore mostly useful during debugging sessions for some
  limited parsing.

  Args:
    options: the command line arguments (instance of argparse.Namespace).
  """
  self._engine = single_process.SingleProcessEngine(self._queue_size)
  self._engine.SetEnableDebugOutput(self._debug_mode)
  self._engine.SetEnableProfiling(
      self._enable_profiling,
      profiling_sample_rate=self._profiling_sample_rate)
  self._engine.SetProcessArchiveFiles(self._process_archive_files)

  if self._filter_object:
    self._engine.SetFilterObject(self._filter_object)

  if self._mount_path:
    self._engine.SetMountPath(self._mount_path)

  if self._text_prepend:
    self._engine.SetTextPrepend(self._text_prepend)

  # TODO: add support to handle multiple partitions.
  self._engine.SetSource(
      self.GetSourcePathSpec(), resolver_context=self._resolver_context)

  logging.debug(u'Starting preprocessing.')
  pre_obj = self.PreprocessSource(options)
  logging.debug(u'Preprocessing done.')

  # TODO: make sure parsers option is not set by preprocessing.
  parser_filter_string = getattr(options, 'parsers', '')

  self._parser_names = []
  for _, parser_class in parsers_manager.ParsersManager.GetParsers(
      parser_filter_string=parser_filter_string):
    self._parser_names.append(parser_class.NAME)

  self._PreprocessSetCollectionInformation(options, pre_obj)

  if 'filestat' in self._parser_names:
    include_directory_stat = True
  else:
    include_directory_stat = False

  filter_file = getattr(options, 'file_filter', None)
  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)
  else:
    filter_find_specs = None

  self._collector = self._engine.CreateCollector(
      include_directory_stat, vss_stores=self._vss_stores,
      filter_find_specs=filter_find_specs,
      resolver_context=self._resolver_context)

  self._DebugPrintCollector(options)

  if self._output_module:
    storage_writer = storage.BypassStorageWriter(
        self._engine.storage_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)
  else:
    storage_writer = storage.StorageFileWriter(
        self._engine.storage_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=self._storage_serializer_format)

  hasher_names_string = getattr(options, u'hashers', u'')

  try:
    self._engine.ProcessSource(
        self._collector, storage_writer,
        parser_filter_string=parser_filter_string,
        hasher_names_string=hasher_names_string)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort(u'Process source aborted.')

  finally:
    self._resolver_context.Empty()

def _ProcessSourceMultiProcessMode(self, options):
  """Processes the source using multiple processes.

  Multiprocessing is used to start up separate processes.

  Args:
    options: the command line arguments (instance of argparse.Namespace).
  """
  # TODO: replace by an option.
  start_collection_process = True

  self._number_of_worker_processes = getattr(options, 'workers', 0)

  logging.info(u'Starting extraction in multi process mode.')

  self._engine = multi_process.MultiProcessEngine(
      maximum_number_of_queued_items=self._queue_size)

  self._engine.SetEnableDebugOutput(self._debug_mode)
  self._engine.SetEnableProfiling(
      self._enable_profiling,
      profiling_sample_rate=self._profiling_sample_rate)
  self._engine.SetProcessArchiveFiles(self._process_archive_files)

  if self._filter_object:
    self._engine.SetFilterObject(self._filter_object)

  if self._mount_path:
    self._engine.SetMountPath(self._mount_path)

  if self._text_prepend:
    self._engine.SetTextPrepend(self._text_prepend)

  # TODO: add support to handle multiple partitions.
  self._engine.SetSource(
      self.GetSourcePathSpec(), resolver_context=self._resolver_context)

  logging.debug(u'Starting preprocessing.')
  pre_obj = self.PreprocessSource(options)
  logging.debug(u'Preprocessing done.')

  # TODO: make sure parsers option is not set by preprocessing.
  parser_filter_string = getattr(options, 'parsers', '')

  self._parser_names = []
  for _, parser_class in parsers_manager.ParsersManager.GetParsers(
      parser_filter_string=parser_filter_string):
    self._parser_names.append(parser_class.NAME)

  hasher_names_string = getattr(options, u'hashers', u'')

  self._hasher_names = []
  hasher_manager = hashers_manager.HashersManager
  for hasher_name in hasher_manager.GetHasherNamesFromString(
      hasher_names_string=hasher_names_string):
    self._hasher_names.append(hasher_name)

  self._PreprocessSetCollectionInformation(options, pre_obj)

  if 'filestat' in self._parser_names:
    include_directory_stat = True
  else:
    include_directory_stat = False

  filter_file = getattr(options, 'file_filter', None)
  if filter_file:
    filter_find_specs = engine_utils.BuildFindSpecsFromFile(
        filter_file, pre_obj=pre_obj)
  else:
    filter_find_specs = None

  if start_collection_process:
    resolver_context = context.Context()
  else:
    resolver_context = self._resolver_context

  # TODO: create multi process collector.
  self._collector = self._engine.CreateCollector(
      include_directory_stat, vss_stores=self._vss_stores,
      filter_find_specs=filter_find_specs,
      resolver_context=resolver_context)

  self._DebugPrintCollector(options)

  if self._output_module:
    storage_writer = storage.BypassStorageWriter(
        self._engine.storage_queue, self._storage_file_path,
        output_module_string=self._output_module, pre_obj=pre_obj)
  else:
    storage_writer = storage.StorageFileWriter(
        self._engine.storage_queue, self._storage_file_path,
        buffer_size=self._buffer_size, pre_obj=pre_obj,
        serializer_format=self._storage_serializer_format)

  try:
    self._engine.ProcessSource(
        self._collector, storage_writer,
        parser_filter_string=parser_filter_string,
        hasher_names_string=hasher_names_string,
        number_of_extraction_workers=self._number_of_worker_processes,
        have_collection_process=start_collection_process,
        have_foreman_process=self._run_foreman,
        show_memory_usage=self._show_worker_memory_information)

  except KeyboardInterrupt:
    self._CleanUpAfterAbort()
    raise errors.UserAbort(u'Process source aborted.')

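# Illustrative sketch of the argparse.Namespace attributes the two legacy
# options-based methods above read via getattr; the concrete values are
# placeholders.
#
#   options = argparse.Namespace(
#       workers=0,          # default used when the option is absent.
#       parsers=u'',        # empty string: no parser filter.
#       hashers=u'md5',     # comma separated hasher names.
#       file_filter=None)   # optional path to a file with find specifications.
#
#   self._ProcessSourceMultiProcessMode(options)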