def _ProcessArchiveTypes(self, mediator, path_spec, type_indicators):
  """Processes a data stream containing archive types such as TAR or ZIP.

  Args:
    mediator (ParserMediator): mediates the interactions between parsers
        and other components, such as storage and abort signals.
    path_spec (dfvfs.PathSpec): path specification.
    type_indicators (list[str]): dfVFS archive type indicators found in
        the data stream.
  """
  number_of_type_indicators = len(type_indicators)
  if number_of_type_indicators == 0:
    return

  self.processing_status = definitions.PROCESSING_STATUS_COLLECTING

  if number_of_type_indicators > 1:
    display_name = mediator.GetDisplayName()
    logging.debug((
        u'Found multiple format type indicators: {0:s} for '
        u'archive file: {1:s}').format(type_indicators, display_name))

  for type_indicator in type_indicators:
    if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TAR:
      archive_path_spec = path_spec_factory.Factory.NewPathSpec(
          dfvfs_definitions.TYPE_INDICATOR_TAR, location=u'/',
          parent=path_spec)

    elif type_indicator == dfvfs_definitions.TYPE_INDICATOR_ZIP:
      archive_path_spec = path_spec_factory.Factory.NewPathSpec(
          dfvfs_definitions.TYPE_INDICATOR_ZIP, location=u'/',
          parent=path_spec)

    else:
      archive_path_spec = None

      error_message = (
          u'unsupported archive format type indicator: {0:s}').format(
              type_indicator)
      mediator.ProduceExtractionError(error_message, path_spec=path_spec)

    if archive_path_spec:
      try:
        path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
            [archive_path_spec], resolver_context=mediator.resolver_context)

        for path_spec in path_spec_generator:
          if self._abort:
            break

          event_source = event_sources.FileEntryEventSource(
              path_spec=path_spec)
          event_source.file_entry_type = (
              dfvfs_definitions.FILE_ENTRY_TYPE_FILE)
          mediator.ProduceEventSource(event_source)

          self.last_activity_timestamp = time.time()

      except (IOError, errors.MaximumRecursionDepth) as exception:
        error_message = (
            u'unable to process archive file with error: {0:s}').format(
                exception)
        mediator.ProduceExtractionError(error_message, path_spec=path_spec)

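# A minimal usage sketch, not part of the method above: the type_indicators
# argument is typically produced by dfVFS itself, which can scan a data
# stream for known archive signatures. Analyzer.GetArchiveTypeIndicators is
# public dfVFS API; the mediator, path_spec and self variables are assumed
# to be in scope as in the method above.
#
#   from dfvfs.analyzer import analyzer
#
#   type_indicators = analyzer.Analyzer.GetArchiveTypeIndicators(
#       path_spec, resolver_context=mediator.resolver_context)
#   # An empty list means the data stream is not a recognized archive, in
#   # which case _ProcessArchiveTypes returns without doing anything.
#   self._ProcessArchiveTypes(mediator, path_spec, type_indicators)
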
def _ProcessSources(self, source_path_specs, storage_writer):
  """Processes the sources.

  Args:
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
  """
  if self._processing_profiler:
    self._processing_profiler.StartTiming('process_sources')

  self._status = definitions.STATUS_INDICATOR_COLLECTING

  self._number_of_consumed_event_tags = 0
  self._number_of_consumed_events = 0
  self._number_of_consumed_extraction_warnings = 0
  self._number_of_consumed_reports = 0
  self._number_of_consumed_sources = 0
  self._number_of_produced_event_tags = 0
  self._number_of_produced_events = 0
  self._number_of_produced_extraction_warnings = 0
  self._number_of_produced_reports = 0
  self._number_of_produced_sources = 0

  find_specs = None
  if self.collection_filters_helper:
    find_specs = (
        self.collection_filters_helper.included_file_system_find_specs)

  path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
      source_path_specs, find_specs=find_specs, recurse_file_system=False,
      resolver_context=self._resolver_context)

  for path_spec in path_spec_generator:
    if self._abort:
      break

    # TODO: determine if event sources should be DataStream or FileEntry
    # or both.
    event_source = event_sources.FileEntryEventSource(path_spec=path_spec)
    storage_writer.AddAttributeContainer(event_source)

    self._number_of_produced_sources = storage_writer.number_of_event_sources

    # Update the foreman process status in case we are using a filter file.
    self._UpdateForemanProcessStatus()

    if self._status_update_callback:
      self._status_update_callback(self._processing_status)

  self._ScheduleTasks(storage_writer, self._session.identifier)

  if self._abort:
    self._status = definitions.STATUS_INDICATOR_ABORTED
  else:
    self._status = definitions.STATUS_INDICATOR_COMPLETED

  self._number_of_produced_events = storage_writer.number_of_events
  self._number_of_produced_extraction_warnings = (
      storage_writer.number_of_extraction_warnings)
  self._number_of_produced_sources = storage_writer.number_of_event_sources

  if self._processing_profiler:
    self._processing_profiler.StopTiming('process_sources')

  # Update the foreman process and task status in case we are using
  # a filter file.
  self._UpdateForemanProcessStatus()

  tasks_status = self._task_manager.GetStatusInformation()
  if self._task_queue_profiler:
    self._task_queue_profiler.Sample(tasks_status)

  self._processing_status.UpdateTasksStatus(tasks_status)

  if self._status_update_callback:
    self._status_update_callback(self._processing_status)

def _ProcessSources(
    self, source_path_specs, storage_writer, filter_find_specs=None):
  """Processes the sources.

  Args:
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    filter_find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
        used in path specification extraction. If set, path specifications
        that match the find specification will be processed.
  """
  if self._processing_profiler:
    self._processing_profiler.StartTiming('process_sources')

  self._status = definitions.PROCESSING_STATUS_COLLECTING

  self._number_of_consumed_errors = 0
  self._number_of_consumed_event_tags = 0
  self._number_of_consumed_events = 0
  self._number_of_consumed_reports = 0
  self._number_of_consumed_sources = 0
  self._number_of_produced_errors = 0
  self._number_of_produced_event_tags = 0
  self._number_of_produced_events = 0
  self._number_of_produced_reports = 0
  self._number_of_produced_sources = 0

  path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
      source_path_specs, find_specs=filter_find_specs,
      recurse_file_system=False, resolver_context=self._resolver_context)

  for path_spec in path_spec_generator:
    if self._abort:
      break

    # TODO: determine if event sources should be DataStream or FileEntry
    # or both.
    event_source = event_sources.FileEntryEventSource(path_spec=path_spec)
    storage_writer.AddEventSource(event_source)

    self._number_of_produced_sources = storage_writer.number_of_event_sources

    # Update the foreman process status in case we are using a filter file.
    self._UpdateForemanProcessStatus()

    if self._status_update_callback:
      self._status_update_callback(self._processing_status)

  self._ScheduleTasks(storage_writer)

  if self._abort:
    self._status = definitions.PROCESSING_STATUS_ABORTED
  else:
    self._status = definitions.PROCESSING_STATUS_COMPLETED

  self._number_of_produced_errors = storage_writer.number_of_errors
  self._number_of_produced_events = storage_writer.number_of_events
  self._number_of_produced_sources = storage_writer.number_of_event_sources

  if self._processing_profiler:
    self._processing_profiler.StopTiming('process_sources')

  # Update the foreman process and task status in case we are using
  # a filter file.
  self._UpdateForemanProcessStatus()

  tasks_status = self._task_manager.GetStatusInformation()
  if self._task_queue_profiler:
    self._task_queue_profiler.Sample(tasks_status)

  self._processing_status.UpdateTasksStatus(tasks_status)

  if self._status_update_callback:
    self._status_update_callback(self._processing_status)

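# A minimal sketch of how a caller might build the filter_find_specs
# argument, assuming extraction should be limited to a few file system
# locations. file_system_searcher.FindSpec is the dfVFS class named in the
# docstring above; the registry hive paths are only illustrative.
#
#   from dfvfs.helpers import file_system_searcher
#
#   filter_find_specs = [
#       file_system_searcher.FindSpec(
#           location='/Windows/System32/config/SOFTWARE',
#           case_sensitive=False),
#       file_system_searcher.FindSpec(
#           location='/Windows/System32/config/SYSTEM',
#           case_sensitive=False)]
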
def _ProcessSources(
    self, source_path_specs, extraction_worker, parser_mediator,
    storage_writer, filter_find_specs=None):
  """Processes the sources.

  Args:
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    extraction_worker (worker.ExtractionWorker): extraction worker.
    parser_mediator (ParserMediator): parser mediator.
    storage_writer (StorageWriter): storage writer for a session storage.
    filter_find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
        used in path specification extraction.
  """
  if self._processing_profiler:
    self._processing_profiler.StartTiming(u'process_sources')

  number_of_consumed_sources = 0

  self._UpdateStatus(
      definitions.PROCESSING_STATUS_COLLECTING, u'',
      number_of_consumed_sources, storage_writer)

  display_name = u''
  path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
      source_path_specs, find_specs=filter_find_specs,
      recurse_file_system=False,
      resolver_context=parser_mediator.resolver_context)

  for path_spec in path_spec_generator:
    if self._abort:
      break

    display_name = parser_mediator.GetDisplayNameForPathSpec(path_spec)

    # TODO: determine if event sources should be DataStream or FileEntry
    # or both.
    event_source = event_sources.FileEntryEventSource(path_spec=path_spec)
    storage_writer.AddEventSource(event_source)

    self._UpdateStatus(
        definitions.PROCESSING_STATUS_COLLECTING, display_name,
        number_of_consumed_sources, storage_writer)

  # Force the status update here to make sure the status is up to date.
  self._UpdateStatus(
      definitions.PROCESSING_STATUS_RUNNING, display_name,
      number_of_consumed_sources, storage_writer, force=True)

  if self._processing_profiler:
    self._processing_profiler.StartTiming(u'get_event_source')

  event_source = storage_writer.GetFirstWrittenEventSource()

  if self._processing_profiler:
    self._processing_profiler.StopTiming(u'get_event_source')

  while event_source:
    if self._abort:
      break

    self._ProcessPathSpec(
        extraction_worker, parser_mediator, event_source.path_spec)
    number_of_consumed_sources += 1

    if self._guppy_memory_profiler:
      self._guppy_memory_profiler.Sample()

    self._UpdateStatus(
        extraction_worker.processing_status, self._current_display_name,
        number_of_consumed_sources, storage_writer)

    if self._processing_profiler:
      self._processing_profiler.StartTiming(u'get_event_source')

    event_source = storage_writer.GetNextWrittenEventSource()

    if self._processing_profiler:
      self._processing_profiler.StopTiming(u'get_event_source')

  if self._abort:
    status = definitions.PROCESSING_STATUS_ABORTED
  else:
    status = definitions.PROCESSING_STATUS_COMPLETED

  # Force the status update here to make sure the status is up to date
  # on exit.
  self._UpdateStatus(
      status, u'', number_of_consumed_sources, storage_writer, force=True)

  if self._processing_profiler:
    self._processing_profiler.StopTiming(u'process_sources')

def _ProcessSources(self, source_configurations, parser_mediator):
  """Processes the sources.

  Args:
    source_configurations (list[SourceConfigurationArtifact]):
        configurations of the sources to process.
    parser_mediator (ParserMediator): parser mediator.
  """
  if self._processing_profiler:
    self._processing_profiler.StartTiming('process_sources')

  self._status = definitions.STATUS_INDICATOR_COLLECTING
  self._current_display_name = ''
  self._number_of_consumed_sources = 0

  find_specs = None
  if self.collection_filters_helper:
    find_specs = (
        self.collection_filters_helper.included_file_system_find_specs)

  source_path_specs = [
      configuration.path_spec for configuration in source_configurations]

  path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
      source_path_specs, find_specs=find_specs, recurse_file_system=False,
      resolver_context=parser_mediator.resolver_context)

  for path_spec in path_spec_generator:
    if self._abort:
      break

    self._status = definitions.STATUS_INDICATOR_COLLECTING
    self._current_display_name = parser_mediator.GetDisplayNameForPathSpec(
        path_spec)

    # TODO: determine if event sources should be DataStream or FileEntry
    # or both.
    event_source = event_sources.FileEntryEventSource(path_spec=path_spec)
    self._storage_writer.AddAttributeContainer(event_source)

  self._status = definitions.STATUS_INDICATOR_RUNNING

  if self._processing_profiler:
    self._processing_profiler.StartTiming('get_event_source')

  event_source = self._storage_writer.GetFirstWrittenEventSource()

  if self._processing_profiler:
    self._processing_profiler.StopTiming('get_event_source')

  while event_source:
    if self._abort:
      break

    self._ProcessPathSpec(
        self._extraction_worker, parser_mediator, event_source.path_spec)
    self._number_of_consumed_sources += 1

    if self._processing_profiler:
      self._processing_profiler.StartTiming('get_event_source')

    event_source = self._storage_writer.GetNextWrittenEventSource()

    if self._processing_profiler:
      self._processing_profiler.StopTiming('get_event_source')

  if self._abort:
    self._status = definitions.STATUS_INDICATOR_ABORTED
  else:
    self._status = definitions.STATUS_INDICATOR_COMPLETED

  self._current_display_name = ''

  if self._processing_profiler:
    self._processing_profiler.StopTiming('process_sources')

def _ProcessSources(
    self, source_configurations, storage_writer, session_identifier):
  """Processes the sources.

  Args:
    source_configurations (list[SourceConfigurationArtifact]):
        configurations of the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    session_identifier (str): the identifier of the session the tasks are
        part of.
  """
  if self._processing_profiler:
    self._processing_profiler.StartTiming('process_sources')

  self._status = definitions.STATUS_INDICATOR_COLLECTING

  self._number_of_consumed_event_tags = 0
  self._number_of_consumed_events = 0
  self._number_of_consumed_reports = 0
  self._number_of_consumed_sources = 0
  self._number_of_produced_event_tags = 0
  self._number_of_produced_events = 0
  self._number_of_produced_reports = 0
  self._number_of_produced_sources = 0

  stored_parsers_counter = collections.Counter({
      parser_count.name: parser_count
      for parser_count in storage_writer.GetAttributeContainers(
          'parser_count')})

  find_specs = None
  if self.collection_filters_helper:
    find_specs = (
        self.collection_filters_helper.included_file_system_find_specs)

  source_path_specs = [
      configuration.path_spec for configuration in source_configurations]

  path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
      source_path_specs, find_specs=find_specs, recurse_file_system=False,
      resolver_context=self._resolver_context)

  for path_spec in path_spec_generator:
    if self._abort:
      break

    # TODO: determine if event sources should be DataStream or FileEntry
    # or both.
    event_source = event_sources.FileEntryEventSource(path_spec=path_spec)
    storage_writer.AddAttributeContainer(event_source)

    self._number_of_produced_sources += 1

    # Update the foreman process status in case we are using a filter file.
    self._UpdateForemanProcessStatus()

    if self._status_update_callback:
      self._status_update_callback(self._processing_status)

  self._ScheduleTasks(storage_writer, session_identifier)

  if self._abort:
    self._status = definitions.STATUS_INDICATOR_ABORTED
  else:
    self._status = definitions.STATUS_INDICATOR_COMPLETED

  for key, value in self._parsers_counter.items():
    parser_count = stored_parsers_counter.get(key, None)
    if parser_count:
      parser_count.number_of_events += value
      storage_writer.UpdateAttributeContainer(parser_count)
    else:
      parser_count = counts.ParserCount(name=key, number_of_events=value)
      storage_writer.AddAttributeContainer(parser_count)

  if self._processing_profiler:
    self._processing_profiler.StopTiming('process_sources')

  # Update the foreman process and task status in case we are using
  # a filter file.
  self._UpdateForemanProcessStatus()

  tasks_status = self._task_manager.GetStatusInformation()
  if self._task_queue_profiler:
    self._task_queue_profiler.Sample(tasks_status)

  self._processing_status.UpdateTasksStatus(tasks_status)

  if self._status_update_callback:
    self._status_update_callback(self._processing_status)

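# A worked example of the parser-count merge above, with assumed numbers:
# if the session store already holds a parser_count container for
# 'filestat' with number_of_events == 10, and self._parsers_counter
# recorded 5 more 'filestat' events in this run, the stored container is
# updated in place to 15. A parser seen for the first time gets a new
# counts.ParserCount container added instead.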