Example No. 1
    def _ProcessArchiveTypes(self, mediator, path_spec, type_indicators):
        """Processes a data stream containing archive types such as: TAR or ZIP.

    Args:
      mediator (ParserMediator): mediates the interactions between
          parsers and other components, such as storage and abort signals.
      path_spec (dfvfs.PathSpec): path specification.
      type_indicators(list[str]): dfVFS archive type indicators found in
          the data stream.
    """
        number_of_type_indicators = len(type_indicators)
        if number_of_type_indicators == 0:
            return

        self.processing_status = definitions.PROCESSING_STATUS_COLLECTING

        if number_of_type_indicators > 1:
            display_name = mediator.GetDisplayName()
            # Use !s so the list of indicators is converted to a string.
            logging.debug(
                (u'Found multiple format type indicators: {0!s} for '
                 u'archive file: {1:s}').format(type_indicators, display_name))

        for type_indicator in type_indicators:
            if type_indicator == dfvfs_definitions.TYPE_INDICATOR_TAR:
                archive_path_spec = path_spec_factory.Factory.NewPathSpec(
                    dfvfs_definitions.TYPE_INDICATOR_TAR,
                    location=u'/',
                    parent=path_spec)

            elif type_indicator == dfvfs_definitions.TYPE_INDICATOR_ZIP:
                archive_path_spec = path_spec_factory.Factory.NewPathSpec(
                    dfvfs_definitions.TYPE_INDICATOR_ZIP,
                    location=u'/',
                    parent=path_spec)

            else:
                archive_path_spec = None

                error_message = (
                    u'unsupported archive format type indicator: {0:s}'
                ).format(type_indicator)
                mediator.ProduceExtractionError(error_message,
                                                path_spec=path_spec)

            if archive_path_spec:
                try:
                    path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
                        [archive_path_spec],
                        resolver_context=mediator.resolver_context)

                    # Use a separate loop variable so the path_spec argument
                    # is not shadowed; the error handler below should report
                    # the original data stream, not the last extracted entry.
                    for extracted_path_spec in path_spec_generator:
                        if self._abort:
                            break

                        event_source = event_sources.FileEntryEventSource(
                            path_spec=extracted_path_spec)
                        event_source.file_entry_type = (
                            dfvfs_definitions.FILE_ENTRY_TYPE_FILE)
                        mediator.ProduceEventSource(event_source)

                        self.last_activity_timestamp = time.time()

                except (IOError, errors.MaximumRecursionDepth) as exception:
                    error_message = (
                        u'unable to process archive file with error: {0!s}'
                    ).format(exception)
                    mediator.ProduceExtractionError(error_message,
                                                    path_spec=path_spec)
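
The method assumes its caller has already detected the archive type indicators. A minimal sketch, assuming dfVFS is installed, of how such indicators can be obtained and turned into the nested path specification the method builds; the local file name is hypothetical:

from dfvfs.analyzer import analyzer
from dfvfs.lib import definitions as dfvfs_definitions
from dfvfs.path import factory as path_spec_factory

# Wrap a local file in an OS path specification (hypothetical file name).
os_path_spec = path_spec_factory.Factory.NewPathSpec(
    dfvfs_definitions.TYPE_INDICATOR_OS, location='/tmp/evidence.tar')

# Ask dfVFS which archive formats it detects in the data stream.
type_indicators = analyzer.Analyzer.GetArchiveTypeIndicators(os_path_spec)

if dfvfs_definitions.TYPE_INDICATOR_TAR in type_indicators:
    # Nest a TAR path specification on top of the OS one, as the method
    # above does with its path_spec argument as the parent.
    archive_path_spec = path_spec_factory.Factory.NewPathSpec(
        dfvfs_definitions.TYPE_INDICATOR_TAR, location='/',
        parent=os_path_spec)
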
Example No. 2
    def _ProcessSources(self, source_path_specs, storage_writer):
        """Processes the sources.

    Args:
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
    """
        if self._processing_profiler:
            self._processing_profiler.StartTiming('process_sources')

        self._status = definitions.STATUS_INDICATOR_COLLECTING
        self._number_of_consumed_event_tags = 0
        self._number_of_consumed_events = 0
        self._number_of_consumed_extraction_warnings = 0
        self._number_of_consumed_reports = 0
        self._number_of_consumed_sources = 0
        self._number_of_produced_event_tags = 0
        self._number_of_produced_events = 0
        self._number_of_produced_extraction_warnings = 0
        self._number_of_produced_reports = 0
        self._number_of_produced_sources = 0

        find_specs = None
        if self.collection_filters_helper:
            find_specs = (
                self.collection_filters_helper.included_file_system_find_specs)

        path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
            source_path_specs,
            find_specs=find_specs,
            recurse_file_system=False,
            resolver_context=self._resolver_context)

        for path_spec in path_spec_generator:
            if self._abort:
                break

            # TODO: determine if event sources should be DataStream or FileEntry
            # or both.
            event_source = event_sources.FileEntryEventSource(
                path_spec=path_spec)
            storage_writer.AddAttributeContainer(event_source)

            self._number_of_produced_sources = storage_writer.number_of_event_sources

            # Update the foreman process status in case we are using a filter file.
            self._UpdateForemanProcessStatus()

            if self._status_update_callback:
                self._status_update_callback(self._processing_status)

        self._ScheduleTasks(storage_writer, self._session.identifier)

        if self._abort:
            self._status = definitions.STATUS_INDICATOR_ABORTED
        else:
            self._status = definitions.STATUS_INDICATOR_COMPLETED

        self._number_of_produced_events = storage_writer.number_of_events
        self._number_of_produced_extraction_warnings = (
            storage_writer.number_of_extraction_warnings)
        self._number_of_produced_sources = storage_writer.number_of_event_sources

        if self._processing_profiler:
            self._processing_profiler.StopTiming('process_sources')

        # Update the foreman process and task status in case we are using
        # a filter file.
        self._UpdateForemanProcessStatus()

        tasks_status = self._task_manager.GetStatusInformation()
        if self._task_queue_profiler:
            self._task_queue_profiler.Sample(tasks_status)

        self._processing_status.UpdateTasksStatus(tasks_status)

        if self._status_update_callback:
            self._status_update_callback(self._processing_status)
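
This variant reads its find specifications from a collection filters helper. A minimal sketch of building equivalent find specifications directly with dfVFS; the locations are hypothetical:

from dfvfs.helpers import file_system_searcher

# Find specifications restrict extraction to matching locations;
# ExtractPathSpecs() then only yields path specifications that match.
find_specs = [
    file_system_searcher.FindSpec(
        case_sensitive=False, location='/var/log/syslog'),
    file_system_searcher.FindSpec(
        case_sensitive=False, location_glob='/home/*/.bash_history')]
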
Example No. 3
    def _ProcessSources(self,
                        source_path_specs,
                        storage_writer,
                        filter_find_specs=None):
        """Processes the sources.

    Args:
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
      filter_find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
          used in path specification extraction. If set, path specifications
          that match the find specification will be processed.
    """
        if self._processing_profiler:
            self._processing_profiler.StartTiming('process_sources')

        self._status = definitions.PROCESSING_STATUS_COLLECTING
        self._number_of_consumed_errors = 0
        self._number_of_consumed_event_tags = 0
        self._number_of_consumed_events = 0
        self._number_of_consumed_reports = 0
        self._number_of_consumed_sources = 0
        self._number_of_produced_errors = 0
        self._number_of_produced_event_tags = 0
        self._number_of_produced_events = 0
        self._number_of_produced_reports = 0
        self._number_of_produced_sources = 0

        path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
            source_path_specs,
            find_specs=filter_find_specs,
            recurse_file_system=False,
            resolver_context=self._resolver_context)

        for path_spec in path_spec_generator:
            if self._abort:
                break

            # TODO: determine if event sources should be DataStream or FileEntry
            # or both.
            event_source = event_sources.FileEntryEventSource(
                path_spec=path_spec)
            storage_writer.AddEventSource(event_source)

            self._number_of_produced_sources = storage_writer.number_of_event_sources

            # Update the foreman process status in case we are using a filter file.
            self._UpdateForemanProcessStatus()

            if self._status_update_callback:
                self._status_update_callback(self._processing_status)

        self._ScheduleTasks(storage_writer)

        if self._abort:
            self._status = definitions.PROCESSING_STATUS_ABORTED
        else:
            self._status = definitions.PROCESSING_STATUS_COMPLETED

        self._number_of_produced_errors = storage_writer.number_of_errors
        self._number_of_produced_events = storage_writer.number_of_events
        self._number_of_produced_sources = storage_writer.number_of_event_sources

        if self._processing_profiler:
            self._processing_profiler.StopTiming('process_sources')

        # Update the foreman process and task status in case we are using
        # a filter file.
        self._UpdateForemanProcessStatus()

        tasks_status = self._task_manager.GetStatusInformation()
        if self._task_queue_profiler:
            self._task_queue_profiler.Sample(tasks_status)

        self._processing_status.UpdateTasksStatus(tasks_status)

        if self._status_update_callback:
            self._status_update_callback(self._processing_status)
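
The collection loop above follows a common pattern: consume a generator lazily and check an abort flag between items, so a long-running extraction can be interrupted without the producer knowing about cancellation. A self-contained sketch of the same pattern with illustrative names:

import threading

abort_event = threading.Event()

def generate_items():
    """Yields work items lazily, as ExtractPathSpecs() yields path specs."""
    for number in range(1000000):
        yield number

for item in generate_items():
    # Checking the flag once per item bounds abort latency to one item.
    if abort_event.is_set():
        break
    # ... process the item ...
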
Example No. 4
    def _ProcessSources(self,
                        source_path_specs,
                        extraction_worker,
                        parser_mediator,
                        storage_writer,
                        filter_find_specs=None):
        """Processes the sources.

    Args:
      source_path_specs (list[dfvfs.PathSpec]): path specifications of
          the sources to process.
      extraction_worker (worker.ExtractionWorker): extraction worker.
      parser_mediator (ParserMediator): parser mediator.
      storage_writer (StorageWriter): storage writer for a session storage.
      filter_find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
          used in path specification extraction.
    """
        if self._processing_profiler:
            self._processing_profiler.StartTiming(u'process_sources')

        number_of_consumed_sources = 0

        self._UpdateStatus(definitions.PROCESSING_STATUS_COLLECTING, u'',
                           number_of_consumed_sources, storage_writer)

        display_name = u''
        path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
            source_path_specs,
            find_specs=filter_find_specs,
            recurse_file_system=False,
            resolver_context=parser_mediator.resolver_context)

        for path_spec in path_spec_generator:
            if self._abort:
                break

            display_name = parser_mediator.GetDisplayNameForPathSpec(path_spec)

            # TODO: determine if event sources should be DataStream or FileEntry
            # or both.
            event_source = event_sources.FileEntryEventSource(
                path_spec=path_spec)
            storage_writer.AddEventSource(event_source)

            self._UpdateStatus(definitions.PROCESSING_STATUS_COLLECTING,
                               display_name, number_of_consumed_sources,
                               storage_writer)

        # Force the status update here to make sure the status is up to date.
        self._UpdateStatus(definitions.PROCESSING_STATUS_RUNNING,
                           display_name,
                           number_of_consumed_sources,
                           storage_writer,
                           force=True)

        if self._processing_profiler:
            self._processing_profiler.StartTiming(u'get_event_source')

        event_source = storage_writer.GetFirstWrittenEventSource()

        if self._processing_profiler:
            self._processing_profiler.StopTiming(u'get_event_source')

        while event_source:
            if self._abort:
                break

            self._ProcessPathSpec(extraction_worker, parser_mediator,
                                  event_source.path_spec)
            number_of_consumed_sources += 1

            if self._guppy_memory_profiler:
                self._guppy_memory_profiler.Sample()

            self._UpdateStatus(extraction_worker.processing_status,
                               self._current_display_name,
                               number_of_consumed_sources, storage_writer)

            if self._processing_profiler:
                self._processing_profiler.StartTiming(u'get_event_source')

            event_source = storage_writer.GetNextWrittenEventSource()

            if self._processing_profiler:
                self._processing_profiler.StopTiming(u'get_event_source')

        if self._abort:
            status = definitions.PROCESSING_STATUS_ABORTED
        else:
            status = definitions.PROCESSING_STATUS_COMPLETED

        # Force the status update here to make sure the status is up to date
        # on exit.
        self._UpdateStatus(status,
                           u'',
                           number_of_consumed_sources,
                           storage_writer,
                           force=True)

        if self._processing_profiler:
            self._processing_profiler.StopTiming(u'process_sources')
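
Unlike the previous variants, this version both collects and consumes in the same method: event sources are first written to the storage writer, then read back with GetFirstWrittenEventSource()/GetNextWrittenEventSource() and processed one at a time. A minimal stand-in for that cursor-style read-back (not the real StorageWriter API, just the iteration contract the loop above relies on):

class FakeEventSourceStore(object):
    """Stand-in for the written-event-source cursor of a storage writer."""

    def __init__(self):
        self._event_sources = []
        self._read_index = 0

    def AddEventSource(self, event_source):
        self._event_sources.append(event_source)

    def GetFirstWrittenEventSource(self):
        self._read_index = 0
        return self.GetNextWrittenEventSource()

    def GetNextWrittenEventSource(self):
        if self._read_index >= len(self._event_sources):
            return None
        event_source = self._event_sources[self._read_index]
        self._read_index += 1
        return event_source

store = FakeEventSourceStore()
for name in ('source1', 'source2'):
    store.AddEventSource(name)

event_source = store.GetFirstWrittenEventSource()
while event_source:
    # ... process the event source, as _ProcessPathSpec() does above ...
    event_source = store.GetNextWrittenEventSource()
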
Example No. 5
    def _ProcessSources(self, source_configurations, parser_mediator):
        """Processes the sources.

    Args:
      source_configurations (list[SourceConfigurationArtifact]): configurations
          of the sources to process.
      parser_mediator (ParserMediator): parser mediator.
    """
        if self._processing_profiler:
            self._processing_profiler.StartTiming('process_sources')

        self._status = definitions.STATUS_INDICATOR_COLLECTING
        self._current_display_name = ''
        self._number_of_consumed_sources = 0

        find_specs = None
        if self.collection_filters_helper:
            find_specs = (
                self.collection_filters_helper.included_file_system_find_specs)

        source_path_specs = [
            configuration.path_spec for configuration in source_configurations
        ]

        path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
            source_path_specs,
            find_specs=find_specs,
            recurse_file_system=False,
            resolver_context=parser_mediator.resolver_context)

        for path_spec in path_spec_generator:
            if self._abort:
                break

            self._status = definitions.STATUS_INDICATOR_COLLECTING
            self._current_display_name = parser_mediator.GetDisplayNameForPathSpec(
                path_spec)

            # TODO: determine if event sources should be DataStream or FileEntry
            # or both.
            event_source = event_sources.FileEntryEventSource(
                path_spec=path_spec)
            self._storage_writer.AddAttributeContainer(event_source)

        self._status = definitions.STATUS_INDICATOR_RUNNING

        if self._processing_profiler:
            self._processing_profiler.StartTiming('get_event_source')

        event_source = self._storage_writer.GetFirstWrittenEventSource()

        if self._processing_profiler:
            self._processing_profiler.StopTiming('get_event_source')

        while event_source:
            if self._abort:
                break

            self._ProcessPathSpec(self._extraction_worker, parser_mediator,
                                  event_source.path_spec)

            self._number_of_consumed_sources += 1

            if self._processing_profiler:
                self._processing_profiler.StartTiming('get_event_source')

            event_source = self._storage_writer.GetNextWrittenEventSource()

            if self._processing_profiler:
                self._processing_profiler.StopTiming('get_event_source')

        if self._abort:
            self._status = definitions.STATUS_INDICATOR_ABORTED
        else:
            self._status = definitions.STATUS_INDICATOR_COMPLETED

        self._current_display_name = ''

        if self._processing_profiler:
            self._processing_profiler.StopTiming('process_sources')
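
The StartTiming()/StopTiming() pairs that bracket 'process_sources' and each 'get_event_source' call suggest a profiler that accumulates time per named section. A minimal sketch of such a profiler built on time.perf_counter(); the class is hypothetical, not plaso's implementation:

import collections
import time

class SectionProfiler(object):
    """Accumulates wall-clock time per named section."""

    def __init__(self):
        self._start_times = {}
        self.totals = collections.defaultdict(float)

    def StartTiming(self, name):
        self._start_times[name] = time.perf_counter()

    def StopTiming(self, name):
        self.totals[name] += time.perf_counter() - self._start_times.pop(name)

profiler = SectionProfiler()
profiler.StartTiming('process_sources')
# ... do the work being measured ...
profiler.StopTiming('process_sources')
print(profiler.totals['process_sources'])
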
Example No. 6
    def _ProcessSources(self, source_configurations, storage_writer,
                        session_identifier):
        """Processes the sources.

    Args:
      source_configurations (list[SourceConfigurationArtifact]): configurations
          of the sources to process.
      storage_writer (StorageWriter): storage writer for a session storage.
      session_identifier (str): the identifier of the session the tasks are
          part of.
    """
        if self._processing_profiler:
            self._processing_profiler.StartTiming('process_sources')

        self._status = definitions.STATUS_INDICATOR_COLLECTING
        self._number_of_consumed_event_tags = 0
        self._number_of_consumed_events = 0
        self._number_of_consumed_reports = 0
        self._number_of_consumed_sources = 0
        self._number_of_produced_event_tags = 0
        self._number_of_produced_events = 0
        self._number_of_produced_reports = 0
        self._number_of_produced_sources = 0

        stored_parsers_counter = collections.Counter({
            parser_count.name: parser_count
            for parser_count in storage_writer.GetAttributeContainers(
                'parser_count')
        })

        find_specs = None
        if self.collection_filters_helper:
            find_specs = (
                self.collection_filters_helper.included_file_system_find_specs)

        source_path_specs = [
            configuration.path_spec for configuration in source_configurations
        ]

        path_spec_generator = self._path_spec_extractor.ExtractPathSpecs(
            source_path_specs,
            find_specs=find_specs,
            recurse_file_system=False,
            resolver_context=self._resolver_context)

        for path_spec in path_spec_generator:
            if self._abort:
                break

            # TODO: determine if event sources should be DataStream or FileEntry
            # or both.
            event_source = event_sources.FileEntryEventSource(
                path_spec=path_spec)
            storage_writer.AddAttributeContainer(event_source)

            self._number_of_produced_sources += 1

            # Update the foreman process status in case we are using a filter file.
            self._UpdateForemanProcessStatus()

            if self._status_update_callback:
                self._status_update_callback(self._processing_status)

        self._ScheduleTasks(storage_writer, session_identifier)

        if self._abort:
            self._status = definitions.STATUS_INDICATOR_ABORTED
        else:
            self._status = definitions.STATUS_INDICATOR_COMPLETED

        for key, value in self._parsers_counter.items():
            parser_count = stored_parsers_counter.get(key, None)
            if parser_count:
                parser_count.number_of_events += value
                storage_writer.UpdateAttributeContainer(parser_count)
            else:
                parser_count = counts.ParserCount(name=key,
                                                  number_of_events=value)
                storage_writer.AddAttributeContainer(parser_count)

        if self._processing_profiler:
            self._processing_profiler.StopTiming('process_sources')

        # Update the foreman process and task status in case we are using
        # a filter file.
        self._UpdateForemanProcessStatus()

        tasks_status = self._task_manager.GetStatusInformation()
        if self._task_queue_profiler:
            self._task_queue_profiler.Sample(tasks_status)

        self._processing_status.UpdateTasksStatus(tasks_status)

        if self._status_update_callback:
            self._status_update_callback(self._processing_status)
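
The final loop merges the in-memory parser counter into the counts already stored for the session: existing containers are updated in place, new parser names get a fresh container. The same merge expressed with plain dictionaries and collections.Counter, using illustrative names and numbers:

import collections

stored_counts = {'filestat': 120, 'winreg': 45}
session_counter = collections.Counter({'filestat': 30, 'sqlite': 7})

for name, number_of_events in session_counter.items():
    if name in stored_counts:
        # Existing count: update in place, like UpdateAttributeContainer().
        stored_counts[name] += number_of_events
    else:
        # New parser name: add a fresh count, like AddAttributeContainer().
        stored_counts[name] = number_of_events

print(stored_counts)  # {'filestat': 150, 'winreg': 45, 'sqlite': 7}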