def ParseFile(file_entry):
  """Parse a file given a file entry and yield results."""
  if not file_entry:
    return

  # Set up the queues and preprocessing data the worker needs.
  proc_queue = queue.SingleThreadedQueue()
  storage_queue = queue.SingleThreadedQueue()
  storage_queue_producer = queue.EventObjectQueueProducer(storage_queue)

  pre_obj = event.PreprocessObject()
  all_parsers = putils.FindAllParsers(pre_obj)

  # Create a worker and have it parse the file.
  worker_object = worker.EventExtractionWorker(
      'my_worker', proc_queue, storage_queue_producer, pre_obj, all_parsers)
  worker_object.ParseFile(file_entry)

  storage_queue.SignalEndOfInput()
  proc_queue.SignalEndOfInput()

  # Drain the storage queue, stopping on end-of-input or an empty queue.
  while True:
    try:
      queued_item = storage_queue.PopItem()
    except errors.QueueEmpty:
      return

    if isinstance(queued_item, queue.QueueEndOfInput):
      return

    yield queued_item
def testExtractionWorkerHashing(self):
  """Test that the worker sets up and runs hashing code correctly."""
  extraction_worker = worker.EventExtractionWorker()
  extraction_worker._SetHashers('md5')
  self.assertIn('hashing', extraction_worker.GetAnalyzerNames())

  session = sessions.Session()
  test_path_spec = self._GetTestFilePathSpec(['empty_file'])
  storage_writer = fake_storage.FakeStorageWriter(session)

  self._TestProcessPathSpec(
      storage_writer, test_path_spec, extraction_worker=extraction_worker,
      knowledge_base_values={'year': 2016})

  storage_writer.Open()

  # MD5 of zero bytes of data.
  empty_file_md5 = 'd41d8cd98f00b204e9800998ecf8427e'
  for event in storage_writer.GetSortedEvents():
    self.assertEqual(getattr(event, 'md5_hash', None), empty_file_md5)

  storage_writer.Close()
def testExtractionWorkerYara(self):
  """Tests that the worker applies Yara matching code correctly."""
  extraction_worker = worker.EventExtractionWorker()

  # Load the Yara rules from the test data and register them on the worker.
  rule_path = self._GetTestFilePath(['yara.rules'])
  with open(rule_path, 'r') as rule_file:
    rule_string = rule_file.read()

  extraction_worker._SetYaraRules(rule_string)
  self.assertIn('yara', extraction_worker.GetAnalyzerNames())

  session = sessions.Session()
  test_path_spec = self._GetTestFilePathSpec(['test_pe.exe'])
  storage_writer = fake_storage.FakeStorageWriter(session)

  self._TestProcessPathSpec(
      storage_writer, test_path_spec, extraction_worker=extraction_worker,
      knowledge_base_values={'year': 2016})

  storage_writer.Open()

  expected_yara_match = 'PEfileBasic,PEfile'
  for event in storage_writer.GetSortedEvents():
    self.assertEqual(
        getattr(event, 'yara_match', None), expected_yara_match)

  storage_writer.Close()
def CreateExtractionWorker(self, worker_number, pre_obj, parsers, rpc_proxy=None):
  """Creates an extraction worker object.

  Args:
    worker_number: number that identifies the worker.
    pre_obj: The preprocessing object (instance of PreprocessObject).
    parsers: A list of parser objects to use for processing.
    rpc_proxy: A proxy object (instance of proxy.ProxyServer) that can be
        used to setup RPC functionality for the worker. This is optional
        and if not provided the worker will not listen to RPC requests.

  Returns:
    An extraction worker (instance of worker.ExtractionWorker).
  """
  extraction_worker = worker.EventExtractionWorker(
      worker_number, self._collection_queue, self._storage_queue_producer,
      pre_obj, parsers, rpc_proxy=rpc_proxy)
  return extraction_worker
def testExtractionWorkerHashing(self):
  """Test that the worker sets up and runs hashing code correctly."""
  extraction_worker = worker.EventExtractionWorker()
  extraction_worker._SetHashers('md5')
  self.assertIn('hashing', extraction_worker.GetAnalyzerNames())

  session = sessions.Session()
  test_path_spec = self._GetTestFilePathSpec(['empty_file'])
  storage_writer = fake_writer.FakeStorageWriter(session)

  # Typically there are 3 filestat events, but there can be 4 on platforms
  # that support os.stat_result st_birthtime.
  expected_event_counters = {'fs:stat': [3, 4]}

  self._TestProcessPathSpec(
      storage_writer, test_path_spec, expected_event_counters,
      extraction_worker=extraction_worker,
      knowledge_base_values={'year': 2016})

  storage_writer.Open()

  # MD5 of zero bytes of data.
  empty_file_md5 = 'd41d8cd98f00b204e9800998ecf8427e'
  for event in storage_writer.GetSortedEvents():
    event_data = self._GetEventDataOfEvent(storage_writer, event)
    event_data_stream = self._GetEventDataStreamOfEventData(
        storage_writer, event_data)
    self.assertEqual(event_data_stream.md5_hash, empty_file_md5)

  storage_writer.Close()
def testCanSkipContentExtraction(self):
  """Tests the _CanSkipContentExtraction function."""
  test_file_entry = self._GetTestFileEntry(['syslog.tgz'])

  extraction_worker = worker.EventExtractionWorker()
  self.assertFalse(
      extraction_worker._CanSkipContentExtraction(test_file_entry))
def testIsMetadataFile(self):
  """Tests the _IsMetadataFile function."""
  test_file_entry = self._GetTestFileEntry(['syslog.tgz'])

  extraction_worker = worker.EventExtractionWorker()
  self.assertFalse(extraction_worker._IsMetadataFile(test_file_entry))
def testCanSkipDataStream(self):
  """Tests the _CanSkipDataStream function."""
  test_file_entry = self._GetTestFileEntry(['syslog.tgz'])

  extraction_worker = worker.EventExtractionWorker()
  self.assertFalse(
      extraction_worker._CanSkipDataStream(test_file_entry, None))
def _TestProcessPathSpec(
    self, storage_writer, path_spec, expected_event_counters,
    extraction_worker=None, knowledge_base_values=None,
    process_archives=False):
  """Tests processing a path specification.

  Args:
    storage_writer (StorageWriter): storage writer.
    path_spec (dfvfs.PathSpec): path specification.
    expected_event_counters (dict[str, int|list[int]]): expected event
        counters per event data type.
    extraction_worker (Optional[EventExtractorWorker]): worker to process
        the path specification. If None, a new worker will be created.
    knowledge_base_values (Optional[dict]): knowledge base values.
    process_archives (Optional[bool]): whether archive files should be
        processed.
  """
  knowledge_base_object = knowledge_base.KnowledgeBase()
  for identifier, value in (knowledge_base_values or {}).items():
    knowledge_base_object.SetValue(identifier, value)

  resolver_context = context.Context()
  parser_mediator = parsers_mediator.ParserMediator(
      storage_writer, knowledge_base_object,
      resolver_context=resolver_context)

  if not extraction_worker:
    configuration = configurations.ExtractionConfiguration()
    configuration.process_archives = process_archives

    extraction_worker = worker.EventExtractionWorker()
    extraction_worker.SetExtractionConfiguration(configuration)

  storage_writer.Open()

  try:
    storage_writer.WriteSessionStart()

    extraction_worker.ProcessPathSpec(parser_mediator, path_spec)

    # Process any event sources the worker queued up along the way.
    event_source = storage_writer.GetFirstWrittenEventSource()
    while event_source:
      extraction_worker.ProcessPathSpec(
          parser_mediator, event_source.path_spec)
      event_source = storage_writer.GetNextWrittenEventSource()

    storage_writer.WriteSessionCompletion()

    if expected_event_counters:
      self.CheckEventCounters(storage_writer, expected_event_counters)

  finally:
    storage_writer.Close()
def testGetCompressedStreamTypes(self):
  """Tests the _GetCompressedStreamTypes function."""
  session = sessions.Session()
  storage_writer = fake_writer.FakeStorageWriter()

  knowledge_base_object = knowledge_base.KnowledgeBase()
  knowledge_base_object.SetValue('year', 2016)

  resolver_context = context.Context()
  parser_mediator = parsers_mediator.ParserMediator(
      knowledge_base_object, resolver_context=resolver_context)
  parser_mediator.SetPreferredYear(2016)
  parser_mediator.SetStorageWriter(storage_writer)

  # Note: the original test created a first worker with a test analyzer
  # attached and then immediately discarded it by creating a second worker;
  # that dead setup has been removed since _GetCompressedStreamTypes does
  # not use analyzers.
  extraction_worker = worker.EventExtractionWorker()

  storage_writer.Open()

  session_start = session.CreateSessionStart()
  storage_writer.AddAttributeContainer(session_start)

  path_spec = self._GetTestFilePathSpec(['syslog.tgz'])
  type_indicators = extraction_worker._GetCompressedStreamTypes(
      parser_mediator, path_spec)
  self.assertEqual(type_indicators, [dfvfs_definitions.TYPE_INDICATOR_GZIP])

  session_completion = session.CreateSessionCompletion()
  storage_writer.AddAttributeContainer(session_completion)

  storage_writer.Close()
def testGetArchiveTypes(self):
  """Tests the _GetArchiveTypes function."""
  session = sessions.Session()
  storage_writer = fake_writer.FakeStorageWriter(session)

  knowledge_base_object = knowledge_base.KnowledgeBase()
  knowledge_base_object.SetValue('year', 2016)

  resolver_context = context.Context()
  mediator = parsers_mediator.ParserMediator(
      storage_writer, knowledge_base_object, preferred_year=2016,
      resolver_context=resolver_context)

  # Note: the original test created a first worker with a test analyzer
  # attached and then immediately discarded it by creating a second worker;
  # that dead setup has been removed since _GetArchiveTypes does not use
  # analyzers.
  extraction_worker = worker.EventExtractionWorker()

  storage_writer.Open()
  storage_writer.WriteSessionStart()

  path_spec = self._GetTestFilePathSpec(['syslog.tar'])
  type_indicators = extraction_worker._GetArchiveTypes(mediator, path_spec)
  self.assertEqual(type_indicators, [dfvfs_definitions.TYPE_INDICATOR_TAR])

  storage_writer.WriteSessionCompletion()
  storage_writer.Close()
def testAnalyzeFileObject(self):
  """Tests the _AnalyzeFileObject function."""
  session = sessions.Session()
  storage_writer = fake_writer.FakeStorageWriter(session)

  knowledge_base_object = knowledge_base.KnowledgeBase()
  knowledge_base_object.SetValue('year', 2016)

  resolver_context = context.Context()
  parser_mediator = parsers_mediator.ParserMediator(
      storage_writer, knowledge_base_object, preferred_year=2016,
      resolver_context=resolver_context)

  # Attach a test analyzer so its result can be observed on the event
  # data stream afterwards.
  extraction_worker = worker.EventExtractionWorker()

  test_analyzer = analyzers_manager_test.TestAnalyzer()
  self.assertEqual(len(test_analyzer.GetResults()), 0)

  extraction_worker._analyzers = [test_analyzer]

  storage_writer.Open()
  storage_writer.WriteSessionStart()

  file_entry = self._GetTestFileEntry(['ímynd.dd'])
  parser_mediator.SetFileEntry(file_entry)

  file_object = file_entry.GetFileObject()
  display_name = parser_mediator.GetDisplayName()
  event_data_stream = events.EventDataStream()

  try:
    extraction_worker._AnalyzeFileObject(
        file_object, display_name, event_data_stream)
  finally:
    file_object.close()

  storage_writer.WriteSessionCompletion()
  storage_writer.Close()

  self.assertIsNotNone(event_data_stream)
  self.assertEqual(
      getattr(event_data_stream, 'test_result', None), 'is_vegetable')
def testAnalyzeDataStream(self):
  """Tests the _AnalyzeDataStream function."""
  session = sessions.Session()
  storage_writer = fake_writer.FakeStorageWriter()

  knowledge_base_object = knowledge_base.KnowledgeBase()
  knowledge_base_object.SetValue('year', 2016)

  resolver_context = context.Context()
  parser_mediator = parsers_mediator.ParserMediator(
      knowledge_base_object, resolver_context=resolver_context)
  parser_mediator.SetPreferredYear(2016)
  parser_mediator.SetStorageWriter(storage_writer)

  # Attach a test analyzer so its result can be observed on the event
  # data stream afterwards.
  extraction_worker = worker.EventExtractionWorker()

  test_analyzer = analyzers_manager_test.TestAnalyzer()
  self.assertEqual(len(test_analyzer.GetResults()), 0)

  extraction_worker._analyzers = [test_analyzer]

  storage_writer.Open()
  storage_writer.AddAttributeContainer(session.CreateSessionStart())

  file_entry = self._GetTestFileEntry(['syslog.tgz'])
  parser_mediator.SetFileEntry(file_entry)

  display_name = parser_mediator.GetDisplayName()
  event_data_stream = events.EventDataStream()

  extraction_worker._AnalyzeDataStream(
      file_entry, '', display_name, event_data_stream)

  storage_writer.AddAttributeContainer(session.CreateSessionCompletion())
  storage_writer.Close()

  self.assertIsNotNone(event_data_stream)
  self.assertEqual(
      getattr(event_data_stream, 'test_result', None), 'is_vegetable')
def CreateExtractionWorker(self, worker_number, rpc_proxy=None):
  """Creates an extraction worker object.

  Args:
    worker_number: A number that identifies the worker.
    rpc_proxy: A proxy object (instance of proxy.ProxyServer) that can be
        used to setup RPC functionality for the worker. This is optional
        and if not provided the worker will not listen to RPC requests.

  Returns:
    An extraction worker (instance of worker.ExtractionWorker).
  """
  extraction_worker = worker.EventExtractionWorker(
      worker_number, self._collection_queue, self._event_queue_producer,
      self._parse_error_queue_producer, self.knowledge_base,
      rpc_proxy=rpc_proxy)
  return extraction_worker
def testExtractionWorkerYara(self):
  """Tests that the worker applies Yara matching code correctly."""
  yara_rule_path = self._GetTestFilePath(['yara.rules'])
  self._SkipIfPathNotExists(yara_rule_path)

  with open(yara_rule_path, 'r') as file_object:
    rule_string = file_object.read()

  extraction_worker = worker.EventExtractionWorker()
  extraction_worker._SetYaraRules(rule_string)
  self.assertIn('yara', extraction_worker.GetAnalyzerNames())

  session = sessions.Session()
  test_path_spec = self._GetTestFilePathSpec(['test_pe.exe'])
  storage_writer = fake_writer.FakeStorageWriter(session)

  # Typically there are 3 filestat events, but there can be 4 on platforms
  # that support os.stat_result st_birthtime.
  expected_event_counters = {
      'fs:stat': [3, 4],
      'pe:compilation:compilation_time': 1,
      'pe:delay_import:import_time': 1,
      'pe:import:import_time': 1}

  self._TestProcessPathSpec(
      storage_writer, test_path_spec, expected_event_counters,
      extraction_worker=extraction_worker,
      knowledge_base_values={'year': 2016})

  storage_writer.Open()

  expected_yara_match = 'PEfileBasic,PEfile'
  for event in storage_writer.GetSortedEvents():
    event_data = self._GetEventDataOfEvent(storage_writer, event)
    event_data_stream = self._GetEventDataStreamOfEventData(
        storage_writer, event_data)
    self.assertEqual(event_data_stream.yara_match, expected_yara_match)

  storage_writer.Close()
def _TestProcessPathSpec(
    self, storage_writer, path_spec, extraction_worker=None,
    knowledge_base_values=None, process_archives=False):
  """Tests processing a path specification.

  Args:
    storage_writer (StorageWriter): storage writer.
    path_spec (dfvfs.PathSpec): path specification.
    extraction_worker (Optional[EventExtractorWorker]): worker to process
        the pathspec. If None, a new worker will be created.
    knowledge_base_values (Optional[dict]): knowledge base values.
    process_archives (Optional[bool]): whether archive files should be
        processed.
  """
  knowledge_base_object = knowledge_base.KnowledgeBase()
  if knowledge_base_values:
    # dict.items() is directly iterable; the iter() wrapper was redundant.
    for identifier, value in knowledge_base_values.items():
      knowledge_base_object.SetValue(identifier, value)

  mediator = parsers_mediator.ParserMediator(
      storage_writer, knowledge_base_object)

  if not extraction_worker:
    resolver_context = context.Context()
    extraction_worker = worker.EventExtractionWorker(
        resolver_context, process_archives=process_archives)

  storage_writer.Open()
  storage_writer.WriteSessionStart()

  extraction_worker.ProcessPathSpec(mediator, path_spec)

  # Process any event sources the worker queued up along the way.
  event_source = storage_writer.GetFirstWrittenEventSource()
  while event_source:
    extraction_worker.ProcessPathSpec(mediator, event_source.path_spec)
    event_source = storage_writer.GetNextWrittenEventSource()

  storage_writer.WriteSessionCompletion()
  storage_writer.Close()
def testAnalyzeFileObject(self):
  """Tests the _AnalyzeFileObject function."""
  knowledge_base_values = {'year': 2016}
  session = sessions.Session()

  storage_writer = fake_storage.FakeStorageWriter(session)

  knowledge_base_object = knowledge_base.KnowledgeBase()
  if knowledge_base_values:
    # dict.items() is directly iterable; the iter() wrapper was redundant.
    for identifier, value in knowledge_base_values.items():
      knowledge_base_object.SetValue(identifier, value)

  resolver_context = context.Context()
  mediator = parsers_mediator.ParserMediator(
      storage_writer, knowledge_base_object, preferred_year=2016,
      resolver_context=resolver_context)

  # Attach a test analyzer so its result can be observed on the mediator
  # afterwards.
  extraction_worker = worker.EventExtractionWorker()

  test_analyzer = analyzers_manager_test.TestAnalyzer()
  self.assertEqual(len(test_analyzer.GetResults()), 0)

  extraction_worker._analyzers = [test_analyzer]

  file_entry = self._GetTestFileEntry(['ímynd.dd'])
  mediator.SetFileEntry(file_entry)

  file_object = file_entry.GetFileObject()

  try:
    extraction_worker._AnalyzeFileObject(mediator, file_object)
  finally:
    file_object.close()

  self.assertEqual(len(mediator._extra_event_attributes), 1)

  event_attribute = mediator._extra_event_attributes.get('test_result', None)
  self.assertEqual(event_attribute, 'is_vegetable')
def testExtractMetadataFromFileEntry(self):
  """Tests the _ExtractMetadataFromFileEntry function."""
  session = sessions.Session()
  storage_writer = fake_writer.FakeStorageWriter()

  knowledge_base_object = knowledge_base.KnowledgeBase()
  knowledge_base_object.SetValue('year', 2016)

  resolver_context = context.Context()
  parser_mediator = parsers_mediator.ParserMediator(
      knowledge_base_object, resolver_context=resolver_context)
  parser_mediator.SetPreferredYear(2016)
  parser_mediator.SetStorageWriter(storage_writer)

  extraction_worker = worker.EventExtractionWorker()

  test_analyzer = analyzers_manager_test.TestAnalyzer()
  self.assertEqual(len(test_analyzer.GetResults()), 0)

  extraction_worker._analyzers = [test_analyzer]

  storage_writer.Open()
  storage_writer.AddAttributeContainer(session.CreateSessionStart())

  file_entry = self._GetTestFileEntry(['syslog.tgz'])
  parser_mediator.SetFileEntry(file_entry)

  extraction_worker._ExtractMetadataFromFileEntry(
      parser_mediator, file_entry, '')

  storage_writer.AddAttributeContainer(session.CreateSessionCompletion())
  storage_writer.Close()
def testExtractContentFromDataStream(self):
  """Tests the _ExtractContentFromDataStream function."""
  session = sessions.Session()
  storage_writer = fake_writer.FakeStorageWriter(session)

  knowledge_base_object = knowledge_base.KnowledgeBase()
  knowledge_base_object.SetValue('year', 2016)

  resolver_context = context.Context()
  mediator = parsers_mediator.ParserMediator(
      storage_writer, knowledge_base_object, preferred_year=2016,
      resolver_context=resolver_context)

  extraction_worker = worker.EventExtractionWorker()

  test_analyzer = analyzers_manager_test.TestAnalyzer()
  self.assertEqual(len(test_analyzer.GetResults()), 0)

  extraction_worker._analyzers = [test_analyzer]

  storage_writer.Open()
  storage_writer.WriteSessionStart()

  file_entry = self._GetTestFileEntry(['syslog.tgz'])
  mediator.SetFileEntry(file_entry)

  extraction_worker._ExtractContentFromDataStream(mediator, file_entry, '')

  storage_writer.WriteSessionCompletion()
  storage_writer.Close()
def ProcessSources(
    self, source_path_specs, storage_writer, resolver_context,
    processing_configuration, filter_find_specs=None,
    status_update_callback=None):
  """Processes the sources.

  Args:
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    resolver_context (dfvfs.Context): resolver context.
    processing_configuration (ProcessingConfiguration): processing
        configuration.
    filter_find_specs (Optional[list[dfvfs.FindSpec]]): find specifications
        used in path specification extraction.
    status_update_callback (Optional[function]): callback function for status
        updates.

  Returns:
    ProcessingStatus: processing status.
  """
  parser_mediator = parsers_mediator.ParserMediator(
      storage_writer, self.knowledge_base,
      preferred_year=processing_configuration.preferred_year,
      resolver_context=resolver_context,
      temporary_directory=processing_configuration.temporary_directory)

  parser_mediator.SetEventExtractionConfiguration(
      processing_configuration.event_extraction)

  parser_mediator.SetInputSourceConfiguration(
      processing_configuration.input_source)

  extraction_worker = worker.EventExtractionWorker(
      parser_filter_expression=(
          processing_configuration.parser_filter_expression))

  extraction_worker.SetExtractionConfiguration(
      processing_configuration.extraction)

  self._processing_configuration = processing_configuration
  self._status_update_callback = status_update_callback

  logging.debug('Processing started.')

  self._StartProfiling(extraction_worker)

  # The serializers profiler is only attached when enabled.
  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(self._serializers_profiler)

  storage_writer.Open()
  storage_writer.WriteSessionStart()

  try:
    storage_writer.WritePreprocessingInformation(self.knowledge_base)

    self._ProcessSources(
        source_path_specs, extraction_worker, parser_mediator,
        storage_writer, filter_find_specs=filter_find_specs)

  finally:
    # Always record session completion (including the aborted flag) and
    # close the storage, even when processing raised.
    storage_writer.WriteSessionCompletion(aborted=self._abort)

    storage_writer.Close()

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(None)

  self._StopProfiling(extraction_worker)

  if self._abort:
    logging.debug('Processing aborted.')
    self._processing_status.aborted = True
  else:
    logging.debug('Processing completed.')

  # Reset per-run state so the engine can be reused.
  self._processing_configuration = None
  self._status_update_callback = None

  return self._processing_status
def ProcessSources(
    self, source_configurations, storage_writer, resolver_context,
    processing_configuration, force_parser=False,
    status_update_callback=None):
  """Processes the sources.

  Args:
    source_configurations (list[SourceConfigurationArtifact]): configurations
        of the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    resolver_context (dfvfs.Context): resolver context.
    processing_configuration (ProcessingConfiguration): processing
        configuration.
    force_parser (Optional[bool]): True if a specified parser should be
        forced to be used to extract events.
    status_update_callback (Optional[function]): callback function for status
        updates.

  Returns:
    ProcessingStatus: processing status.
  """
  parser_mediator = self._CreateParserMediator(
      self.knowledge_base, resolver_context, processing_configuration)
  parser_mediator.SetStorageWriter(storage_writer)

  self._extraction_worker = worker.EventExtractionWorker(
      force_parser=force_parser, parser_filter_expression=(
          processing_configuration.parser_filter_expression))

  self._extraction_worker.SetExtractionConfiguration(
      processing_configuration.extraction)

  self._parser_mediator = parser_mediator
  self._processing_configuration = processing_configuration
  self._resolver_context = resolver_context
  self._status_update_callback = status_update_callback
  self._storage_writer = storage_writer

  logger.debug('Processing started.')

  parser_mediator.StartProfiling(
      self._processing_configuration.profiling, self._name,
      self._process_information)
  self._StartProfiling(self._processing_configuration.profiling)

  # Each profiler is only attached when enabled by the configuration.
  if self._analyzers_profiler:
    self._extraction_worker.SetAnalyzersProfiler(self._analyzers_profiler)

  if self._processing_profiler:
    self._extraction_worker.SetProcessingProfiler(self._processing_profiler)

  if self._serializers_profiler:
    self._storage_writer.SetSerializersProfiler(self._serializers_profiler)

  if self._storage_profiler:
    self._storage_writer.SetStorageProfiler(self._storage_profiler)

  self._StartStatusUpdateThread()

  # Seed the parser counters from any counts already present in storage.
  self._parsers_counter = collections.Counter({
      parser_count.name: parser_count
      for parser_count in self._storage_writer.GetAttributeContainers(
          'parser_count')})

  try:
    self._ProcessSources(source_configurations, parser_mediator)

  finally:
    # Stop the status update thread after close of the storage writer
    # so we include the storage sync to disk in the status updates.
    self._StopStatusUpdateThread()

    if self._analyzers_profiler:
      self._extraction_worker.SetAnalyzersProfiler(None)

    if self._processing_profiler:
      self._extraction_worker.SetProcessingProfiler(None)

    if self._serializers_profiler:
      self._storage_writer.SetSerializersProfiler(None)

    if self._storage_profiler:
      self._storage_writer.SetStorageProfiler(None)

    self._StopProfiling()
    parser_mediator.StopProfiling()

  # Merge the mediator's per-run parser counts into the stored counters.
  for key, value in parser_mediator.parsers_counter.items():
    parser_count = self._parsers_counter.get(key, None)
    if parser_count:
      parser_count.number_of_events += value
      self._storage_writer.UpdateAttributeContainer(parser_count)
    else:
      parser_count = counts.ParserCount(name=key, number_of_events=value)
      self._parsers_counter[key] = parser_count
      self._storage_writer.AddAttributeContainer(parser_count)

  if self._abort:
    logger.debug('Processing aborted.')
    self._processing_status.aborted = True
  else:
    logger.debug('Processing completed.')

  # Update the status view one last time.
  self._UpdateStatus()

  # Reset per-run state so the engine can be reused.
  self._extraction_worker = None
  self._file_system_cache = []
  self._parser_mediator = None
  self._processing_configuration = None
  self._resolver_context = None
  self._status_update_callback = None
  self._storage_writer = None

  return self._processing_status
def ProcessFile(options):
  """Process a file and produce profile results.

  Args:
    options: the command line arguments object; uses proto_file,
        file_to_parse and verbose.

  Returns:
    Profiling statistics (from GetStats) when options.verbose is set,
    otherwise None.
  """
  if options.proto_file and os.path.isfile(options.proto_file):
    with open(options.proto_file) as fh:
      proto_string = fh.read()

    proto = transmission_pb2.PathSpec()
    try:
      text_format.Merge(proto_string, proto)
    except text_format.ParseError as exception:
      logging.error(u'Unable to parse file, error: {}'.format(exception))
      sys.exit(1)

    serializer = protobuf_serializer.ProtobufPathSpecSerializer
    path_spec = serializer.ReadSerializedObject(proto)
  else:
    path_spec = path_spec_factory.Factory.NewPathSpec(
        definitions.TYPE_INDICATOR_OS, location=options.file_to_parse)

  file_entry = path_spec_resolver.Resolver.OpenFileEntry(path_spec)

  if file_entry is None:
    logging.error(u'Unable to open file: {0:s}'.format(
        options.file_to_parse))
    sys.exit(1)

  pre_obj = event.PreprocessObject()
  storage_queue = queue.SingleThreadedQueue()
  storage_queue_producer = queue.EventObjectQueueProducer(storage_queue)

  # Set few options the engine expects to be there.
  # TODO: Can we rather set this directly in argparse?
  options.single_process = True
  options.debug = False
  options.text_prepend = u''

  parsers = putils.FindAllParsers(pre_obj, options)
  my_worker = worker.EventExtractionWorker(
      '0', None, storage_queue_producer, pre_obj, parsers)

  # Profile the parse when verbose, otherwise just time it.
  if options.verbose:
    profiler = cProfile.Profile()
    profiler.enable()
  else:
    time_start = time.time()

  my_worker.ParseFile(file_entry)

  if options.verbose:
    profiler.disable()
  else:
    time_end = time.time()

  storage_queue_producer.SignalEndOfInput()

  event_object_consumer = PprofEventObjectQueueConsumer(storage_queue)
  event_object_consumer.ConsumeEventObjects()

  # Fixed: the original used Python 2 print statements and basestring,
  # both of which fail under Python 3.
  if not options.verbose:
    print(frontend_utils.FormatHeader('Time Used'))
    print(u'{:>20f}s'.format(time_end - time_start))

  print(frontend_utils.FormatHeader('Parsers Loaded'))
  # Accessing protected member.
  # pylint: disable=protected-access
  plugins = []
  for parser in sorted(my_worker._parsers['all']):
    print(frontend_utils.FormatOutputString('', parser.parser_name))
    parser_plugins = getattr(parser, '_plugins', [])
    plugins.extend(parser_plugins)

  print(frontend_utils.FormatHeader('Plugins Loaded'))
  for plugin in sorted(plugins):
    if isinstance(plugin, str):
      print(frontend_utils.FormatOutputString('', plugin))
    else:
      plugin_string = getattr(plugin, 'NAME', u'N/A')
      print(frontend_utils.FormatOutputString('', plugin_string))

  print(frontend_utils.FormatHeader('Parsers Used'))
  for parser in sorted(event_object_consumer.parsers):
    print(frontend_utils.FormatOutputString('', parser))

  print(frontend_utils.FormatHeader('Plugins Used'))
  for plugin in sorted(event_object_consumer.plugins):
    print(frontend_utils.FormatOutputString('', plugin))

  print(frontend_utils.FormatHeader('Counter'))
  for key, value in event_object_consumer.counter.most_common():
    print(frontend_utils.FormatOutputString(key, value))

  if options.verbose:
    return GetStats(profiler)
def _Main(self):
  """The main loop."""
  # The first ParserMediator argument (storage writer) is None here; the
  # mediator is wired to storage later per task.
  self._parser_mediator = parsers_mediator.ParserMediator(
      None, self._knowledge_base,
      preferred_year=self._preferred_year,
      temporary_directory=self._temporary_directory)

  if self._filter_object:
    self._parser_mediator.SetFilterObject(self._filter_object)

  if self._mount_path:
    self._parser_mediator.SetMountPath(self._mount_path)

  if self._text_prepend:
    self._parser_mediator.SetTextPrepend(self._text_prepend)

  # We need a resolver context per process to prevent multi processing
  # issues with file objects stored in images.
  resolver_context = context.Context()

  # We need to initialize the parser and hasher objects after the process
  # has forked otherwise on Windows the "fork" will fail with
  # a PickleError for Python modules that cannot be pickled.
  self._extraction_worker = worker.EventExtractionWorker(
      resolver_context,
      parser_filter_expression=self._parser_filter_expression,
      process_archives=self._process_archives,
      process_compressed_streams=self._process_compressed_streams)

  if self._hasher_names_string:
    self._extraction_worker.SetHashers(self._hasher_names_string)

  if self._yara_rules_string:
    self._extraction_worker.SetYaraRules(self._yara_rules_string)

  self._StartProfiling()

  logging.debug(u'Worker: {0!s} (PID: {1:d}) started'.format(
      self._name, self._pid))

  self._status = definitions.PROCESSING_STATUS_RUNNING

  try:
    logging.debug(
        u'{0!s} (PID: {1:d}) started monitoring task queue.'.format(
            self._name, self._pid))

    # Consume tasks until aborted, the queue closes/empties, or a
    # QueueAbort sentinel is dequeued.
    while not self._abort:
      try:
        task = self._task_queue.PopItem()
      except (errors.QueueClose, errors.QueueEmpty) as exception:
        logging.debug(
            u'ConsumeItems exiting with exception {0:s}.'.format(
                type(exception)))
        break

      if isinstance(task, plaso_queue.QueueAbort):
        logging.debug(u'ConsumeItems exiting, dequeued QueueAbort object.')
        break

      self._ProcessTask(task)

    logging.debug(
        u'{0!s} (PID: {1:d}) stopped monitoring task queue.'.format(
            self._name, self._pid))

  # All exceptions need to be caught here to prevent the process
  # from being killed by an uncaught exception.
  except Exception as exception:  # pylint: disable=broad-except
    logging.warning(
        u'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
            self._name, self._pid))
    logging.exception(exception)

    self._abort = True

  self._StopProfiling()

  # Release per-run objects before reporting final status.
  self._extraction_worker = None
  self._parser_mediator = None
  self._storage_writer = None

  if self._abort:
    self._status = definitions.PROCESSING_STATUS_ABORTED
  else:
    self._status = definitions.PROCESSING_STATUS_COMPLETED

  logging.debug(u'Worker: {0!s} (PID: {1:d}) stopped'.format(
      self._name, self._pid))

  try:
    self._task_queue.Close(abort=self._abort)
  except errors.QueueAlreadyClosed:
    logging.error(u'Queue for {0:s} was already closed.'.format(self.name))
def ProcessSources(
    self, session, source_path_specs, storage_writer, resolver_context,
    processing_configuration, status_update_callback=None):
  """Processes the sources.

  Args:
    session (Session): session in which the sources are processed.
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    resolver_context (dfvfs.Context): resolver context.
    processing_configuration (ProcessingConfiguration): processing
        configuration.
    status_update_callback (Optional[function]): callback function for status
        updates.

  Returns:
    ProcessingStatus: processing status.
  """
  mediator = parsers_mediator.ParserMediator(
      storage_writer, self.knowledge_base,
      collection_filters_helper=self.collection_filters_helper,
      preferred_year=processing_configuration.preferred_year,
      resolver_context=resolver_context,
      temporary_directory=processing_configuration.temporary_directory)

  parser_filter_expression = (
      processing_configuration.parser_filter_expression)
  event_worker = worker.EventExtractionWorker(
      parser_filter_expression=parser_filter_expression)
  event_worker.SetExtractionConfiguration(
      processing_configuration.extraction)

  self._processing_configuration = processing_configuration
  self._status_update_callback = status_update_callback

  logger.debug('Processing started.')

  # Enable profiling before any extraction work is done.
  mediator.StartProfiling(
      self._processing_configuration.profiling, self._name,
      self._process_information)
  self._StartProfiling(self._processing_configuration.profiling)

  if self._analyzers_profiler:
    event_worker.SetAnalyzersProfiler(self._analyzers_profiler)

  if self._processing_profiler:
    event_worker.SetProcessingProfiler(self._processing_profiler)

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(self._serializers_profiler)

  if self._storage_profiler:
    storage_writer.SetStorageProfiler(self._storage_profiler)

  storage_writer.Open()
  storage_writer.WriteSessionStart()

  # TODO: decouple session and storage writer?
  session.source_configurations = (
      self.knowledge_base.GetSourceConfigurationArtifacts())

  try:
    storage_writer.WriteSessionConfiguration()

    self._ProcessSources(
        source_path_specs, event_worker, mediator, storage_writer)

  finally:
    # Always record session completion and close the storage, even when
    # extraction was aborted or raised.
    storage_writer.WriteSessionCompletion(aborted=self._abort)

    storage_writer.Close()

  # Detach the profilers that were attached above.
  if self._analyzers_profiler:
    event_worker.SetAnalyzersProfiler(None)

  if self._processing_profiler:
    event_worker.SetProcessingProfiler(None)

  if self._serializers_profiler:
    storage_writer.SetSerializersProfiler(None)

  if self._storage_profiler:
    storage_writer.SetStorageProfiler(None)

  self._StopProfiling()
  mediator.StopProfiling()

  if self._abort:
    logger.debug('Processing aborted.')
    self._processing_status.aborted = True
  else:
    logger.debug('Processing completed.')

  self._processing_configuration = None
  self._status_update_callback = None

  return self._processing_status
def _Main(self):
  """The main loop of the extraction worker process.

  Sets up credentials, the parser mediator and the extraction worker,
  then consumes tasks from the task queue until the queue is closed or
  empty, a QueueAbort item is dequeued, or an abort is requested. On
  exit the task queue is closed and the process status is set to
  completed or aborted.
  """
  # We need a resolver context per process to prevent multi processing
  # issues with file objects stored in images.
  resolver_context = context.Context()

  for credential_configuration in self._processing_configuration.credentials:
    resolver.Resolver.key_chain.SetCredential(
        credential_configuration.path_spec,
        credential_configuration.credential_type,
        credential_configuration.credential_data)

  self._parser_mediator = parsers_mediator.ParserMediator(
      None, self._knowledge_base,
      artifacts_filter_helper=self._artifacts_filter_helper,
      preferred_year=self._processing_configuration.preferred_year,
      resolver_context=resolver_context,
      temporary_directory=self._processing_configuration.temporary_directory)

  self._parser_mediator.SetEventExtractionConfiguration(
      self._processing_configuration.event_extraction)

  self._parser_mediator.SetInputSourceConfiguration(
      self._processing_configuration.input_source)

  # We need to initialize the parser and hasher objects after the process
  # has forked otherwise on Windows the "fork" will fail with
  # a PickleError for Python modules that cannot be pickled.
  self._extraction_worker = worker.EventExtractionWorker(
      parser_filter_expression=(
          self._processing_configuration.parser_filter_expression))

  self._extraction_worker.SetExtractionConfiguration(
      self._processing_configuration.extraction)

  self._parser_mediator.StartProfiling(
      self._processing_configuration.profiling, self._name,
      self._process_information)
  self._StartProfiling(self._processing_configuration.profiling)

  if self._processing_profiler:
    self._extraction_worker.SetProcessingProfiler(self._processing_profiler)

  if self._serializers_profiler:
    self._storage_writer.SetSerializersProfiler(self._serializers_profiler)

  if self._storage_profiler:
    self._storage_writer.SetStorageProfiler(self._storage_profiler)

  logger.debug('Worker: {0!s} (PID: {1:d}) started.'.format(
      self._name, self._pid))

  self._status = definitions.STATUS_INDICATOR_RUNNING

  try:
    logger.debug('{0!s} (PID: {1:d}) started monitoring task queue.'.format(
        self._name, self._pid))

    while not self._abort:
      try:
        task = self._task_queue.PopItem()
      except (errors.QueueClose, errors.QueueEmpty) as exception:
        # Use the "!s" conversion instead of the ":s" format specification,
        # since applying ":s" to a type object raises TypeError on Python 3.
        logger.debug('ConsumeItems exiting with exception {0!s}.'.format(
            type(exception)))
        break

      if isinstance(task, plaso_queue.QueueAbort):
        logger.debug('ConsumeItems exiting, dequeued QueueAbort object.')
        break

      self._ProcessTask(task)

    logger.debug('{0!s} (PID: {1:d}) stopped monitoring task queue.'.format(
        self._name, self._pid))

  # All exceptions need to be caught here to prevent the process
  # from being killed by an uncaught exception.
  except Exception as exception:  # pylint: disable=broad-except
    logger.warning(
        'Unhandled exception in process: {0!s} (PID: {1:d}).'.format(
            self._name, self._pid))
    logger.exception(exception)

    self._abort = True

  # Detach the profilers that were attached above.
  if self._processing_profiler:
    self._extraction_worker.SetProcessingProfiler(None)

  if self._serializers_profiler:
    self._storage_writer.SetSerializersProfiler(None)

  if self._storage_profiler:
    self._storage_writer.SetStorageProfiler(None)

  self._StopProfiling()
  self._parser_mediator.StopProfiling()

  # Release per-task helper objects so the process does not keep them alive.
  self._extraction_worker = None
  self._parser_mediator = None
  self._storage_writer = None

  if self._abort:
    self._status = definitions.STATUS_INDICATOR_ABORTED
  else:
    self._status = definitions.STATUS_INDICATOR_COMPLETED

  logger.debug('Worker: {0!s} (PID: {1:d}) stopped.'.format(
      self._name, self._pid))

  try:
    self._task_queue.Close(abort=self._abort)
  except errors.QueueAlreadyClosed:
    logger.error('Queue for {0:s} was already closed.'.format(self.name))
def ProcessSources(
    self, session, source_path_specs, storage_writer, resolver_context,
    processing_configuration, force_parser=False,
    status_update_callback=None):
  """Processes the sources.

  Args:
    session (Session): session in which the sources are processed.
    source_path_specs (list[dfvfs.PathSpec]): path specifications of
        the sources to process.
    storage_writer (StorageWriter): storage writer for a session storage.
    resolver_context (dfvfs.Context): resolver context.
    processing_configuration (ProcessingConfiguration): processing
        configuration.
    force_parser (Optional[bool]): True if a specified parser should
        be forced to be used to extract events.
    status_update_callback (Optional[function]): callback function for status
        updates.

  Returns:
    ProcessingStatus: processing status.
  """
  self._resolver_context = resolver_context
  self._session = session

  parser_mediator = parsers_mediator.ParserMediator(
      session, storage_writer, self.knowledge_base,
      collection_filters_helper=self.collection_filters_helper,
      preferred_year=processing_configuration.preferred_year,
      resolver_context=resolver_context,
      temporary_directory=processing_configuration.temporary_directory)

  self._extraction_worker = worker.EventExtractionWorker(
      force_parser=force_parser,
      parser_filter_expression=(
          processing_configuration.parser_filter_expression))

  self._extraction_worker.SetExtractionConfiguration(
      processing_configuration.extraction)

  self._processing_configuration = processing_configuration
  self._status_update_callback = status_update_callback
  self._storage_writer = storage_writer

  logger.debug('Processing started.')

  # Enable profiling before any extraction work is done.
  parser_mediator.StartProfiling(
      self._processing_configuration.profiling, self._name,
      self._process_information)
  self._StartProfiling(self._processing_configuration.profiling)

  if self._analyzers_profiler:
    self._extraction_worker.SetAnalyzersProfiler(
        self._analyzers_profiler)

  if self._processing_profiler:
    self._extraction_worker.SetProcessingProfiler(
        self._processing_profiler)

  if self._serializers_profiler:
    self._storage_writer.SetSerializersProfiler(
        self._serializers_profiler)

  if self._storage_profiler:
    self._storage_writer.SetStorageProfiler(self._storage_profiler)

  self._StartStatusUpdateThread()

  try:
    self._ProcessSources(source_path_specs, parser_mediator)

  finally:
    # Stop the status update thread after close of the storage writer
    # so we include the storage sync to disk in the status updates.
    self._StopStatusUpdateThread()

  # NOTE(review): the original chunk's indentation is lost; the profiler
  # teardown below is placed after the try/finally rather than inside it —
  # confirm against the upstream file.
  if self._analyzers_profiler:
    self._extraction_worker.SetAnalyzersProfiler(None)

  if self._processing_profiler:
    self._extraction_worker.SetProcessingProfiler(None)

  if self._serializers_profiler:
    self._storage_writer.SetSerializersProfiler(None)

  if self._storage_profiler:
    self._storage_writer.SetStorageProfiler(None)

  self._StopProfiling()
  parser_mediator.StopProfiling()

  if self._abort:
    logger.debug('Processing aborted.')
    self._processing_status.aborted = True
  else:
    logger.debug('Processing completed.')

  # Update the status view one last time.
  self._UpdateStatus()

  # Release per-run helper objects and state so the engine can be reused.
  self._extraction_worker = None
  self._file_system_cache = []
  self._processing_configuration = None
  self._resolver_context = None
  self._session = None
  self._status_update_callback = None
  self._storage_writer = None

  return self._processing_status