def generate_new_testcase_mutations(corpus_directory,
                                    new_testcase_mutations_directory,
                                    fuzzer_name, candidate_generator):
  """Generate new testcase mutations using the existing corpus directory or
  other methods.

  Returns True if mutations are successfully generated using Radamsa or
  ML RNN. A False return signifies either that no generator was used or that
  generation of testcase mutations was unsuccessful.
  """
  generation_timeout = get_new_testcase_mutations_timeout()
  pre_mutations_filecount = shell.get_directory_file_count(
      new_testcase_mutations_directory)

  # Generate new testcase mutations using Radamsa.
  if candidate_generator == Generator.RADAMSA:
    generate_new_testcase_mutations_using_radamsa(
        corpus_directory, new_testcase_mutations_directory,
        generation_timeout)
  # Generate new testcase mutations using the ML RNN model.
  elif candidate_generator == Generator.ML_RNN:
    generate_new_testcase_mutations_using_ml_rnn(
        corpus_directory, new_testcase_mutations_directory, fuzzer_name,
        generation_timeout)

  # If new mutations were successfully generated, return True.
  if shell.get_directory_file_count(
      new_testcase_mutations_directory) > pre_mutations_filecount:
    return True

  return False
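
# For context, a minimal sketch of the Generator choices compared against
# above. This is an assumption for illustration; the real definition lives
# elsewhere in the codebase and may contain more members.
class Generator:
  """Candidate generator types (hypothetical sketch)."""
  NONE = 0
  RADAMSA = 1
  ML_RNN = 2
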
def _merge_new_units(self, target_path, corpus_dir, new_corpus_dir,
                     fuzz_corpus_dirs, arguments, stat_overrides):
  """Merge new units."""
  # Decide whether the merge step is needed at all. If the libFuzzer run
  # added no new units, there is no need to merge.
  new_units_added = shell.get_directory_file_count(new_corpus_dir)
  if not new_units_added:
    stat_overrides['new_units_added'] = 0
    logs.log('Skipped corpus merge since no new units were added by fuzzing.')
    return

  # If this times out, it's possible that we will miss some units. However, if
  # we're taking >10 minutes to load/merge the corpus, something is going very
  # wrong and we probably don't want to make things worse by adding units
  # anyway.
  merge_corpus = self._create_merge_corpus_dir()

  merge_dirs = fuzz_corpus_dirs[:]

  # Merge the new units with the initial corpus.
  if corpus_dir not in merge_dirs:
    merge_dirs.append(corpus_dir)

  old_corpus_len = shell.get_directory_file_count(corpus_dir)

  new_units_added = 0
  try:
    result = self._minimize_corpus_two_step(
        target_path=target_path,
        arguments=arguments,
        existing_corpus_dirs=merge_dirs,
        new_corpus_dir=new_corpus_dir,
        output_corpus_dir=merge_corpus,
        reproducers_dir=None,
        max_time=engine_common.get_merge_timeout(
            libfuzzer.DEFAULT_MERGE_TIMEOUT))

    libfuzzer.move_mergeable_units(merge_corpus, corpus_dir)
    new_corpus_len = shell.get_directory_file_count(corpus_dir)
    new_units_added = new_corpus_len - old_corpus_len

    stat_overrides.update(result.stats)
  except (MergeError, TimeoutError) as e:
    logs.log_warn('Merge failed.', error=repr(e))

  stat_overrides['new_units_added'] = new_units_added

  # Record the stats to make them easily searchable in Stackdriver.
  logs.log('Stats calculated.', stats=stat_overrides)
  if new_units_added:
    logs.log(f'New units added to corpus: {new_units_added}.')
  else:
    logs.log('No new units found.')
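
# For reference, a minimal sketch of the MergeError caught above, assuming it
# is a simple engine.Error subclass; the real definition may differ.
class MergeError(engine.Error):
  """Error occurred during the libFuzzer merge step."""
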
def generate_new_testcase_mutations_using_radamsa(
    corpus_directory, new_testcase_mutations_directory, generation_timeout):
  """Generate new testcase mutations based on Radamsa."""
  radamsa_path = get_radamsa_path()
  if not radamsa_path:
    # Mutations using Radamsa are not supported on the current platform, bail
    # out.
    return

  radamsa_runner = new_process.ProcessRunner(radamsa_path)
  files_list = shell.get_files_list(corpus_directory)
  filtered_files_list = [
      f for f in files_list if os.path.getsize(f) <= CORPUS_INPUT_SIZE_LIMIT
  ]
  if not filtered_files_list:
    # No mutations to do on an empty corpus or one with only very large files.
    return

  old_corpus_size = shell.get_directory_file_count(
      new_testcase_mutations_directory)
  expected_completion_time = time.time() + generation_timeout

  for i in range(RADAMSA_MUTATIONS):
    original_file_path = random_choice(filtered_files_list)
    original_filename = os.path.basename(original_file_path)
    output_path = os.path.join(
        new_testcase_mutations_directory,
        get_radamsa_output_filename(original_filename, i))

    result = radamsa_runner.run_and_wait(
        ['-o', output_path, original_file_path], timeout=RADAMSA_TIMEOUT)

    if (os.path.exists(output_path) and
        os.path.getsize(output_path) > CORPUS_INPUT_SIZE_LIMIT):
      # Remove overly large outputs so they do not hurt fuzzing efficiency in
      # further mutations.
      shell.remove_file(output_path)
    elif result.return_code or result.timed_out:
      logs.log_warn('Radamsa failed to mutate or timed out.',
                    output=result.output)

    # Check if we exceeded our timeout. If yes, do no more mutations and break.
    if time.time() > expected_completion_time:
      break

  new_corpus_size = shell.get_directory_file_count(
      new_testcase_mutations_directory)
  logs.log('Added %d tests using Radamsa mutations.' %
           (new_corpus_size - old_corpus_size))
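
# A hypothetical sketch of the filename helper used above. The exact naming
# scheme is an assumption; what matters is that the name is unique per
# (file, i) pair so mutations never overwrite each other.
def get_radamsa_output_filename(original_filename, i):
  """Return a unique output filename for the i-th Radamsa mutation."""
  return 'radamsa-%08d-%s' % (i + 1, original_filename)
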
def test(self):
  """Test get_directory_file_count."""
  self.fs.create_file('/test/aa/bb.txt', contents='abc')
  self.fs.create_file('/test/aa/cc.txt', contents='def')
  self.fs.create_file('/test/aa/aa/aa.txt', contents='ghi')
  self.fs.create_file('/test/aa/aa/dd.txt', contents='t')
  self.assertEqual(shell.get_directory_file_count('/test/aa'), 4)
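
# The test above expects a recursive count (4 files across nested
# directories). A minimal sketch of such a helper, assuming os.walk
# semantics; the real shell.get_directory_file_count may handle errors
# differently.
import os


def get_directory_file_count(directory_path):
  """Return the number of regular files under |directory_path|, recursively."""
  return sum(len(files) for _, _, files in os.walk(directory_path))
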
def run(self, initial_corpus_path, minimized_corpus_path, bad_units_path):
  """Run corpus pruning. Output result to directory."""
  if not shell.get_directory_file_count(initial_corpus_path):
    # Empty corpus, nothing to do.
    return None

  # Set memory tool options and fuzzer arguments.
  engine_common.unpack_seed_corpus_if_needed(
      self.runner.target_path, initial_corpus_path, force_unpack=True)

  environment.reset_current_memory_tool_options(
      redzone_size=MIN_REDZONE, leaks=True)
  self.runner.process_sanitizer_options()
  additional_args = self.runner.get_libfuzzer_flags()

  # Execute fuzzer with arguments for corpus pruning.
  logs.log('Running merge...')
  try:
    result = self.runner.minimize_corpus(
        additional_args, [initial_corpus_path], minimized_corpus_path,
        bad_units_path, CORPUS_PRUNING_TIMEOUT)
  except TimeoutError as e:
    raise CorpusPruningException(
        'Corpus pruning timed out while minimizing corpus\n' + repr(e))
  except engine.Error as e:
    raise CorpusPruningException(
        'Corpus pruning failed to minimize corpus\n' + repr(e))

  symbolized_output = stack_symbolizer.symbolize_stacktrace(result.logs)

  # Sanity check that there are files in the minimized corpus after merging.
  if not shell.get_directory_file_count(minimized_corpus_path):
    raise CorpusPruningException(
        'Corpus pruning failed to minimize corpus\n' + symbolized_output)

  logs.log('Corpus merge finished successfully.', output=symbolized_output)
  return result.stats
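
# For reference, a minimal sketch of the exception raised above, assuming it
# carries only a message; the real class may add more context.
class CorpusPruningException(Exception):
  """Raised when corpus pruning fails or times out."""
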
def test(self):
  """Test clear_system_temp_directory works as expected."""
  self.fs.create_file('/tmp/aa/bb.txt', contents='abc')
  self.fs.create_file('/tmp/cc/dd/ee.txt', contents='def')
  self.fs.create_dir('/tmp/ff/gg')
  self.fs.create_dir('/tmp/hh')
  self.fs.create_dir('/unrelated')
  self.fs.create_file('/unrelated/zz.txt', contents='zzz')
  os.symlink('/unrelated/zz.txt', '/tmp/hh/gg.txt')
  os.symlink('/unrelated', '/tmp/ii')

  shell.clear_system_temp_directory()

  self.assertTrue(os.path.exists('/tmp'))
  self.assertTrue(os.path.exists('/unrelated'))
  self.assertEqual(shell.get_directory_file_count('/tmp'), 0)
  self.assertEqual(shell.get_directory_file_count('/unrelated'), 1)
  self.assertFalse(os.path.exists('/tmp/aa/bb.txt'))
  self.assertFalse(os.path.exists('/tmp/cc/dd/ee.txt'))
  self.assertFalse(os.path.exists('/tmp/ff/gg'))
  self.assertFalse(os.path.exists('/tmp/hh'))
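
# A minimal sketch of the behavior the test above relies on: clear everything
# under the system temp directory without following symlinks, so that symlink
# targets outside /tmp (like /unrelated) survive. This is an illustration
# only; the real shell.clear_system_temp_directory may differ.
import os
import shutil
import tempfile


def clear_system_temp_directory():
  """Remove all entries under the system temp directory, keeping it intact."""
  temp_dir = tempfile.gettempdir()
  for entry in os.listdir(temp_dir):
    path = os.path.join(temp_dir, entry)
    if os.path.islink(path) or not os.path.isdir(path):
      os.remove(path)  # Removes the link or file itself, never its target.
    else:
      # shutil.rmtree does not follow symlinks inside the tree.
      shutil.rmtree(path, ignore_errors=True)
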
def test_generate(self):
  """Test generating a specified number of inputs."""
  # Set a large timeout value and a small count value to avoid timeout.
  timeout = 20
  expected_count = 2

  result = generator.run(
      self.input_directory,
      self.output_directory,
      self.model_path,
      timeout,
      generation_count=expected_count,
      hidden_state_size=MODEL_STATE_SIZE,
      hidden_layer_size=MODEL_LAYER_SIZE)

  # Process exits normally and does not time out.
  self.assertEqual(result.return_code, constants.ExitCode.SUCCESS)
  self.assertFalse(result.timed_out)

  actual_count = shell.get_directory_file_count(self.output_directory)
  self.assertEqual(expected_count, actual_count)
def test_empty_corpus(self):
  """Test that generation aborts for an empty corpus."""
  # Set a large timeout value and a small count value to avoid timeout.
  timeout = 20
  expected_count = 2

  result = generator.run(
      self.empty_directory,
      self.output_directory,
      self.model_path,
      timeout,
      generation_count=expected_count,
      hidden_state_size=MODEL_STATE_SIZE,
      hidden_layer_size=MODEL_LAYER_SIZE)

  self.assertEqual(result.return_code, constants.ExitCode.CORPUS_TOO_SMALL)
  self.assertFalse(result.timed_out)

  # No new units.
  actual_count = shell.get_directory_file_count(self.output_directory)
  self.assertEqual(actual_count, 0)
def test_timeout(self):
  """Test the timeout case in generation."""
  # Set a small timeout value and a large count value to trigger timeout.
  # Note that the timeout cannot be set too small, since it takes time to
  # start the generator. If this test fails, please increase the timeout
  # value.
  timeout = 10
  result = generator.run(
      self.input_directory,
      self.output_directory,
      self.model_path,
      timeout,
      generation_count=sys.maxsize,
      hidden_state_size=MODEL_STATE_SIZE,
      hidden_layer_size=MODEL_LAYER_SIZE)

  # Process timed out.
  self.assertNotEqual(result.return_code, constants.ExitCode.SUCCESS)
  self.assertTrue(result.timed_out)

  actual_count = shell.get_directory_file_count(self.output_directory)
  self.assertGreater(actual_count, 0)
def run(self, timeout):
  """Merge testcases from the corpora of other fuzz targets."""
  if not shell.get_directory_file_count(self.context.shared_corpus_path):
    logs.log('No files found in shared corpus, skipping merge.')
    return None

  # Run pruning on the shared corpus and log the result in case of error.
  logs.log('Merging shared corpus...')
  environment.reset_current_memory_tool_options(redzone_size=DEFAULT_REDZONE)
  self.runner.process_sanitizer_options()

  additional_args = self.runner.get_libfuzzer_flags()

  try:
    result = self.runner.minimize_corpus(
        additional_args, [self.context.shared_corpus_path],
        self.context.minimized_corpus_path, self.context.bad_units_path,
        timeout)
    symbolized_output = stack_symbolizer.symbolize_stacktrace(result.logs)
    logs.log('Shared corpus merge finished successfully.',
             output=symbolized_output)
  except TimeoutError as e:
    # Corpora cross-pollinated from other fuzzers can have unexpected test
    # cases that time us out. This is expected, so bail out.
    logs.log_warn('Corpus pruning timed out while merging shared corpus\n' +
                  repr(e))
    return None
  except engine.Error as e:
    # Corpora cross-pollinated from other fuzzers can be large, so we can run
    # out of disk space and exception out. This is expected, so bail out.
    logs.log_warn('Corpus pruning failed to merge shared corpus\n' + repr(e))
    return None

  return result.stats
def test_invalid_model(self):
  """Test that TensorFlow throws an exception if the model does not match."""
  # Set a large timeout value and a small count value to avoid timeout.
  timeout = 20
  expected_count = 2

  # Change model parameters to make the demo model invalid.
  invalid_state_size = 8

  result = generator.run(
      self.input_directory,
      self.output_directory,
      self.model_path,
      timeout,
      generation_count=expected_count,
      hidden_state_size=invalid_state_size,
      hidden_layer_size=MODEL_LAYER_SIZE)

  self.assertEqual(result.return_code, constants.ExitCode.TENSORFLOW_ERROR)
  self.assertFalse(result.timed_out)

  # No new units.
  actual_count = shell.get_directory_file_count(self.output_directory)
  self.assertEqual(actual_count, 0)
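
# For reference, a hypothetical sketch of the exit codes asserted on in the
# tests above. The names come from the tests; the numeric values are
# assumptions, not the real constants module.
class ExitCode:
  SUCCESS = 0
  CORPUS_TOO_SMALL = 1  # Assumed value.
  TENSORFLOW_ERROR = 2  # Assumed value.
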
def execute(input_directory, output_directory, fuzzer_name,
            generation_timeout):
  """Execute the ML RNN generator to produce new inputs.

  This method should be called inside the launcher to generate a number of
  new inputs based on the ML RNN model. It fetches the ML model from the GCS
  bucket specified in the environment variable `CORPUS_BUCKET`. The script
  that runs the model resides in the folder `tools/fuzzers/ml/rnn`.

  Args:
    input_directory: Seed corpus path. The directory must not be empty.
    output_directory: The directory to place generated inputs.
    fuzzer_name: Name of the fuzzer, e.g. libpng_read_fuzzer. It indicates
        the subdirectory in the GCS bucket where models are stored.
    generation_timeout: Time in seconds for the generator to run. Normally it
        takes <1s to generate an input, assuming the input length is <4KB.
  """
  if environment.platform() != 'LINUX':
    logs.log('Unsupported platform for ML RNN generation, skipping.')
    return

  # Validate corpus folder.
  file_count = shell.get_directory_file_count(input_directory)
  if not file_count:
    logs.log('Corpus is empty. Skipping generation.')
    return

  # Number of existing new inputs. They were possibly generated by other
  # generators.
  old_corpus_units = shell.get_directory_file_count(output_directory)
  old_corpus_bytes = shell.get_directory_size(output_directory)

  # Get model path.
  model_path = prepare_model_directory(fuzzer_name)
  if not model_path:
    return

  result = run(input_directory, output_directory, model_path,
               generation_timeout)

  # The generation process exited abnormally but not due to timeout, meaning
  # an error occurred during execution.
  if result.return_code and not result.timed_out:
    if result.return_code == constants.ExitCode.CORPUS_TOO_SMALL:
      logs.log_warn(
          'ML RNN generation for fuzzer %s aborted due to small corpus.' %
          fuzzer_name)
    else:
      logs.log_error(
          'ML RNN generation for fuzzer %s failed with ExitCode = %d.' %
          (fuzzer_name, result.return_code),
          output=utils.decode_to_unicode(result.output))
    return

  # A timeout is not an error if we have new units generated.
  if result.timed_out:
    logs.log_warn('ML RNN generation for fuzzer %s timed out.' % fuzzer_name)

  new_corpus_units = (
      shell.get_directory_file_count(output_directory) - old_corpus_units)
  new_corpus_bytes = (
      shell.get_directory_size(output_directory) - old_corpus_bytes)
  if new_corpus_units:
    logs.log('Added %d new inputs (%d bytes) using ML RNN generator for %s.' %
             (new_corpus_units, new_corpus_bytes, fuzzer_name))
  else:
    logs.log_error(
        'ML RNN generator did not produce any inputs for %s' % fuzzer_name,
        output=utils.decode_to_unicode(result.output))
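
# A minimal usage sketch of execute() with hypothetical paths and a
# hypothetical time budget; in production it is invoked by the launcher with
# directories it manages.
execute(
    input_directory='/corpus/libpng_read_fuzzer',  # Hypothetical path.
    output_directory='/new-units/libpng_read_fuzzer',  # Hypothetical path.
    fuzzer_name='libpng_read_fuzzer',
    generation_timeout=40 * 60)  # Hypothetical budget in seconds.
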
def prepare(self, corpus_dir, target_path, build_dir):
  """Prepare for a fuzzing session by generating options.

  Args:
    corpus_dir: The main corpus directory.
    target_path: Path to the target.
    build_dir: Path to the build directory.

  Returns:
    A FuzzOptions object.
  """
  del build_dir
  arguments = fuzzer.get_arguments(target_path)
  grammar = fuzzer.get_grammar(target_path)

  if self.do_strategies:
    strategy_pool = strategy_selection.generate_weighted_strategy_pool(
        strategy_list=strategy.LIBFUZZER_STRATEGY_LIST,
        use_generator=True,
        engine_name=self.name)
  else:
    strategy_pool = strategy_selection.StrategyPool()

  strategy_info = libfuzzer.pick_strategies(strategy_pool, target_path,
                                            corpus_dir, arguments, grammar)

  arguments.extend(strategy_info.arguments)

  # Check for a seed corpus and add it into the corpus directory.
  engine_common.unpack_seed_corpus_if_needed(target_path, corpus_dir)

  # Pick a few testcases from our corpus to use as the initial corpus.
  subset_size = engine_common.random_choice(
      engine_common.CORPUS_SUBSET_NUM_TESTCASES)

  if (not strategy_info.use_dataflow_tracing and
      strategy_pool.do_strategy(strategy.CORPUS_SUBSET_STRATEGY) and
      shell.get_directory_file_count(corpus_dir) > subset_size):
    # Copy |subset_size| testcases into the 'subset' directory.
    corpus_subset_dir = self._create_temp_corpus_dir('subset')
    libfuzzer.copy_from_corpus(corpus_subset_dir, corpus_dir, subset_size)
    strategy_info.fuzzing_strategies.append(
        strategy.CORPUS_SUBSET_STRATEGY.name + '_' + str(subset_size))
    strategy_info.additional_corpus_dirs.append(corpus_subset_dir)
  else:
    strategy_info.additional_corpus_dirs.append(corpus_dir)

  # Check the dict argument to make sure that it's valid.
  dict_path = fuzzer_utils.extract_argument(
      arguments, constants.DICT_FLAG, remove=False)
  if dict_path and not os.path.exists(dict_path):
    logs.log_error(f'Invalid dict {dict_path} for {target_path}.')
    fuzzer_utils.extract_argument(arguments, constants.DICT_FLAG)

  # If there's no dict argument, check for a %target_binary_name%.dict file.
  dict_path = fuzzer_utils.extract_argument(
      arguments, constants.DICT_FLAG, remove=False)
  if not dict_path:
    dict_path = dictionary_manager.get_default_dictionary_path(target_path)
    if os.path.exists(dict_path):
      arguments.append(constants.DICT_FLAG + dict_path)

  # If we have a dictionary, correct any items that are not formatted
  # properly (e.g. quote items that are missing quotes).
  dictionary_manager.correct_if_needed(dict_path)

  strategies = stats.process_strategies(
      strategy_info.fuzzing_strategies, name_modifier=lambda x: x)

  return LibFuzzerOptions(corpus_dir, arguments, strategies,
                          strategy_info.additional_corpus_dirs,
                          strategy_info.extra_env,
                          strategy_info.use_dataflow_tracing,
                          strategy_info.is_mutations_run)
def _count_corpus_files(directory):
  """Count the number of corpus files."""
  return shell.get_directory_file_count(directory)
def do_corpus_pruning(context, last_execution_failed, revision):
  """Run corpus pruning."""
  # Set the |FUZZ_TARGET| environment variable to help with unarchiving only
  # the fuzz target and its related files.
  environment.set_value('FUZZ_TARGET', context.fuzz_target.binary)

  if environment.is_trusted_host():
    from clusterfuzz._internal.bot.untrusted_runner import tasks_host
    return tasks_host.do_corpus_pruning(context, last_execution_failed,
                                        revision)

  if not build_manager.setup_build(revision=revision):
    raise CorpusPruningException('Failed to setup build.')

  build_directory = environment.get_value('BUILD_DIR')
  start_time = datetime.datetime.utcnow()
  runner = Runner(build_directory, context)
  pruner = CorpusPruner(runner)
  fuzzer_binary_name = os.path.basename(runner.target_path)

  # If our last execution failed, shrink to a randomized corpus of usable size
  # to prevent the corpus from growing unbounded and causing recurring
  # failures when trying to minimize it.
  if last_execution_failed:
    for corpus_url in [
        context.corpus.get_gcs_url(),
        context.quarantine_corpus.get_gcs_url()
    ]:
      _limit_corpus_size(corpus_url)

  # Get the initial corpus to process from GCS.
  context.sync_to_disk()
  initial_corpus_size = shell.get_directory_file_count(
      context.initial_corpus_path)

  # Restore a small batch of quarantined units back to the corpus.
  context.restore_quarantined_units()

  # Shrink to a minimized corpus using corpus merge.
  pruner_stats = pruner.run(context.initial_corpus_path,
                            context.minimized_corpus_path,
                            context.bad_units_path)

  # Sync the minimized corpus back to GCS.
  context.sync_to_gcs()

  # Create corpus backup.
  # Temporarily copy the past crash regressions folder into the minimized
  # corpus so that the corpus backup archive has both.
  regressions_input_dir = os.path.join(context.initial_corpus_path,
                                       'regressions')
  regressions_output_dir = os.path.join(context.minimized_corpus_path,
                                        'regressions')
  if shell.get_directory_file_count(regressions_input_dir):
    shutil.copytree(regressions_input_dir, regressions_output_dir)

  backup_bucket = environment.get_value('BACKUP_BUCKET')
  corpus_backup_url = corpus_manager.backup_corpus(
      backup_bucket, context.corpus, context.minimized_corpus_path)

  shell.remove_directory(regressions_output_dir)

  minimized_corpus_size_units = shell.get_directory_file_count(
      context.minimized_corpus_path)
  minimized_corpus_size_bytes = shell.get_directory_size(
      context.minimized_corpus_path)

  logs.log('Corpus pruned from %d to %d units.' %
           (initial_corpus_size, minimized_corpus_size_units))

  # Process bad units found during merge.
  # Mapping of crash state -> CorpusCrash.
  crashes = {}
  pruner.process_bad_units(context.bad_units_path,
                           context.quarantine_corpus_path, crashes)
  context.quarantine_corpus.rsync_from_disk(context.quarantine_corpus_path)

  # Store corpus stats in a CoverageInformation entity.
  project_qualified_name = context.fuzz_target.project_qualified_name()
  today = datetime.datetime.utcnow().date()
  coverage_info = data_types.CoverageInformation(
      fuzzer=project_qualified_name, date=today)

  quarantine_corpus_size = shell.get_directory_file_count(
      context.quarantine_corpus_path)
  quarantine_corpus_dir_size = shell.get_directory_size(
      context.quarantine_corpus_path)

  # Save the minimized corpus size before cross-pollination to put in
  # BigQuery.
  pre_pollination_corpus_size = minimized_corpus_size_units

  # Populate coverage stats.
  coverage_info.corpus_size_units = minimized_corpus_size_units
  coverage_info.corpus_size_bytes = minimized_corpus_size_bytes
  coverage_info.quarantine_size_units = quarantine_corpus_size
  coverage_info.quarantine_size_bytes = quarantine_corpus_dir_size
  coverage_info.corpus_backup_location = corpus_backup_url
  coverage_info.corpus_location = context.corpus.get_gcs_url()
  coverage_info.quarantine_location = context.quarantine_corpus.get_gcs_url()

  # Calculate remaining time to use for shared corpus merging.
  time_remaining = _get_time_remaining(start_time)
  if time_remaining <= 0:
    logs.log_warn('Not enough time for shared corpus merging.')
    return None

  cross_pollinator = CrossPollinator(runner)
  pollinator_stats = cross_pollinator.run(time_remaining)

  context.sync_to_gcs()

  # Update corpus size stats.
  minimized_corpus_size_units = shell.get_directory_file_count(
      context.minimized_corpus_path)
  minimized_corpus_size_bytes = shell.get_directory_size(
      context.minimized_corpus_path)
  coverage_info.corpus_size_units = minimized_corpus_size_units
  coverage_info.corpus_size_bytes = minimized_corpus_size_bytes

  logs.log('Finished.')

  sources = ','.join([
      fuzzer.fuzz_target.project_qualified_name()
      for fuzzer in context.cross_pollinate_fuzzers
  ])

  cross_pollination_stats = None
  if pruner_stats and pollinator_stats:
    cross_pollination_stats = CrossPollinationStats(
        project_qualified_name, context.cross_pollination_method, sources,
        context.tag, initial_corpus_size, pre_pollination_corpus_size,
        pruner_stats['edge_coverage'], pollinator_stats['edge_coverage'],
        pruner_stats['feature_coverage'],
        pollinator_stats['feature_coverage'])

  return CorpusPruningResult(
      coverage_info=coverage_info,
      crashes=list(crashes.values()),
      fuzzer_binary_name=fuzzer_binary_name,
      revision=environment.get_value('APP_REVISION'),
      cross_pollination_stats=cross_pollination_stats)