def conduct_trial(self):
    """Conduct the benchmarking trial.

    Launches the fuzzer in a background thread, then syncs results to the
    filestore once per cycle until the fuzzer thread exits, finishing with
    a final sync.
    """
    self.initialize_directories()
    fuzzer_log_path = os.path.join(self.results_dir, 'fuzzer-log.txt')
    logs.info('Starting trial.')

    max_total_time = environment.get('MAX_TOTAL_TIME')
    fuzz_thread = threading.Thread(target=run_fuzzer,
                                   args=(max_total_time, fuzzer_log_path))
    fuzz_thread.start()

    if environment.get('FUZZ_OUTSIDE_EXPERIMENT'):
        # Hack so that the fuzz_thread has some time to fail if something is
        # wrong. Without this we will sleep for a long time before checking
        # if the fuzz thread is alive.
        time.sleep(5)

    # Sync on a fixed cadence for as long as the fuzzer is running.
    while fuzz_thread.is_alive():
        self.sleep_until_next_sync()
        self.do_sync()
        self.cycle += 1

    logs.info('Doing final sync.')
    self.do_sync(final_sync=True)
    fuzz_thread.join()
def main():
    """Do an experiment on a development machine or on a GCP runner
    instance.

    Initializes structured logging with per-trial context pulled from the
    environment, then hands off to experiment_main(). Returns 0 on success.
    """
    default_extras = {
        'benchmark': environment.get('BENCHMARK'),
        'component': 'runner',
        'fuzzer': environment.get('FUZZER'),
        'trial_id': str(environment.get('TRIAL_ID')),
    }
    logs.initialize(default_extras=default_extras)
    experiment_main()
    return 0
def run_fuzzer(max_total_time, log_filename):
    """Runs the fuzzer using its script. Logs stdout and stderr of the fuzzer
    script to |log_filename| if provided.

    Prepares the seed corpus, then runs `fuzzer.fuzz(...)` in a child
    python3 process under `nice`. Sets the module-level flag
    fuzzer_errored_out when the child exits nonzero.
    """
    input_corpus = environment.get('SEED_CORPUS_DIR')
    output_corpus = environment.get('OUTPUT_CORPUS_DIR')
    fuzz_target_name = environment.get('FUZZ_TARGET')
    target_binary = fuzzer_utils.get_fuzz_target_binary(
        FUZZ_TARGET_DIR, fuzz_target_name)
    if not target_binary:
        logs.error('Fuzz target binary not found.')
        return

    _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
    _clean_seed_corpus(input_corpus)

    if max_total_time is None:
        logs.warning('max_total_time is None. Fuzzing indefinitely.')

    runner_niceness = environment.get('RUNNER_NICENESS', 0)

    try:
        # One-liner handed to `python3 -c`; paths are shell-quoted because
        # they are spliced into python source text.
        fuzz_snippet = ('import fuzzer; '
                        'fuzzer.fuzz('
                        "'{input_corpus}', '{output_corpus}', "
                        "'{target_binary}')").format(
                            input_corpus=shlex.quote(input_corpus),
                            output_corpus=shlex.quote(output_corpus),
                            target_binary=shlex.quote(target_binary))
        # Because the runner is launched at a higher priority,
        # set it back to the default(0) for fuzzing processes.
        command = [
            'nice', '-n',
            str(0 - runner_niceness), 'python3', '-u', '-c', fuzz_snippet
        ]
        fuzzer_environment = _get_fuzzer_environment()

        if environment.get('FUZZ_OUTSIDE_EXPERIMENT'):
            # Fuzzing from the command line: show output on stdout.
            new_process.execute(command,
                                timeout=max_total_time,
                                write_to_stdout=True,
                                kill_children=True,
                                env=fuzzer_environment)
        else:
            # In an experiment: capture output in the log file.
            with open(log_filename, 'wb') as log_file:
                new_process.execute(command,
                                    timeout=max_total_time,
                                    output_file=log_file,
                                    kill_children=True,
                                    env=fuzzer_environment)
    except subprocess.CalledProcessError:
        global fuzzer_errored_out  # pylint:disable=invalid-name
        fuzzer_errored_out = True
        logs.error('Fuzz process returned nonzero.')
def _clean_seed_corpus(seed_corpus_dir):
    """Prepares |seed_corpus_dir| for the trial. This ensures that it can be
    used by AFL which is picky about the seed corpus. Moves seed corpus files
    from sub-directories into the corpus directory root. Also, deletes any
    files that exceed the 1 MB limit. If the NO_SEEDS env var is specified
    than the seed corpus files are deleted."""
    if not os.path.exists(seed_corpus_dir):
        return

    if environment.get('NO_SEEDS'):
        # Start from an empty (but existing) seed directory.
        logs.info('NO_SEEDS specified, deleting seed corpus files.')
        shutil.rmtree(seed_corpus_dir)
        os.mkdir(seed_corpus_dir)
        return

    move_failures = []
    for directory, _, basenames in os.walk(seed_corpus_dir):
        for basename in basenames:
            seed_path = os.path.join(directory, basename)

            # Drop oversized seeds up-front.
            if os.path.getsize(seed_path) > CORPUS_ELEMENT_BYTES_LIMIT:
                os.remove(seed_path)
                logs.warning('Removed seed file %s as it exceeds 1 Mb limit.',
                             seed_path)
                continue

            # Flatten into the corpus root, named by content hash so
            # duplicates collapse to one file.
            content_hash = utils.file_hash(seed_path)
            flattened_path = os.path.join(seed_corpus_dir, content_hash)
            try:
                shutil.move(seed_path, flattened_path)
            except OSError:
                move_failures.append((seed_path, flattened_path))

    if move_failures:
        logs.error('Failed to move seed corpus files: %s', move_failures)
def record_stats(self):
    """Use fuzzer.get_stats if it is offered, validate the stats and then
    save them to a file so that they will be synced to the filestore.

    Stats support is optional: if the fuzzer module has no get_stats
    attribute this is a no-op. Invalid or un-parseable stats are logged
    and dropped rather than allowed to break the runner.
    """
    # TODO(metzman): Make this more resilient so we don't wait forever and
    # so that breakages in stats parsing doesn't break runner.
    fuzzer_module = get_fuzzer_module(self.fuzzer)
    fuzzer_module_get_stats = getattr(fuzzer_module, 'get_stats', None)
    if fuzzer_module_get_stats is None:
        # Stats support is optional.
        return

    try:
        output_corpus = environment.get('OUTPUT_CORPUS_DIR')
        stats_json_str = fuzzer_module_get_stats(output_corpus,
                                                 self.log_file)
    except Exception:  # pylint: disable=broad-except
        # Fixed: '%d' cannot format a function object and would itself
        # error during logging; '%s' logs the callable's repr.
        logs.error('Call to %s failed.', fuzzer_module_get_stats)
        return

    try:
        fuzzer_stats.validate_fuzzer_stats(stats_json_str)
    except (ValueError, json.decoder.JSONDecodeError):
        logs.error('Stats are invalid.')
        return

    stats_filename = experiment_utils.get_stats_filename(self.cycle)
    stats_path = os.path.join(self.results_dir, stats_filename)
    with open(stats_path, 'w') as stats_file_handle:
        stats_file_handle.write(stats_json_str)
def get_runner_image_url(experiment, benchmark, fuzzer, docker_registry):
    """Get the URL of the docker runner image for fuzzing the benchmark with
    fuzzer.

    Local experiments use the 'latest' tag; otherwise the image is tagged
    with the experiment name.
    """
    if environment.get('LOCAL_EXPERIMENT'):
        tag = 'latest'
    else:
        tag = experiment
    return '{docker_registry}/runners/{fuzzer}/{benchmark}:{tag}'.format(
        docker_registry=docker_registry,
        fuzzer=fuzzer,
        benchmark=benchmark,
        tag=tag)
def gsutil_command(arguments, *args, parallel=True, **kwargs):
    """Executes a gsutil command with |arguments| and returns the result.

    Skipped entirely (returning a success-shaped (0, '') tuple) when
    fuzzing outside of an experiment, since there is no filestore to
    talk to.
    """
    if environment.get('FUZZ_OUTSIDE_EXPERIMENT'):
        logger.info('FUZZ_OUTSIDE_EXPERIMENT set, not running \'gsutil %s\'.',
                    ' '.join(arguments))
        return 0, ''
    # '-m' enables gsutil's parallel (multi-threaded/multi-processing) mode.
    command = ['gsutil', '-m'] if parallel else ['gsutil']
    return new_process.execute(command + arguments, *args, **kwargs)
def _unpack_clusterfuzz_seed_corpus(fuzz_target_path, corpus_directory):
    """If a clusterfuzz seed corpus archive is available, unpack it into the
    corpus directory if it exists. Copied from unpack_seed_corpus in
    engine_common.py in ClusterFuzz.
    """
    oss_fuzz_corpus = environment.get('OSS_FUZZ_CORPUS')
    if oss_fuzz_corpus:
        # OSS-Fuzz benchmark: fetch the benchmark's corpus archive from the
        # OSS-Fuzz corpora filestore into the fuzz target directory.
        benchmark = environment.get('BENCHMARK')
        corpus_archive_filename = f'{benchmark}.zip'
        oss_fuzz_corpus_archive_path = posixpath.join(
            experiment_utils.get_oss_fuzz_corpora_filestore_path(),
            corpus_archive_filename)
        seed_corpus_archive_path = posixpath.join(FUZZ_TARGET_DIR,
                                                  corpus_archive_filename)
        filestore_utils.cp(oss_fuzz_corpus_archive_path,
                           seed_corpus_archive_path)
    else:
        # Standard benchmark: look for a ClusterFuzz-style seed corpus
        # archive associated with the fuzz target binary.
        seed_corpus_archive_path = get_clusterfuzz_seed_corpus_path(
            fuzz_target_path)

    if not seed_corpus_archive_path:
        # No seed corpus available; nothing to unpack.
        return

    with zipfile.ZipFile(seed_corpus_archive_path) as zip_file:
        # Unpack seed corpus recursively into the root of the main corpus
        # directory.
        idx = 0
        for seed_corpus_file in zip_file.infolist():
            if seed_corpus_file.filename.endswith('/'):
                # Ignore directories.
                continue

            # Allow callers to opt-out of unpacking large files.
            if seed_corpus_file.file_size > CORPUS_ELEMENT_BYTES_LIMIT:
                continue

            # Each member is renumbered with a zero-padded sequential name.
            # NOTE(review): ZipFile.extract treats its second argument as the
            # extraction *directory*, so the member lands at
            # output_file_path/<original member path> rather than being
            # written to output_file_path itself — confirm this nesting is
            # intended.
            output_filename = '%016d' % idx
            output_file_path = os.path.join(corpus_directory, output_filename)
            zip_file.extract(seed_corpus_file, output_file_path)
            idx += 1

    logs.info('Unarchived %d files from seed corpus %s.', idx,
              seed_corpus_archive_path)
def __init__(self):
    # Where trial results are synced in the filestore; None when fuzzing
    # outside of an experiment (no syncing happens then).
    if environment.get('FUZZ_OUTSIDE_EXPERIMENT'):
        self.gcs_sync_dir = None
    else:
        benchmark = environment.get('BENCHMARK')
        fuzzer = environment.get('FUZZER')
        trial_id = environment.get('TRIAL_ID')
        self.gcs_sync_dir = experiment_utils.get_trial_bucket_dir(
            fuzzer, benchmark, trial_id)
        # Remove any leftovers from a previous run of this trial.
        filestore_utils.rm(self.gcs_sync_dir, force=True, parallel=True)

    self.cycle = 1
    self.corpus_dir = 'corpus'
    self.corpus_archives_dir = 'corpus-archives'
    self.results_dir = 'results'
    self.unchanged_cycles_path = os.path.join(self.results_dir,
                                              'unchanged-cycles')
    self.last_sync_time = None
    self.corpus_dir_contents = set()
def main():
    """Set up Redis connection and start the experiment.

    Reads the experiment config (path overridable via EXPERIMENT_CONFIG),
    validates/expands it, and runs the experiment inside an rq connection
    context.
    """
    redis_connection = redis.Redis(host="queue-server")

    config_path = environment.get('EXPERIMENT_CONFIG',
                                  'fuzzbench/local-experiment-config.yaml')
    raw_config = yaml_utils.read(config_path)
    config = config_utils.validate_and_expand(raw_config)

    with rq.Connection(redis_connection):
        return run_experiment(config)
def run_fuzzer(max_total_time, log_filename):
    """Runs the fuzzer using its script. Logs stdout and stderr of the fuzzer
    script to |log_filename| if provided.

    Prepares the seed corpus and then runs `fuzzer.fuzz(...)` in a child
    python3 process under `nice`, capturing its output in |log_filename|.
    """
    input_corpus = environment.get('SEED_CORPUS_DIR')
    output_corpus = environment.get('OUTPUT_CORPUS_DIR')
    fuzz_target_name = environment.get('FUZZ_TARGET')
    target_binary = fuzzer_utils.get_fuzz_target_binary(
        FUZZ_TARGET_DIR, fuzz_target_name)
    if not target_binary:
        logs.error('Fuzz target binary not found.')
        return

    _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
    _clean_seed_corpus(input_corpus)

    if max_total_time is None:
        logs.warning('max_total_time is None. Fuzzing indefinitely.')

    runner_niceness = environment.get('RUNNER_NICENESS', 0)

    # One-liner handed to `python3 -c`; paths are shell-quoted because they
    # are spliced into python source text.
    fuzz_snippet = ('import fuzzer; '
                    'fuzzer.fuzz('
                    "'{input_corpus}', '{output_corpus}', "
                    "'{target_binary}')").format(
                        input_corpus=shlex.quote(input_corpus),
                        output_corpus=shlex.quote(output_corpus),
                        target_binary=shlex.quote(target_binary))
    # Because the runner is launched at a higher priority,
    # set it back to the default(0) for fuzzing processes.
    command = [
        'nice', '-n',
        str(0 - runner_niceness), 'python3', '-u', '-c', fuzz_snippet
    ]

    try:
        with open(log_filename, 'w') as log_file:
            new_process.execute(command,
                                timeout=max_total_time,
                                output_files=[log_file],
                                kill_children=True,
                                env=_get_fuzzer_environment())
    except subprocess.CalledProcessError:
        logs.error('Fuzz process returned nonzero.')
def gsutil_command(arguments, *args, parallel=True, **kwargs):
    """Executes a gsutil command with |arguments| and returns the result.

    Skipped entirely (returning a success-shaped (0, '') tuple) in local
    experiments, where there is no cloud filestore.
    """
    if environment.get('LOCAL_EXPERIMENT'):
        logger.info('LOCAL_EXPERIMENT set, not running \'gsutil %s\'.',
                    ' '.join(arguments))
        return 0, ''

    # '-m' enables gsutil's parallel (multi-threaded/multi-processing) mode.
    command = ['gsutil', '-m'] if parallel else ['gsutil']
    write_to_stdout = kwargs.pop('write_to_stdout', False)
    return new_process.execute(command + arguments,
                               *args,
                               write_to_stdout=write_to_stdout,
                               **kwargs)
def copy_coverage_binaries(benchmark):
    """Copy coverage binaries in a local experiment.

    Runs the benchmark's coverage builder image with the shared coverage
    binaries directory bind-mounted, archiving /out (plus /src and /work)
    into a tarball in that shared directory. Returns the result of the
    docker run.
    """
    shared_coverage_binaries_dir = get_shared_coverage_binaries_dir()
    # Bind-mount the shared dir at the same path inside the container.
    mount_arg = '{0}:{0}'.format(shared_coverage_binaries_dir)
    builder_image_url = benchmark_utils.get_builder_image_url(
        benchmark, 'coverage', environment.get('DOCKER_REGISTRY'))

    coverage_build_archive = 'coverage-build-{}.tar.gz'.format(benchmark)
    archive_path = os.path.join(shared_coverage_binaries_dir,
                                coverage_build_archive)
    command = 'cd /out; tar -czvf {} * /src /work'.format(archive_path)
    return new_process.execute([
        'docker', 'run', '-v', mount_arg, builder_image_url, '/bin/bash',
        '-c', command
    ])
def get_fuzzers_with_not_enough_samples(
        benchmark_snapshot_df, threshold=_DEFAULT_FUZZER_SAMPLE_NUM_THRESHOLD):
    """Returns fuzzers that didn't have enough trials running at snapshot
    time.

    It takes a benchmark snapshot and finds the fuzzers that have a sample
    size smaller than 80% of the largest sample size. Default threshold can
    be overridden.
    """
    # Allow overriding threshold with environment variable as well.
    threshold = environment.get('FUZZER_SAMPLE_NUM_THRESHOLD', threshold)

    sample_counts = benchmark_snapshot_df.fuzzer.value_counts()
    cutoff = threshold * sample_counts.max()
    underpowered = sample_counts[sample_counts < cutoff]
    return underpowered.index.tolist()
def archive_corpus(self):
    """Archive this cycle's corpus.

    Returns the path of the created archive.
    """
    archive_name = experiment_utils.get_corpus_archive_name(self.cycle)
    archive = os.path.join(self.corpus_archives_dir, archive_name)

    directories = [self.corpus_dir]
    if self.cycle == 1:
        # Some fuzzers like eclipser and LibFuzzer don't actually copy the
        # seed/input corpus to the output corpus (which AFL does do), this
        # results in their coverage being undercounted.
        directories.append(environment.get('SEED_CORPUS_DIR'))

    archive_directories(directories, archive)
    return archive
def get_benchmark_snapshot(benchmark_df,
                           threshold=_MIN_FRACTION_OF_ALIVE_TRIALS_AT_SNAPSHOT):
    """Finds the latest time where |threshold| fraction of the trials were
    still running. In most cases, this is the end of the experiment.
    However, if less than |threshold| fraction of the trials reached the
    end of the experiment, then we will use an earlier "snapshot" time for
    comparing results.

    Returns a data frame that only contains the measurements of the picked
    snapshot time.
    """
    # Allow overriding threshold with environment variable as well.
    threshold = environment.get('BENCHMARK_SAMPLE_NUM_THRESHOLD', threshold)

    num_trials = benchmark_df.trial_id.nunique()
    counts_by_time = benchmark_df.time.value_counts()
    # Times at which at least |threshold| of the trials were alive.
    alive_enough = counts_by_time[counts_by_time >= threshold * num_trials]
    snapshot_time = alive_enough.index.max()
    return benchmark_df[benchmark_df.time == snapshot_time]
def conduct_trial(self):
    """Conduct the benchmarking trial.

    Starts the fuzzer in a background thread and syncs results once per
    cycle until the thread exits, ending with a final sync.
    """
    self.initialize_directories()
    fuzzer_log_path = os.path.join(self.results_dir, 'fuzzer-log.txt')
    logs.info('Starting trial.')

    fuzz_thread = threading.Thread(
        target=run_fuzzer,
        args=(environment.get('MAX_TOTAL_TIME'), fuzzer_log_path))
    fuzz_thread.start()

    while fuzz_thread.is_alive():
        self.sleep_until_next_sync()
        self.do_sync()
        self.cycle += 1

    logs.info('Doing final sync.')
    self.do_sync(final_sync=True)
    fuzz_thread.join()
def copy_coverage_binaries(benchmark):
    """Copy coverage binaries in a local experiment.

    Archives /out from the benchmark's coverage builder image into a shared
    directory, then copies the archive to its GCS location. Returns the
    result of the gsutil copy.
    """
    shared_coverage_binaries_dir = get_shared_coverage_binaries_dir()
    # Bind-mount the shared dir at the same path inside the container.
    mount_arg = '{0}:{0}'.format(shared_coverage_binaries_dir)
    builder_image_url = benchmark_utils.get_builder_image_url(
        benchmark, 'coverage', environment.get('CLOUD_PROJECT'))

    coverage_build_archive = 'coverage-build-{}.tar.gz'.format(benchmark)
    archive_path = os.path.join(shared_coverage_binaries_dir,
                                coverage_build_archive)
    command = 'cd /out; tar -czvf {} *'.format(archive_path)
    new_process.execute([
        'docker', 'run', '-v', mount_arg, builder_image_url, '/bin/bash',
        '-c', command
    ])

    coverage_binaries_dir = build_utils.get_coverage_binaries_dir()
    gcs_archive_path = posixpath.join(exp_path.gcs(coverage_binaries_dir),
                                      coverage_build_archive)
    return gsutil.cp(archive_path, gcs_archive_path)
def get_benchmark_snapshot(benchmark_df,
                           threshold=_DEFAULT_BENCHMARK_SAMPLE_NUM_THRESHOLD):
    """Finds the latest time where 80% of the trials were still running.

    In most cases, this is the end of the experiment. In this case, we
    won't consider the <20% of the trials that ended early for our
    analysis. If more than 20% of the trials ended early, it's better to
    pick an earlier snapshot time. The 80% can be overridden using the
    |threshold| argument. E.g., to find the latest time where each trials
    were running, set |threshold| to 1.0.

    Returns data frame that only contains the measurements of the picked
    snapshot time.
    """
    # Allow overriding threshold with environment variable as well.
    threshold = environment.get('BENCHMARK_SAMPLE_NUM_THRESHOLD', threshold)

    num_trials = benchmark_df.trial_id.nunique()
    counts_by_time = benchmark_df.time.value_counts()
    # Strictly more than |threshold| fraction of trials must be alive.
    alive_enough = counts_by_time[counts_by_time > threshold * num_trials]
    snapshot_time = alive_enough.index.max()
    return benchmark_df[benchmark_df.time == snapshot_time]
def __init__(self):
    benchmark_fuzzer_directory = '%s-%s' % (environment.get(
        'BENCHMARK'), environment.get('FUZZER_VARIANT_NAME'))

    # Where trial results are synced in GCS; None when fuzzing outside of
    # an experiment (no syncing happens then).
    if environment.get('FUZZ_OUTSIDE_EXPERIMENT'):
        self.gcs_sync_dir = None
    else:
        bucket = environment.get('CLOUD_EXPERIMENT_BUCKET')
        experiment_name = environment.get('EXPERIMENT')
        trial = 'trial-%d' % environment.get('TRIAL_ID')
        self.gcs_sync_dir = posixpath.join(bucket, experiment_name,
                                           'experiment-folders',
                                           benchmark_fuzzer_directory, trial)
        # Clean the directory before we use it.
        gsutil.rm(self.gcs_sync_dir, force=True)

    self.cycle = 1
    self.corpus_dir = 'corpus'
    self.corpus_archives_dir = 'corpus-archives'
    self.results_dir = 'results'
    self.unchanged_cycles_path = os.path.join(self.results_dir,
                                              'unchanged-cycles')
    self.last_sync_time = None
    self.corpus_dir_contents = set()
def is_local_experiment():
    """Returns True if running a local experiment."""
    local_experiment = environment.get('LOCAL_EXPERIMENT')
    return bool(local_experiment)
def get_snapshot_seconds():
    """Returns the amount of time in seconds between snapshots of a fuzzer's
    corpus during an experiment."""
    snapshot_period = environment.get('SNAPSHOT_PERIOD',
                                      DEFAULT_SNAPSHOT_SECONDS)
    return snapshot_period