def render_startup_script_template(instance_name: str, fuzzer: str,
                                   benchmark: str, trial_id: int,
                                   experiment_config: dict):
    """Fill in the runner startup script template for a single trial and
    return the rendered text.

    Template variables come from the arguments and from |experiment_config|.
    The cloud zone and project are only passed to the template when the
    experiment is not a local one."""
    image_url = benchmark_utils.get_runner_image_url(
        benchmark, fuzzer, experiment_config['docker_registry'])
    target = benchmark_utils.get_fuzz_target(benchmark)
    is_local = experiment_utils.is_local_experiment()
    script_template = JINJA_ENV.get_template(
        'runner-startup-script-template.sh')

    template_vars = {
        'instance_name': instance_name,
        'benchmark': benchmark,
        'experiment': experiment_config['experiment'],
        'fuzzer': fuzzer,
        'trial_id': trial_id,
        'max_total_time': experiment_config['max_total_time'],
        'experiment_filestore': experiment_config['experiment_filestore'],
        'report_filestore': experiment_config['report_filestore'],
        'fuzz_target': target,
        'docker_image_url': image_url,
        'docker_registry': experiment_config['docker_registry'],
        'local_experiment': is_local,
    }

    if is_local:
        return script_template.render(**template_vars)

    # Cloud experiments additionally need to know where the runner lives.
    template_vars['cloud_compute_zone'] = (
        experiment_config['cloud_compute_zone'])
    template_vars['cloud_project'] = experiment_config['cloud_project']
    return script_template.render(**template_vars)
def end_expired_trials(experiment_config: dict):
    """End every trial that has expired: stamp its end time, delete its
    instance (skipped for local experiments) and save the trials to the
    database. Nothing is returned.

    If instance deletion fails, the end times are not saved so that the next
    call can retry."""
    experiment = experiment_config['experiment']
    expired_trials = get_expired_trials(experiment,
                                        experiment_config['max_total_time'])

    ended_at = datetime_now()
    instance_names = []
    for expired_trial in expired_trials:
        instance_name = experiment_utils.get_trial_instance_name(
            experiment, expired_trial.id)
        instance_names.append(instance_name)
        expired_trial.time_ended = ended_at

    # |expired_trials| itself is truthy until evaluated, so test the list of
    # names collected while iterating instead.
    if not instance_names:
        return

    running_in_cloud = not experiment_utils.is_local_experiment()
    if running_in_cloud and not delete_instances(instance_names,
                                                 experiment_config):
        # We cannot tell which instances were actually deleted, so leave the
        # database untouched and let the next call to end_expired_trials try
        # again.
        logger.error('Failed to delete instances after trial expiry.')
        return

    db_utils.bulk_save(expired_trials)
def initialize(experiment_config: dict):
    """Set up what is needed to schedule measure workers: an instance
    template, an instance group built from it, and the work queue.

    Returns the queue created by queue_utils.initialize_queue."""
    logger.info('Initializing worker scheduling.')
    gce.initialize()

    experiment = experiment_config['experiment']
    project = experiment_config['project']
    template_name = get_measure_worker_instance_template_name(experiment)
    worker_image = posixpath.join(experiment_config['docker_registry'],
                                  'measure-worker:{}'.format(experiment))
    redis_host = experiment_config['redis_host']

    # Environment passed to the instance template.
    worker_env = {
        'REDIS_HOST': redis_host,
        'EXPERIMENT_FILESTORE': experiment_config['experiment_filestore'],
        'EXPERIMENT': experiment,
        'LOCAL_EXPERIMENT': experiment_utils.is_local_experiment(),
        'CLOUD_COMPUTE_ZONE': experiment_config.get('cloud_compute_zone'),
    }

    # Unlike the env entry above, the zone is mandatory from here on.
    zone = experiment_config['cloud_compute_zone']
    template_url = gcloud.create_instance_template(template_name,
                                                   worker_image, worker_env,
                                                   project, zone)

    group_name = get_instance_group_name(experiment)
    base_name = get_base_worker_instance_name(experiment)
    gce.create_instance_group(group_name, template_url, base_name, project,
                              zone)

    return queue_utils.initialize_queue(redis_host)
def create_instance(instance_name: str,
                    instance_type: InstanceType,
                    config: dict,
                    startup_script: str = None,
                    preemptible: bool = False,
                    **kwargs) -> bool:
    """Creates a GCE instance with name, |instance_name|, type,
    |instance_type| and with optionally provided |startup_script|.

    For local experiments no GCE instance is created; the startup script is
    run through run_local_instance and its result is returned instead.

    Args:
      instance_name: Name of the instance to create.
      instance_type: InstanceType.DISPATCHER selects the dispatcher machine
        settings; any other value is treated as a runner.
      config: Experiment config. Reads 'cloud_compute_zone' and, for runners,
        'runner_machine_type' (falling back to 'runner_memory' and
        'runner_num_cpu_cores' when the machine type is None).
      startup_script: Path of a startup script file to attach, if any.
      preemptible: Whether to request a preemptible instance.
      **kwargs: Forwarded to new_process.execute.

    Returns:
      True if the gcloud command exited with code 0, False otherwise."""
    if experiment_utils.is_local_experiment():
        return run_local_instance(startup_script)

    command = [
        'gcloud',
        'compute',
        'instances',
        'create',
        instance_name,
        '--image-family=cos-stable',
        '--image-project=cos-cloud',
        '--zone=%s' % config['cloud_compute_zone'],
        '--scopes=cloud-platform',
    ]
    if instance_type == InstanceType.DISPATCHER:
        command.extend([
            '--machine-type=%s' % DISPATCHER_MACHINE_TYPE,
            '--boot-disk-size=%s' % DISPATCHER_BOOT_DISK_SIZE,
            '--boot-disk-type=%s' % DISPATCHER_BOOT_DISK_TYPE,
        ])
    else:
        machine_type = config['runner_machine_type']
        if machine_type is not None:
            command.append('--machine-type=%s' % machine_type)
        else:
            # Do this to support KLEE experiments.
            # NOTE: this must be extend(), not append(). Appending the list
            # would nest it inside |command| and hand the process launcher a
            # list where it expects a string argument.
            command.extend([
                '--custom-memory=%s' % config['runner_memory'],
                '--custom-cpu=%s' % config['runner_num_cpu_cores'],
            ])
        command.extend([
            '--no-address',
            '--boot-disk-size=%s' % RUNNER_BOOT_DISK_SIZE,
        ])

    if preemptible:
        command.append('--preemptible')
    if startup_script:
        command.extend(
            ['--metadata-from-file', 'startup-script=' + startup_script])

    # A non-zero exit is reported through the return value rather than an
    # exception, hence expect_zero=False.
    result = new_process.execute(command, expect_zero=False, **kwargs)
    if result.retcode == 0:
        return True
    logs.info('Failed to create instance. Command: %s failed. Output: %s',
              command, result.output)
    return False
def dispatcher_main():
    """Run the whole experiment: set up the database, build images, start the
    scheduler and measurer, emit periodic reports until both have finished,
    then record the experiment end time."""
    logs.info('Starting experiment.')

    # The start method is set here rather than in the measurer because
    # setting it there causes failures.
    multiprocessing.set_start_method('spawn')
    db_utils.initialize()
    if experiment_utils.is_local_experiment():
        models.Base.metadata.create_all(db_utils.engine)

    config_path = _get_config_file_path()
    experiment = Experiment(config_path)

    _initialize_experiment_in_db(experiment.config)

    trials = build_images_for_trials(experiment.fuzzers,
                                     experiment.benchmarks,
                                     experiment.num_trials,
                                     experiment.preemptible,
                                     experiment.concurrent_builds)
    _initialize_trials_in_db(trials)

    create_work_subdirs(['experiment-folders', 'measurement-folders'])

    # The scheduler runs in a separate thread, the measurer in a separate
    # process.
    worker_args = (experiment.config, )
    scheduler_thread = threading.Thread(target=scheduler.schedule_loop,
                                        args=worker_args)
    scheduler_thread.start()
    measurer_process = multiprocessing.Process(
        target=measure_manager.measure_main, args=worker_args)
    measurer_process.start()

    # Keep reporting until the scheduler and the measurer have both exited.
    # The final report (with coverage) is produced on the last iteration.
    is_complete = False
    while not is_complete:
        time.sleep(LOOP_WAIT_SECONDS)
        if not scheduler_thread.is_alive():
            is_complete = not measurer_process.is_alive()

        # Generate periodic output reports.
        reporter.output_report(experiment.config,
                               in_progress=not is_complete,
                               coverage_report=is_complete)

    scheduler_thread.join()
    measurer_process.join()

    _record_experiment_time_ended(experiment.experiment_name)
    logs.info('Experiment ended.')
def create_instance(instance_name: str,
                    instance_type: InstanceType,
                    config: dict,
                    metadata: dict = None,
                    startup_script: str = None,
                    **kwargs) -> bool:
    """Creates a GCE instance named |instance_name| of type |instance_type|,
    optionally attaching |metadata| and a |startup_script|.

    For local experiments the startup script is run through
    run_local_instance instead. Returns True when the gcloud command exits
    with code 0."""
    if experiment_utils.is_local_experiment():
        return run_local_instance(startup_script)

    command = [
        'gcloud',
        'compute',
        'instances',
        'create',
        instance_name,
        '--image-family=cos-stable',
        '--image-project=cos-cloud',
        '--zone=%s' % config['cloud_compute_zone'],
        '--scopes=cloud-platform',
    ]

    # Flags that depend on whether this is the dispatcher or a runner.
    if instance_type == InstanceType.DISPATCHER:
        type_flags = [
            '--machine-type=%s' % DISPATCHER_MACHINE_TYPE,
            '--boot-disk-size=%s' % DISPATCHER_BOOT_DISK_SIZE,
            '--boot-disk-type=%s' % DISPATCHER_BOOT_DISK_TYPE,
        ]
    else:
        type_flags = [
            '--no-address',
            '--machine-type=%s' % RUNNER_MACHINE_TYPE,
            '--boot-disk-size=%s' % RUNNER_BOOT_DISK_SIZE,
        ]
        if config.get('preemptible_runners'):
            # TODO(metzman): Make runners signal to scheduler that they were
            # preempted, and make scheduler+measurer tolerate preemption.
            type_flags.append('--preemptible')
    command += type_flags

    if metadata:
        pairs = ('{key}={value}'.format(key=name, value=setting)
                 for name, setting in metadata.items())
        command += ['--metadata', ','.join(pairs)]

    if startup_script:
        command += [
            '--metadata-from-file', 'startup-script=' + startup_script
        ]

    # The first element of the execute() result is the exit code.
    retcode = new_process.execute(command, expect_zero=False, **kwargs)[0]
    return retcode == 0
def schedule_loop(experiment_config: dict):
    """Run the scheduler repeatedly until every trial of the experiment has
    ended.

    Note that this should not be called unless
    multiprocessing.set_start_method('spawn') was called first. Otherwise it
    will use fork to create the Pool which breaks logging."""
    logger.info('Starting scheduler.')
    experiment_trials = get_experiment_trials(experiment_config['experiment'])
    num_trials = len(experiment_trials.all())

    local_experiment = experiment_utils.is_local_experiment()
    if not local_experiment:
        gce.initialize()
        # Only needed for preemption handling, which never happens locally.
        trial_instance_manager = TrialInstanceManager(num_trials,
                                                      experiment_config)

    experiment = experiment_config['experiment']

    # Create the pool once and reuse it to avoid leaking threads and other
    # issues.
    with multiprocessing.Pool() as worker_pool:
        preemption_handling_on = False
        while not all_trials_ended(experiment):
            try:
                # Turn on preemption handling at most once, never when
                # running locally, and only after every initial trial has
                # been started.
                if not (local_experiment or preemption_handling_on or
                        any_pending_trials(experiment)):
                    preemption_handling_on = True

                schedule(experiment_config, worker_pool)

                if preemption_handling_on:
                    trial_instance_manager.handle_preempted_trials()
            except Exception:  # pylint: disable=broad-except
                logger.error('Error occurred during scheduling.')

            # We get here either after an unexpected exception or because
            # some trials still could not be started (for example when
            # instance quota has run out). Wait before trying again.
            time.sleep(FAIL_WAIT_SECONDS)

    logger.info('Finished scheduling.')
def create_instance(instance_name: str,
                    instance_type: InstanceType,
                    config: dict,
                    startup_script: str = None,
                    preemptible: bool = False,
                    **kwargs) -> bool:
    """Creates a GCE instance named |instance_name| of type |instance_type|,
    optionally preemptible and optionally with a |startup_script|.

    For local experiments the startup script is run through
    run_local_instance instead. Returns True when the gcloud command exits
    with code 0."""
    if experiment_utils.is_local_experiment():
        return run_local_instance(startup_script)

    zone_flag = '--zone=%s' % config['cloud_compute_zone']
    command = [
        'gcloud', 'compute', 'instances', 'create', instance_name,
        '--image-family=cos-stable', '--image-project=cos-cloud', zone_flag,
        '--scopes=cloud-platform'
    ]

    is_dispatcher = instance_type == InstanceType.DISPATCHER
    if is_dispatcher:
        command.append('--machine-type=%s' % DISPATCHER_MACHINE_TYPE)
        command.append('--boot-disk-size=%s' % DISPATCHER_BOOT_DISK_SIZE)
        command.append('--boot-disk-type=%s' % DISPATCHER_BOOT_DISK_TYPE)
    else:
        command.append('--no-address')
        command.append('--machine-type=%s' % RUNNER_MACHINE_TYPE)
        command.append('--boot-disk-size=%s' % RUNNER_BOOT_DISK_SIZE)

    if preemptible:
        command.append('--preemptible')

    if startup_script:
        command.append('--metadata-from-file')
        command.append('startup-script=' + startup_script)

    # The first element of the execute() result is the exit code.
    execution = new_process.execute(command, expect_zero=False, **kwargs)
    return execution[0] == 0
def render_startup_script_template(instance_name: str, benchmark: str,
                                   fuzzer: str, trial_id: int,
                                   experiment_config: dict):
    """Render the startup script using the template and the parameters
    provided and return the result.

    Args:
      instance_name: Name of the runner instance.
      benchmark: Benchmark the trial runs.
      fuzzer: Fuzzer variant name; it is resolved to its config with
        fuzzer_config_utils.get_by_variant_name.
      trial_id: ID of the trial.
      experiment_config: Experiment config. Reads 'experiment',
        'max_total_time', 'cloud_project', 'cloud_compute_zone' and
        'cloud_experiment_bucket'.

    Returns:
      The rendered startup script.

    Raises:
      KeyError: If a required config key is missing, or if this is a local
        experiment and HOST_GCLOUD_CONFIG is not set in the environment."""
    fuzzer_config = fuzzer_config_utils.get_by_variant_name(fuzzer)
    underlying_fuzzer_name = fuzzer_config['fuzzer']
    docker_image_url = benchmark_utils.get_runner_image_url(
        benchmark, underlying_fuzzer_name, experiment_config['cloud_project'])
    fuzz_target = benchmark_utils.get_fuzz_target(benchmark)

    # Convert additional environment variables from configuration to arguments
    # that will be passed to docker.
    additional_env = ''
    if 'env' in fuzzer_config:
        # Values may not be strings (e.g. an integer in the config), and
        # shlex.quote only accepts strings, so convert before quoting.
        additional_env = ' '.join(
            '-e {k}={v}'.format(k=k, v=shlex.quote(str(v)))
            for k, v in fuzzer_config['env'].items())

    local_experiment = experiment_utils.is_local_experiment()
    template = JINJA_ENV.get_template('runner-startup-script-template.sh')
    kwargs = {
        'instance_name': instance_name,
        'benchmark': benchmark,
        'experiment': experiment_config['experiment'],
        'fuzzer': underlying_fuzzer_name,
        'fuzzer_variant_name': fuzzer,
        'trial_id': trial_id,
        'max_total_time': experiment_config['max_total_time'],
        'cloud_project': experiment_config['cloud_project'],
        'cloud_compute_zone': experiment_config['cloud_compute_zone'],
        'cloud_experiment_bucket': experiment_config['cloud_experiment_bucket'],
        'fuzz_target': fuzz_target,
        'docker_image_url': docker_image_url,
        'additional_env': additional_env,
        'local_experiment': local_experiment,
    }
    if local_experiment:
        # Only local experiments pass the host's gcloud config to the
        # template.
        kwargs['host_gcloud_config'] = os.environ['HOST_GCLOUD_CONFIG']

    return template.render(**kwargs)
def main():
    """Entry point of the dispatcher: run the experiment and then, for
    non-local experiments, stop it.

    Returns 0 for a local experiment or when stopping the experiment
    succeeded, and 1 otherwise. Any exception raised while running the
    experiment is logged and re-raised."""
    logs.initialize(default_extras={'component': 'dispatcher'})

    try:
        dispatcher_main()
    except Exception as error:
        logs.error('Error conducting experiment.')
        raise error

    # There is nothing to stop for a local experiment.
    if experiment_utils.is_local_experiment():
        return 0

    config_path = _get_config_file_path()
    stopped = stop_experiment.stop_experiment(
        experiment_utils.get_experiment_name(), config_path)
    return 0 if stopped else 1
import tarfile import time from typing import Callable, List, Tuple from common import benchmark_utils from common import experiment_path as exp_path from common import experiment_utils from common import filesystem from common import fuzzer_utils from common import utils from common import gsutil from common import logs from experiment.build import build_utils if not experiment_utils.is_local_experiment(): import experiment.build.gcb_build as buildlib else: import experiment.build.local_build as buildlib # FIXME: Make this configurable for users with the default quota of 10. # Even though it says queueing happen, we end up exceeding limits on "get", so # be conservative. Use 30 for now since this is limit for FuzzBench service. MAX_CONCURRENT_BUILDS = 30 # Build fail retries and wait interval. NUM_BUILD_RETRIES = 3 BUILD_FAIL_WAIT = 5 * 60 BENCHMARKS_DIR = os.path.join(utils.ROOT_DIR, 'benchmarks')