Example #1
def end_expired_trials(experiment_config: dict):
    """Get all expired trials, end them and return them."""
    trials_past_expiry = get_expired_trials(
        experiment_config['experiment'], experiment_config['max_total_time'])
    expired_instances = []
    current_dt = datetime_now()
    for trial in trials_past_expiry:
        expired_instances.append(
            experiment_utils.get_trial_instance_name(
                experiment_config['experiment'], trial.id))
        trial.time_ended = current_dt

    # Bail out here because trials_past_expiry will be truthy until evaluated.
    if not expired_instances:
        return

    # Delete instances for expired trials.
    running_instances = gcloud.list_instances()
    instances_to_delete = [
        i for i in expired_instances if i in running_instances
    ]
    if instances_to_delete and not gcloud.delete_instances(
            instances_to_delete,
            experiment_config['cloud_compute_zone'],
            write_to_stdout=False):
        # If we failed to delete some instances, then don't update the status
        # of expired trials in database as we don't know which instances were
        # successfully deleted. Wait for next iteration of end_expired_trials.
        return

    db_utils.bulk_save(trials_past_expiry)
Example #2
def _get_preemption_operation(trial_id, exp_conf):
    zone_url = (
        'https://www.googleapis.com/compute/v1/projects/{project}/zones/'
        '{zone}').format(zone=exp_conf['cloud_compute_zone'],
                         project=exp_conf['cloud_project'])
    instance_name = experiment_utils.get_trial_instance_name(
        exp_conf['experiment'], trial_id)
    target_link = posixpath.join('instances', zone_url, instance_name)
    name = 'systemevent-blah'
    self_link = posixpath.join(zone_url, name)
    return {
        'id': '1',
        'name': name,
        'zone': zone_url,
        'operationType': 'compute.instances.preempted',
        'targetLink': target_link,
        'targetId': '1',
        'status': 'DONE',
        'statusMessage': 'Instance was preempted.',
        'user': '******',
        'progress': 100,
        'insertTime': '2020-01-24T29:16:46.842-02:00',
        'startTime': '2020-01-24T29:16:46.842-02:00',
        'endTime': '2020-01-24T29:16:46.842-02:00',
        'selfLink': self_link,
        'kind': 'compute#operation'
    }
Example #3
def end_expired_trials(experiment_config: dict):
    """Get all expired trials, end them and return them."""
    trials_past_expiry = get_expired_trials(
        experiment_config['experiment'], experiment_config['max_total_time'])
    expired_instances = []
    current_dt = datetime_now()
    for trial in trials_past_expiry:
        expired_instances.append(
            experiment_utils.get_trial_instance_name(
                experiment_config['experiment'], trial.id))
        trial.time_ended = current_dt

    # Bail out here because trials_past_expiry will be truthy until evaluated.
    if not expired_instances:
        return

    if not experiment_utils.is_local_experiment() and not delete_instances(
            expired_instances, experiment_config):
        # If we failed to delete some instances, then don't update the status
        # of expired trials in database as we don't know which instances were
        # successfully deleted. Wait for next iteration of end_expired_trials.
        logger.error('Failed to delete instances after trial expiry.')
        return

    db_utils.bulk_save(trials_past_expiry)
Example #4
 def _get_started_unfinished_instances(self) -> Dict[str, models.Trial]:
     """Returns a dictionary of instance names to trials for trials were
     started but not finished according to the database."""
     experiment = self.experiment_config['experiment']
     running_trials = get_running_trials(experiment)
     return {
         experiment_utils.get_trial_instance_name(experiment, trial.id):
         trial for trial in running_trials
     }
Example #5
def _get_preempted_instance_item(trial_id, exp_conf):
    instance_name = experiment_utils.get_trial_instance_name(
        exp_conf['experiment'], trial_id)
    return {
        'id': '1',
        'name': instance_name,
        'status': 'TERMINATED',
        'scheduling': {
            'preemptible': True,
        }
    }
Example #6
def test_get_preempted_trials_stale_preempted(_, preempt_exp_conf):
    """Tests that TrialInstanceManager.get_preempted_trials doesn't return
    trials that we already know were preempted."""
    trial_instance_manager = get_trial_instance_manager(preempt_exp_conf)
    trial = models.Trial(experiment=preempt_exp_conf['experiment'],
                         fuzzer=FUZZER,
                         benchmark=BENCHMARK)
    db_utils.add_all([trial])
    instance_name = experiment_utils.get_trial_instance_name(
        preempt_exp_conf['experiment'], trial.id)
    trial_instance_manager.preempted_trials = {instance_name: trial}
    with mock.patch(
            'experiment.scheduler.TrialInstanceManager.'
            '_get_started_unfinished_instances',
            return_value=[instance_name]):
        assert trial_instance_manager.get_preempted_trials() == []
Example #7
def create_trial_instance(benchmark: str, fuzzer: str, trial_id: int,
                          experiment_config: dict) -> bool:
    """Create or start a trial instance for a specific
    trial_id,fuzzer,benchmark."""
    instance_name = experiment_utils.get_trial_instance_name(
        experiment_config['experiment'], trial_id)
    startup_script = render_startup_script_template(instance_name, benchmark,
                                                    fuzzer, trial_id,
                                                    experiment_config)
    startup_script_path = '/tmp/%s-start-docker.sh' % instance_name
    with open(startup_script_path, 'w') as file_handle:
        file_handle.write(startup_script)

    return gcloud.create_instance(instance_name,
                                  gcloud.InstanceType.RUNNER,
                                  experiment_config,
                                  startup_script=startup_script_path)
Example #8
def create_trial_instance(benchmark: str, fuzzer: str, trial_id: int,
                          experiment_config: dict) -> bool:
    """Create or start a trial instance for a specific
    trial_id,fuzzer,benchmark."""
    instance_name = experiment_utils.get_trial_instance_name(
        experiment_config['experiment'], trial_id)
    fuzzer_config = fuzzer_config_utils.get_by_variant_name(fuzzer)
    underlying_fuzzer_name = fuzzer_config['fuzzer']
    docker_image_url = benchmark_utils.get_runner_image_url(
        benchmark, underlying_fuzzer_name, experiment_config['cloud_project'])
    fuzz_target = benchmark_utils.get_fuzz_target(benchmark)

    # Convert additional environment variables from configuration to arguments
    # that will be passed to docker.
    additional_env = ''
    if 'env' in fuzzer_config:
        additional_env = ' '.join([
            '-e {k}={v}'.format(k=k, v=shlex.quote(v))
            for k, v in fuzzer_config['env'].items()
        ])

    startup_script = '''#!/bin/bash
echo 0 > /proc/sys/kernel/yama/ptrace_scope
echo core >/proc/sys/kernel/core_pattern

while ! docker pull {docker_image_url}
do
  echo 'Error pulling image, retrying...'
done

docker run --privileged --cpuset-cpus=0 --rm \
-e INSTANCE_NAME={instance_name} -e FUZZER={fuzzer} -e BENCHMARK={benchmark} \
-e FUZZER_VARIANT_NAME={fuzzer_variant_name} -e EXPERIMENT={experiment} \
-e TRIAL_ID={trial_id} -e MAX_TOTAL_TIME={max_total_time} \
-e CLOUD_PROJECT={cloud_project} -e CLOUD_COMPUTE_ZONE={cloud_compute_zone} \
-e CLOUD_EXPERIMENT_BUCKET={cloud_experiment_bucket} \
-e FUZZ_TARGET={fuzz_target} {additional_env} \
--cap-add SYS_NICE --cap-add SYS_PTRACE --name=runner-container \
{docker_image_url} 2>&1 | tee /tmp/runner-log.txt'''.format(
        instance_name=instance_name,
        benchmark=benchmark,
        experiment=experiment_config['experiment'],
        fuzzer=underlying_fuzzer_name,
        fuzzer_variant_name=fuzzer,
        trial_id=trial_id,
        max_total_time=experiment_config['max_total_time'],
        cloud_project=experiment_config['cloud_project'],
        cloud_compute_zone=experiment_config['cloud_compute_zone'],
        cloud_experiment_bucket=experiment_config['cloud_experiment_bucket'],
        fuzz_target=fuzz_target,
        docker_image_url=docker_image_url,
        additional_env=additional_env)

    startup_script_path = '/tmp/%s-start-docker.sh' % instance_name
    with open(startup_script_path, 'w') as file_handle:
        file_handle.write(startup_script)

    return gcloud.create_instance(instance_name,
                                  gcloud.InstanceType.RUNNER,
                                  experiment_config,
                                  startup_script=startup_script_path,
                                  write_to_stdout=False)
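
For illustration, the env-conversion step in Example #8 above can be pulled out as a standalone sketch. The helper name below is ours, not part of the original module, and the input shape (a plain dict under the fuzzer config's 'env' key) is only what the example appears to assume.

import shlex


def env_to_docker_args(env: dict) -> str:
    """Turn {'VAR': 'value'} into '-e VAR=value' docker flags, shell-quoting
    values. Hypothetical helper; the example above inlines this logic."""
    return ' '.join(
        '-e {k}={v}'.format(k=key, v=shlex.quote(value))
        for key, value in env.items())


# A value containing spaces gets quoted, so the rendered docker command
# remains a single valid shell invocation.
print(env_to_docker_args({'AFL_SKIP_CPUFREQ': '1', 'EXTRA': 'a b'}))
# -e AFL_SKIP_CPUFREQ=1 -e EXTRA='a b'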
Example #9
    def handle_preempted_trials(self):
        """Handle preempted trials by marking them as preempted and creating
        replacement trials when appropriate.
        This is the algorithm used by handle_preempted_trials:

        1. Query the GCE API to find trials that were preempted since our last
        query (or since the start of the experiment on our first query).

        2. For every preempted trial, ensure that it was not handled before;
        if it wasn't, mark the trial as finished and preempted and create a
        replacement trial if appropriate.

        This is how it is determined whether a preempted trial should be
        replaced and what it should be replaced with:

        1. First we see if we can replace it with a preemptible instance. We
        will replace it with a preemptible instance if:

          a. We haven't created more than double the number of preemptible
          trial instances that this experiment would take if it were using
          non-preemptibles ("target_trials"). This bounds the cost of our
          preemptible usage to <2X the cost of using preemptibles naively.
          If preemptibles are 20% of the cost of non-preemptibles, this is
          <40% of the cost of a non-preemptible experiment.

          b. We haven't spent longer than 3X the duration the experiment would
          take if using nonpreemptibles. This bounds the duration of the
          experiment to 4X the length of the nonpreemptible experiment.

        2. If we can't create a preemptible replacement, we replace it with a
        nonpreemptible if:

          a. We haven't created more than target_trials/20 nonpreemptibles
          already. This bounds the cost of the nonpreemptibles to 5% of the cost
          of a 100% nonpreemptible experiment.

          b. (TODO): Using nonpreemptibles will actually help the results of
          this experiment. If we can't create any preemptible instances but we
          need to replace target_trials number of instances, replacing the tiny
          fraction of them with nonpreemptibles will give you a 5% complete
          experiment. This is a hard issue to solve, because we restart trials
          as they are preempted, so we may not determine it is futile to use
          nonpreemptibles until the last nonpreemptible above our limit is
          reached.

        3. TODO: There are other cases where we probably shouldn't replace
        trials that we haven't implemented, but would like to such as:

          a. If a trial is preempted very close to the end of its budgeted time.
          In that case it's probably fine if the comparison on the benchmark
          happens at 22:45 instead of 23:00.

          b. If a trial is the only trial for the fuzzer-benchmark that was
          preempted. In that case, not replacing the trial will save time and
          not hurt results much.

        The impact of this algorithm is that:

        1. The cost of a preemptible experiment, in the worst-case scenario, is
        45% of the cost of a nonpreemptible experiment. On average we find it
        will be ~30% of the cost of a nonpreemptible experiment.

        2. The time of an experiment will be 4X the length of a nonpreemptible
        experiment in the worst-case scenario. This is fine, however, because
        most of the experiment will finish earlier; only a few trials that
        won't change results very much will trickle in at the end.

        3. Experiments are guaranteed to terminate but results won't necessarily
        be complete if the preemption rate is pathologically high. This is
        acceptable because a human should intervene in these edge cases.
        """
        logger.info('Handling preempted.')
        if not self.experiment_config.get('preemptible_runners'):
            # Nothing to do here if not a preemptible experiment.
            return []

        preempted_trials = self.get_preempted_trials()
        if not preempted_trials:
            logs.info('No preempteds to handle.')
            return []

        replacements = self._get_preempted_replacements(preempted_trials)
        experiment = self.experiment_config['experiment']
        instances = [
            experiment_utils.get_trial_instance_name(experiment, trial.id)
            for trial in preempted_trials
        ]

        logs.info('Deleting preempted instances: %s', instances)
        if not delete_instances(instances, self.experiment_config):
            logs.error('Could not delete preempted instances: %s', instances)

        db_utils.add_all(preempted_trials + replacements)
        logger.info('Done handling preempted.')
        return replacements
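
The budgeting rules in the docstring above can be illustrated with a small, self-contained sketch. This only restates the stated bounds (at most 2X target_trials preemptible creations, at most 3X the nonpreemptible experiment duration, and at most target_trials/20 nonpreemptible replacements); the function and parameter names are hypothetical, not the actual TrialInstanceManager API.

def can_replace_with_preemptible(preemptibles_created: int,
                                 elapsed_time: float,
                                 target_trials: int,
                                 nonpreemptible_duration: float) -> bool:
    """Sketch of rules 1a and 1b: a preemptible replacement is allowed while
    both the creation budget and the time budget hold."""
    # 1a. Cap preemptible creations at 2X the trials a nonpreemptible
    # experiment would need, bounding cost to <2X naive preemptible usage.
    within_creation_budget = preemptibles_created < 2 * target_trials
    # 1b. Cap elapsed time at 3X the nonpreemptible duration, bounding the
    # whole experiment to 4X that duration.
    within_time_budget = elapsed_time < 3 * nonpreemptible_duration
    return within_creation_budget and within_time_budget


def can_replace_with_nonpreemptible(nonpreemptibles_created: int,
                                    target_trials: int) -> bool:
    """Sketch of rule 2a: nonpreemptible replacements are capped at
    target_trials / 20, i.e. 5% of a fully nonpreemptible experiment."""
    return nonpreemptibles_created < target_trials // 20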
Example #10
def test_get_trial_instance_name():
    """Tests that get_trial_instance_name returns the expected result."""
    assert experiment_utils.get_trial_instance_name('experiment-a',
                                                    9) == 'r-experiment-a-9'
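
Judging from the expected value in the test above, get_trial_instance_name appears to format names as 'r-<experiment>-<trial_id>'. A minimal sketch assuming that format (the real implementation lives in experiment_utils and may differ):

def get_trial_instance_name(experiment: str, trial_id: int) -> str:
    # Sketch derived from the test's expected output, not copied from
    # experiment_utils.
    return 'r-{experiment}-{trial_id}'.format(experiment=experiment,
                                              trial_id=trial_id)


assert get_trial_instance_name('experiment-a', 9) == 'r-experiment-a-9'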