Exemple #1
0
def end_expired_trials(experiment_config: dict):
    """Stamp expired trials as ended and delete their running instances.

    Fetches the trials that get_expired_trials reports for
    experiment_config['experiment'] and experiment_config['max_total_time'],
    sets time_ended on each of them, deletes those of their instances that
    gcloud still lists, and finally saves the trials with
    db_utils.bulk_save. Nothing is saved when no trial has expired or when
    the instance deletion reports failure. Always returns None.
    """
    experiment = experiment_config['experiment']
    expired_trials = get_expired_trials(experiment,
                                        experiment_config['max_total_time'])
    instance_names = []
    ended_at = datetime_now()
    for expired_trial in expired_trials:
        instance_name = experiment_utils.get_trial_instance_name(
            experiment, expired_trial.id)
        instance_names.append(instance_name)
        expired_trial.time_ended = ended_at

    # Test the list built in the loop rather than expired_trials itself,
    # which stays truthy until it has been evaluated.
    if not instance_names:
        return

    # Only instances that gcloud still lists need to be deleted.
    live_instances = gcloud.list_instances()
    doomed_instances = [
        name for name in instance_names if name in live_instances
    ]
    if doomed_instances:
        deleted = gcloud.delete_instances(
            doomed_instances,
            experiment_config['cloud_compute_zone'],
            write_to_stdout=False)
        if not deleted:
            # We cannot tell which instances were actually removed, so leave
            # the database untouched and let the next call to
            # end_expired_trials retry.
            return

    db_utils.bulk_save(expired_trials)
Exemple #2
0
def stop_experiment(experiment_name, experiment_config_filename):
    """Stop the experiment specified by |experiment_config_filename|.

    Deletes the dispatcher instance and the trial instances of
    |experiment_name| that gcloud currently lists.

    Args:
        experiment_name: Name of the experiment to stop.
        experiment_config_filename: Config file read with yaml_utils.read;
            its 'cloud_compute_zone' entry is the zone passed to gcloud.

    Returns:
        True if there was nothing to delete or the deletion succeeded, False
        if gcloud.delete_instances reported failure.

    Raises:
        NotImplementedError: If the config sets 'local_experiment'.
    """
    experiment_config = yaml_utils.read(experiment_config_filename)

    if experiment_config.get('local_experiment', False):
        raise NotImplementedError(
            'Local experiment stop logic is not implemented.')

    instances = gcloud.list_instances()

    cloud_compute_zone = experiment_config['cloud_compute_zone']
    # The trailing '-' keeps an experiment whose name merely begins with
    # |experiment_name| (e.g. "exp1" vs. "exp10") from having its trial
    # instances deleted as well. An experiment named
    # "<experiment_name>-<something>" would still match.
    # NOTE(review): assumes trial instances are named
    # 'r-<experiment>-<trial id>' -- confirm against
    # experiment_utils.get_trial_instance_name.
    trial_prefix = 'r-' + experiment_name + '-'
    experiment_instances = [
        instance for instance in instances if instance.startswith(trial_prefix)
    ]
    dispatcher_instance = experiment_utils.get_dispatcher_instance_name(
        experiment_name)
    if dispatcher_instance not in instances:
        logger.warning('Dispatcher instance not running, skip.')
    else:
        experiment_instances.append(dispatcher_instance)

    # Nothing is running for this experiment, which counts as success.
    if not experiment_instances:
        logger.warning('No experiment instances found, no work to do.')
        return True

    if not gcloud.delete_instances(experiment_instances, cloud_compute_zone):
        logger.error('Failed to stop experiment instances.')
        return False

    logger.info('Successfully stopped experiment.')
    return True
Exemple #3
0
def delete_instances(instances, experiment_config):
    """Deletes those of |instances| that gcloud currently lists.

    Args:
        instances: Names of the instances to delete.
        experiment_config: Experiment config dict; its 'cloud_compute_zone'
            entry is the zone passed to gcloud.

    Returns:
        True when none of |instances| is listed by gcloud (there is nothing
        to delete), otherwise the result of gcloud.delete_instances.
    """
    running_instances = gcloud.list_instances()
    instances_to_delete = [i for i in instances if i in running_instances]
    if not instances_to_delete:
        # Nothing left to delete counts as success; skip the delete call
        # rather than invoking gcloud with an empty instance list.
        return True
    return gcloud.delete_instances(instances_to_delete,
                                   experiment_config['cloud_compute_zone'])