def initialize(experiment_config: dict):
    """Create the resources needed to schedule measure workers.

    Initializes GCE, creates an instance template and an instance group for
    the experiment's measure workers, and then initializes the work queue.

    Args:
        experiment_config: Experiment configuration. The keys 'experiment',
            'project', 'docker_registry', 'redis_host',
            'experiment_filestore' and 'cloud_compute_zone' are read from it.

    Returns:
        The queue returned by queue_utils.initialize_queue for the configured
        redis host.
    """
    logger.info('Initializing worker scheduling.')
    gce.initialize()

    experiment = experiment_config['experiment']
    project = experiment_config['project']
    template_name = get_measure_worker_instance_template_name(experiment)

    # The worker image is tagged with the experiment name.
    registry = experiment_config['docker_registry']
    image_tag = 'measure-worker:{}'.format(experiment)
    docker_image = posixpath.join(registry, image_tag)

    # Environment handed to the instance template for the workers.
    redis_host = experiment_config['redis_host']
    worker_env = {
        'REDIS_HOST': redis_host,
        'EXPERIMENT_FILESTORE': experiment_config['experiment_filestore'],
        'EXPERIMENT': experiment,
        'LOCAL_EXPERIMENT': experiment_utils.is_local_experiment(),
        'CLOUD_COMPUTE_ZONE': experiment_config.get('cloud_compute_zone'),
    }

    # Unlike the environment entry above, the zone is mandatory from here on.
    zone = experiment_config['cloud_compute_zone']
    template_url = gcloud.create_instance_template(template_name,
                                                   docker_image, worker_env,
                                                   project, zone)

    group_name = get_instance_group_name(experiment)
    base_name = get_base_worker_instance_name(experiment)
    gce.create_instance_group(group_name, template_url, base_name, project,
                              zone)

    return queue_utils.initialize_queue(redis_host)
def stop_experiment(experiment_name, experiment_config_filename):
    """Delete the cloud instances that belong to |experiment_name|.

    The configuration is read from |experiment_config_filename|. The
    dispatcher instance (if it is listed) and every listed instance whose
    name starts with 'r-' followed by the experiment name are deleted.

    Returns:
        True if there were no instances to delete or if deletion succeeded,
        False if gcloud.delete_instances reported a failure.

    Raises:
        NotImplementedError: If the configuration marks the experiment as a
            local experiment.
    """
    config = yaml_utils.read(experiment_config_filename)
    if config.get('local_experiment', False):
        raise NotImplementedError(
            'Local experiment stop logic is not implemented.')

    project = config['cloud_project']
    zone = config['cloud_compute_zone']

    gce.initialize()
    listed_instances = list(gce.get_instances(project, zone))

    # Collect the dispatcher first, then the trial instances.
    to_delete = []
    dispatcher = experiment_utils.get_dispatcher_instance_name(experiment_name)
    if dispatcher in listed_instances:
        to_delete.append(dispatcher)
    else:
        logger.warning('Dispatcher instance not running, skip.')

    prefix = 'r-' + experiment_name
    for instance_name in listed_instances:
        if instance_name.startswith(prefix):
            to_delete.append(instance_name)

    if not to_delete:
        logger.warning('No experiment instances found, no work to do.')
        return True

    if gcloud.delete_instances(to_delete, zone):
        logger.info('Successfully stopped experiment.')
        return True

    logger.error('Failed to stop experiment instances.')
    return False
def main():
    """Run the measure worker scheduler as a standalone script.

    Reads the experiment config from the path given as the first command line
    argument, initializes scheduling and then calls schedule forever, pausing
    30 seconds between calls. Useful for debugging.
    """
    log_extras = {
        'experiment': os.environ['EXPERIMENT'],
        'component': 'dispatcher',
        'subcomponent': 'scheduler'
    }
    logs.initialize(default_extras=log_extras)
    gce.initialize()

    config = yaml_utils.read(sys.argv[1])
    queue = initialize(config)

    # Never returns; the script is expected to be interrupted by the user.
    while True:
        schedule(config, queue)
        time.sleep(30)
# Example #4
# 0
def schedule_loop(experiment_config: dict):
    """Keep running the scheduler until all trials of the experiment ended.

    Note: multiprocessing.set_start_method('spawn') must have been called
    before this function. Otherwise the Pool is created using fork, which
    breaks logging.

    Args:
        experiment_config: Experiment configuration; the 'experiment' key is
            read here and the whole dict is passed on to the scheduler.
    """
    logger.info('Starting scheduler.')
    experiment = experiment_config['experiment']
    num_trials = len(get_experiment_trials(experiment).all())

    is_local = experiment_utils.is_local_experiment()
    if not is_local:
        # The instance manager is only created (and only used) for
        # non-local experiments.
        gce.initialize()
        trial_instance_manager = TrialInstanceManager(num_trials,
                                                      experiment_config)

    # Create the process pool once and reuse it for every scheduling pass.
    with multiprocessing.Pool() as pool:
        preemption_handling_enabled = False
        while True:
            if all_trials_ended(experiment):
                break

            try:
                # Enable preemption handling at most once, never when running
                # locally, and only after no trial is pending anymore, i.e.
                # every initial trial was started.
                if not (is_local or preemption_handling_enabled or
                        any_pending_trials(experiment)):
                    preemption_handling_enabled = True

                schedule(experiment_config, pool)
                if preemption_handling_enabled:
                    trial_instance_manager.handle_preempted_trials()
            except Exception:  # pylint: disable=broad-except
                logger.error('Error occurred during scheduling.')

            # Reaching this point means either an unexpected exception
            # happened or some trials remain that could not be started yet
            # (for example because instance quota ran out). Wait before the
            # next attempt.
            time.sleep(FAIL_WAIT_SECONDS)

    logger.info('Finished scheduling.')