def initialize(experiment_config: dict):
    """Initialize everything that will be needed to schedule measurers.

    Creates the measure worker instance template and instance group, then
    initializes the queue.

    Args:
        experiment_config: Experiment configuration. The keys 'experiment',
            'project', 'docker_registry', 'redis_host',
            'experiment_filestore' and 'cloud_compute_zone' are required.

    Returns:
        The queue returned by queue_utils.initialize_queue.

    Raises:
        KeyError: If a required key is missing from |experiment_config|.
    """
    logger.info('Initializing worker scheduling.')
    gce.initialize()

    experiment = experiment_config['experiment']
    project = experiment_config['project']
    instance_template_name = get_measure_worker_instance_template_name(
        experiment)
    docker_image = posixpath.join(experiment_config['docker_registry'],
                                  'measure-worker:{}'.format(experiment))
    redis_host = experiment_config['redis_host']
    experiment_filestore = experiment_config['experiment_filestore']
    local_experiment = experiment_utils.is_local_experiment()

    # The zone is required (it is needed to create the template and the
    # instance group), so look it up exactly once with [] instead of mixing
    # a lenient .get() with a strict lookup of the same key.
    zone = experiment_config['cloud_compute_zone']

    # Environment handed to the instance template for the measure workers.
    env = {
        'REDIS_HOST': redis_host,
        'EXPERIMENT_FILESTORE': experiment_filestore,
        'EXPERIMENT': experiment,
        'LOCAL_EXPERIMENT': local_experiment,
        'CLOUD_COMPUTE_ZONE': zone,
    }

    instance_template_url = gcloud.create_instance_template(
        instance_template_name, docker_image, env, project, zone)

    instance_group_name = get_instance_group_name(experiment)
    base_instance_name = get_base_worker_instance_name(experiment)
    gce.create_instance_group(instance_group_name, instance_template_url,
                              base_instance_name, project, zone)

    queue = queue_utils.initialize_queue(redis_host)
    return queue
def stop_experiment(experiment_name, experiment_config_filename):
    """Stop the experiment specified by |experiment_config_filename|.

    Collects the dispatcher instance (when it is listed among the project's
    instances) together with every instance whose name starts with 'r-'
    followed by |experiment_name|, and passes them to
    gcloud.delete_instances.

    Returns True when the deletion succeeded or when there was nothing to
    delete, and False when the deletion failed. Raises NotImplementedError
    if the config marks the experiment as local.
    """
    config = yaml_utils.read(experiment_config_filename)
    if config.get('local_experiment', False):
        raise NotImplementedError(
            'Local experiment stop logic is not implemented.')

    project = config['cloud_project']
    zone = config['cloud_compute_zone']

    gce.initialize()
    running = list(gce.get_instances(project, zone))

    to_delete = []
    dispatcher = experiment_utils.get_dispatcher_instance_name(
        experiment_name)
    if dispatcher in running:
        to_delete.append(dispatcher)
    else:
        logger.warning('Dispatcher instance not running, skip.')

    # NOTE(review): this is a plain prefix match, so instances belonging to
    # another experiment whose name begins with |experiment_name| would be
    # selected as well -- confirm against the instance naming scheme.
    prefix = 'r-' + experiment_name
    for name in running:
        if name.startswith(prefix):
            to_delete.append(name)

    if not to_delete:
        logger.warning('No experiment instances found, no work to do.')
        return True

    if gcloud.delete_instances(to_delete, zone):
        logger.info('Successfully stopped experiment.')
        return True

    logger.error('Failed to stop experiment instances.')
    return False
def main():
    """Run schedule_measure_workers as a standalone script.

    Reads the experiment config from the path given as the first
    command-line argument, then calls schedule every 30 seconds forever.
    Useful for debugging.
    """
    log_extras = {
        'experiment': os.environ['EXPERIMENT'],
        'component': 'dispatcher',
        'subcomponent': 'scheduler'
    }
    logs.initialize(default_extras=log_extras)
    gce.initialize()

    config = yaml_utils.read(sys.argv[1])
    queue = initialize(config)

    # Never returns; the script is expected to be interrupted externally.
    while True:
        schedule(config, queue)
        time.sleep(30)
def schedule_loop(experiment_config: dict):
    """Continuously run the scheduler until there is nothing left to schedule.

    Note that this should not be called unless
    multiprocessing.set_start_method('spawn') was called first. Otherwise it
    will use fork to create the Pool which breaks logging.
    """
    logger.info('Starting scheduler.')
    experiment = experiment_config['experiment']
    num_trials = len(get_experiment_trials(experiment).all())

    local_experiment = experiment_utils.is_local_experiment()
    if not local_experiment:
        gce.initialize()
        # Only used for preemption handling, which never happens locally.
        trial_instance_manager = TrialInstanceManager(num_trials,
                                                      experiment_config)

    # The pool is created once and shared by every scheduling pass so that
    # workers are not leaked between iterations.
    with multiprocessing.Pool() as worker_pool:
        handling_preempted = False
        while not all_trials_ended(experiment):
            try:
                # Switch preemption handling on (once, and never when
                # running locally) as soon as no trial is pending anymore,
                # i.e. after every initial trial was started.
                if not (local_experiment or handling_preempted or
                        any_pending_trials(experiment)):
                    handling_preempted = True

                schedule(experiment_config, worker_pool)
                if handling_preempted:
                    trial_instance_manager.handle_preempted_trials()
            except Exception:  # pylint: disable=broad-except
                logger.error('Error occurred during scheduling.')

            # Reaching this point with trials still remaining means either
            # an unexpected exception was raised or some trials could not be
            # started yet (e.g. instance quota ran out). Wait before the
            # next attempt.
            time.sleep(FAIL_WAIT_SECONDS)

    logger.info('Finished scheduling.')