def spawn(
    image_id: str,
    total_count: int,
    instance_type: str,
    key_name: str,
    security_group_id: str,
):
    """Spawn new EC2 instances to reach a desired total.

    Args:
        image_id (str): Image ID.
        total_count (int): Desired total number of instances.
        instance_type (str): Type of the instance.
        key_name (str): Name of the key pair.
        security_group_id (str): Security group.
    """
    available = get_num_instances()
    if available < total_count:
        run(
            image_id=image_id,
            count=total_count - available,
            instance_type=instance_type,
            key_name=key_name,
            security_group_id=security_group_id,
        )
    else:
        out.out("Already enough instances available.")
def stop_running():
    """Stop all running EC2 instances."""
    instances = list(get_state("running"))
    if instances:
        stop(*instances)
    else:
        out.out("No running instances.")
def start_stopped():
    """Start all stopped EC2 instances."""
    instances = list(get_state("stopped"))
    if instances:
        start(*instances)
    else:
        out.out("No stopped instances.")
def terminate_all():
    """Terminate all EC2 instances."""
    instance_ids = [instance["InstanceId"] for instance in get_instances()]
    if instance_ids:
        execute_command(
            "aws",
            "ec2",
            "terminate-instances",
            "--instance-ids",
            *instance_ids,
        )
    else:
        out.out("No instances to terminate.")
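# The EC2 helpers above rely on an `execute_command` wrapper around the AWS
# CLI. The function below is a minimal sketch of the assumed interface (run a
# command, return its standard output); it is hypothetical and the real helper
# in this repo may behave differently.

import subprocess


def execute_command_sketch(*args: str) -> str:
    """Run a command and return its decoded standard output.

    Args:
        *args (str): Command and its arguments.

    Returns:
        str: Standard output of the command.
    """
    result = subprocess.run(args, check=True, stdout=subprocess.PIPE)
    return result.stdout.decode()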
def _minimise_l_bfgs_b(
    f,
    vs,
    f_calls=10000,
    iters=1000,
    trace=False,
    names=None,
    jit=False,
):
    names = _convert_and_validate_names(names)

    # Run function once to ensure that all variables are initialised and
    # available.
    val_init = f(vs)

    # SciPy doesn't perform zero iterations, so handle that edge case
    # manually.
    if iters == 0 or f_calls == 0:
        return B.to_numpy(val_init)

    # Extract initial value.
    x0 = B.to_numpy(vs.get_latent_vector(*names))

    # The optimiser expects to get `float64`s.
    def _convert(*xs):
        return [B.cast(np.float64, B.to_numpy(x)) for x in xs]

    # Wrap the function and get the list of function evaluations.
    f_vals, f_wrapped = wrap_f(vs, names, f, jit, _convert)

    # Perform optimisation routine.
    def perform_minimisation(callback_=lambda _: None):
        return fmin_l_bfgs_b(
            func=f_wrapped,
            x0=x0,
            maxiter=iters,
            maxfun=f_calls,
            callback=callback_,
            disp=0,
        )

    if trace:
        # Print progress during minimisation.
        with out.Progress(
            name='Minimisation of "{}"'.format(f.__name__),
            total=iters,
        ) as progress:

            def callback(_):
                progress({"Objective value": np.min(f_vals)})

            x_opt, val_opt, info = perform_minimisation(callback)

        with out.Section("Termination message"):
            out.out(convert(info["task"], str))
    else:
        # Don't print progress; simply perform minimisation.
        x_opt, val_opt, info = perform_minimisation()

    vs.set_latent_vector(x_opt, *names)  # Assign optimum.

    return val_opt  # Return optimal value.
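# A minimal usage sketch for the minimiser above, assuming that `vs` is a
# `varz.Vars` variable container, which provides `get_latent_vector` and
# `set_latent_vector`. The objective below is purely illustrative.

import numpy as np
from varz import Vars


def objective(vs):
    x = vs.get(init=5.0, name="x")
    return (x - 1) ** 2


vs = Vars(np.float64)
val_opt = _minimise_l_bfgs_b(objective, vs, trace=True)  # Converges to `0`.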
def print_logs(path: str):
    """Display the tail of the logs on all running instances.

    Args:
        path (str): Path to the log.
    """
    for ip, log in ssh_map([f"tail -n100 {path}"], broadcast=True).items():
        with out.Section(ip):
            out.out(log)
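# `print_logs` assumes an `ssh_map` helper which runs the given commands over
# SSH and, with `broadcast=True`, targets every running instance and returns
# a dictionary mapping IPs to outputs. The sketch below is hypothetical; the
# real `ssh_map` in this repo supports many more options (experiments,
# monitors, per-instance command lists).


def ssh_map_sketch(commands, broadcast=True):
    results = {}
    for ip in get_running_ips():
        # Assumption: the AMI's default user is `ubuntu`.
        results[ip] = execute_command(
            "ssh",
            "-o",
            "StrictHostKeyChecking=no",
            f"ubuntu@{ip}",
            " && ".join(commands),
        )
    return results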
def test_time_report_calculation(monkeypatch):
    monkeypatch.setattr(out, "report_time", True)

    # Test that time is correctly calculated.
    with Mock() as mock:
        out._time_start = time.time() - 2 * 60 * 60 - 2 * 60 - 2
        out.out("a")

    assert len(mock) == 1
    assert mock[0] == "02:02:02 | a\n"
def test_out_newlines():
    # Test that newlines are correctly indented.
    with Mock() as mock:
        out.out("a\nb")
        with out.Section():
            out.out("c\nd")

    assert len(mock) == 2
    assert mock[0] == "a\nb\n"
    assert mock[1] == "    c\n    d\n"
def minimise_l_bfgs_b(f, vs, f_calls=10000, iters=1000, trace=False, names=None):
    names = [] if names is None else names

    # Run function once to ensure that all variables are initialised and
    # available.
    val_init = f(vs)

    # SciPy doesn't perform zero iterations, so handle that edge case
    # manually.
    if iters == 0 or f_calls == 0:
        return B.to_numpy(val_init)

    # Extract initial value.
    x0 = B.to_numpy(vs.get_vector(*names))

    # Wrap the function and get the list of function evaluations.
    f_vals, f_wrapped = wrap_f(vs, names, f)

    # Perform optimisation routine.
    def perform_minimisation(callback_=lambda _: None):
        return fmin_l_bfgs_b(
            func=f_wrapped,
            x0=x0,
            maxiter=iters,
            maxfun=f_calls,
            callback=callback_,
            disp=0,
        )

    if trace:
        # Print progress during minimisation.
        with out.Progress(
            name='Minimisation of "{}"'.format(f.__name__),
            total=iters,
        ) as progress:

            def callback(_):
                progress({"Objective value": np.min(f_vals)})

            x_opt, val_opt, info = perform_minimisation(callback)

        with out.Section("Termination message"):
            out.out(info["task"].decode("utf-8"))
    else:
        # Don't print progress; simply perform minimisation.
        x_opt, val_opt, info = perform_minimisation()

    vs.set_vector(x_opt, *names)  # Assign optimum.

    return val_opt  # Return optimal value.
def exception(x, e):
    """Handle an exception raised during function evaluation by printing a
    warning and returning NaN for the function value and the gradient.

    Args:
        x (tensor): Current input.
        e (:class:`Exception`): Caught exception.

    Returns:
        tuple: Tuple containing NaN for the function value and NaNs for the
            gradient.
    """
    with out.Section("Caught exception during function evaluation"):
        out.out(traceback.format_exc().strip())
    grad_nan = np.empty(x.shape)
    grad_nan[:] = np.nan
    return np.nan, grad_nan
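# A sketch of how `exception` is meant to be used: the objective wrapper
# handed to SciPy catches any error during evaluation and substitutes NaNs,
# so that a failed evaluation does not crash the whole optimisation routine.
# This is illustrative only; the actual `wrap_f` also handles conversions and
# records function values.


def wrap_objective_sketch(f_value_and_grad):
    def f_wrapped(x):
        try:
            return f_value_and_grad(x)
        except Exception as e:
            return exception(x, e)

    return f_wrapped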
def test_section():
    with Mock() as mock:
        out.out("before")

        with out.Section():
            out.out("message1")

        with out.Section("name"):
            out.out("message2")

            with out.Section():
                out.out("message3")

        out.out("after")

    assert len(mock) == 6
    assert mock[0] == "before\n"
    assert mock[1] == "    message1\n"
    assert mock[2] == "name:\n"
    assert mock[3] == "    message2\n"
    assert mock[4] == "        message3\n"
    assert mock[5] == "after\n"
def test_time_report_interval(monkeypatch):
    monkeypatch.setattr(out, "report_time", True)

    # Test that the time stamp is not repeated unnecessarily.
    with Mock() as mock:
        out.out("a")
        out.out("b")
        time.sleep(1.0)
        out.out("c")

    assert len(mock) == 3
    assert mock[0] == "00:00:00 | a\n"
    assert mock[1] == "         | b\n"
    assert mock[2] == "00:00:01 | c\n"
def test_out():
    with Mock() as mock:
        out.out("message")

    assert len(mock) == 1
    assert str(mock) == "message\n"
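# The tests above assume a `Mock` context manager that captures everything
# `out` prints. The class below is a hypothetical minimal version; it assumes
# that `out` writes to a swappable `out.stream` attribute, which may not
# match the real implementation.


class MockSketch:
    """Record all writes; supports `len`, indexing, and `str`."""

    def __init__(self):
        self.entries = []

    def write(self, text):
        self.entries.append(text)

    def __enter__(self):
        self._saved = out.stream  # Assumption: `out` exposes `out.stream`.
        out.stream = self
        return self

    def __exit__(self, exc_type, exc_value, tb):
        out.stream = self._saved

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, i):
        return self.entries[i]

    def __str__(self):
        return "".join(self.entries)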
def manage_cluster(
    commands: List[List[str]],
    instance_type: str,
    key_name: str,
    security_group_id: str,
    image_id: str,
    sync_sources: List[str],
    sync_target: Path,
    monitor_aws_repo: str,
    monitor_call: str,
    monitor_delay: int,
):
    """Manage the cluster.

    Args:
        commands (list[list[str]]): One list of commands for every experiment.
        instance_type (str): Type of the instance.
        key_name (str): Name of the key pair.
        security_group_id (str): Security group.
        image_id (str): Image ID.
        sync_sources (list[str]): List of sources to sync.
        sync_target (:class:`.util.Path`): Directory to sync to.
        monitor_aws_repo (str): Path to the root of this repo. The repo must
            contain the virtual environment "venv" which has the repo
            installed in editable mode.
        monitor_call (str): Call to start the monitor. See :mod:`.monitor`.
        monitor_delay (int): Number of seconds to wait before starting the
            monitor.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--spawn",
        type=int,
        help="Spawn instances.",
    )
    parser.add_argument(
        "--start",
        action="store_true",
        help="Start experiments.",
    )
    parser.add_argument(
        "--terminate",
        action="store_true",
        help="Terminate all instances. This is a kill switch.",
    )
    parser.add_argument(
        "--kill",
        action="store_true",
        help="Kill all running experiments, but keep the instances running.",
    )
    parser.add_argument(
        "--stop",
        action="store_true",
        help="Stop all running instances.",
    )
    parser.add_argument(
        "--sync-stopped",
        action="store_true",
        help="Synchronise all stopped instances.",
    )
    parser.add_argument(
        "--sync-sleep",
        default=120,
        type=int,
        help="Number of seconds to sleep before syncing again.",
    )
    args = parser.parse_args()

    if args.sync_stopped:
        with out.Section("Syncing all stopped instances in five batches"):
            for batch in np.array_split(get_state("stopped"), 5):
                # Batches can be empty.
                if len(batch) == 0:
                    continue

                # Start the instances.
                start(*batch)

                try:
                    # Wait for the instances to have booted.
                    out.out("Waiting a minute for the instances to have booted...")
                    time.sleep(60)

                    # Refresh the instances to get the IPs.
                    instance_ids = [instance["InstanceId"] for instance in batch]
                    batch = get_instances(*instance_ids)

                    # Sync.
                    sync(
                        sync_sources,
                        sync_target,
                        ips=[instance["PublicIpAddress"] for instance in batch],
                    )
                finally:
                    # Stop the instances again.
                    stop(*batch)

        out.out("Syncing completed: not continuing execution of script.")
        exit()

    if args.spawn:
        with out.Section("Starting all stopped instances"):
            start_stopped()

        with out.Section("Spawning instances"):
            spawn(
                image_id=image_id,
                total_count=args.spawn,
                instance_type=instance_type,
                key_name=key_name,
                security_group_id=security_group_id,
            )

        while not check_all_running():
            out.out("Waiting for all instances to be running...")
            time.sleep(5)

        out.out("Waiting a minute for all instances to have booted...")
        time.sleep(60)

    if args.kill:
        with out.Section("Killing all experiments"):
            kill_all()

    if args.stop:
        with out.Section("Stopping all instances"):
            stop_running()

    if args.terminate:
        with out.Section("Terminating all instances"):
            terminate_all()

    if args.start:
        num_instances = len(get_running_ips())
        pieces = np.array_split(commands, num_instances)
        # Ensure that we have regular Python lists.
        pieces = [piece.tolist() for piece in pieces]

        with out.Section("Starting experiments"):
            out.kv("Number of commands", len(commands))
            out.kv("Number of instances", num_instances)
            out.kv("Maximum runs per instance", max([len(piece) for piece in pieces]))

            ssh_map(
                *[
                    [
                        *config["setup_commands"],
                        *sum(piece, []),
                        *config["teardown_commands"],
                    ]
                    for piece in pieces
                ],
                start_experiment=True,
                in_experiment=True,
                start_monitor=True,
                monitor_aws_repo=monitor_aws_repo,
                monitor_delay=monitor_delay,
                monitor_call=monitor_call,
            )

    while True:
        out.kv("Instances still running", len(get_running_ips()))
        sync(sync_sources, sync_target)
        out.out(f"Sleeping for {args.sync_sleep} second(s)...")
        time.sleep(args.sync_sleep)
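# A hypothetical invocation of `manage_cluster`: an experiment script builds
# one list of shell commands per experiment and then lets `manage_cluster`
# act on the command-line flags it parses itself. All values below are
# placeholders, and `Path` refers to the repo's own `.util.Path`.

if __name__ == "__main__":
    manage_cluster(
        commands=[[f"python train.py --seed {seed}"] for seed in range(10)],
        instance_type="p3.2xlarge",
        key_name="my-key-pair",
        security_group_id="sg-xxxxxxxx",
        image_id="ami-xxxxxxxx",
        sync_sources=["~/results"],
        sync_target=Path("results"),
        monitor_aws_repo="~/this-repo",
        monitor_call="python -m monitor",
        monitor_delay=120,
    )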