import logging
import os
import pickle
import tempfile
from pathlib import Path
from typing import Generator, List, Sequence

from hydra.core.utils import JobReturn
from omegaconf import OmegaConf

# RayAWSLauncher, JOB_RETURN_PICKLE, the ray_*/rsync/_get_abs_code_dir
# helpers, and the module-level test config (cluster_name, cur_py_version,
# ray_nodes_conf, temp_remote_dir, temp_remote_wheel_dir, ...) are defined
# in the hydra-ray-launcher plugin and its test utilities.

log = logging.getLogger(__name__)


def manage_cluster() -> Generator[None, None, None]:
    # first assert the SHA of requirements hasn't changed;
    # if it changed, we need to update the test AMI.

    # build all the wheels
    tmpdir = tempfile.mkdtemp()
    plugin_wheels = build_ray_launcher_wheel(tmpdir)
    core_wheel = build_core_wheel(tmpdir)
    connect_config = {
        "cluster_name": cluster_name,
        "provider": {
            "type": "aws",
            "region": "us-west-2",
            "availability_zone": "us-west-2a,us-west-2b",
            "cache_stopped_nodes": False,
            "key_pair": {"key_name": f"hydra_test_{cluster_name}"},
        },
        "auth": {"ssh_user": "******"},
        "setup_commands": [
            f"echo 'export PATH=\"$HOME/anaconda3/envs/hydra_{cur_py_version}/bin:$PATH\"' >> ~/.bashrc"
        ],
        "head_setup_commands": [],
        "head_node": ray_nodes_conf,
        "worker_nodes": ray_nodes_conf,
    }
    # Persist the cluster config to a temp YAML file the ray CLI can consume.
    # OmegaConf.save accepts a path directly, so no extra open() is needed.
    with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
        OmegaConf.save(config=connect_config, f=f.name, resolve=True)
        temp_yaml = f.name
    ray_up(temp_yaml)
    ray_new_dir(temp_yaml, temp_remote_dir, False)
    ray_new_dir(temp_yaml, temp_remote_wheel_dir, False)
    upload_and_install_wheels(tmpdir, temp_yaml, core_wheel, plugin_wheels)
    validate_lib_version(temp_yaml)
    yield
    # Teardown: bring the cluster down once the consuming tests finish.
    ray_down(temp_yaml)
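# A minimal usage sketch, not from the original source: everything before
# `yield` is setup (build wheels, write the cluster YAML, `ray up`, install
# wheels) and everything after is teardown (`ray down`), so the generator
# slots naturally into a pytest fixture. The pytest wiring, fixture name,
# and test body below are assumptions for illustration only.
import pytest


@pytest.fixture(scope="module")
def cluster() -> Generator[None, None, None]:
    # Delegate setup and teardown to the generator above; the cluster stays
    # up for all tests in this module and is torn down exactly once.
    yield from manage_cluster()


def test_cluster_smoke(cluster: None) -> None:
    # Hypothetical placeholder: real tests would launch sweeps against the
    # live cluster here.
    pass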
def launch_jobs(
    launcher: "RayAWSLauncher", local_tmp_dir: str, sweep_dir: Path
) -> Sequence[JobReturn]:
    ray_up(launcher.ray_yaml_path)
    with tempfile.TemporaryDirectory() as local_tmp_download_dir:
        with ray_tmp_dir(
            launcher.ray_yaml_path, launcher.docker_enabled
        ) as remote_tmp_dir:
            # Upload the pickled sweep inputs and the remote entry script.
            ray_rsync_up(
                launcher.ray_yaml_path, os.path.join(local_tmp_dir, ""), remote_tmp_dir
            )
            script_path = os.path.join(os.path.dirname(__file__), "_remote_invoke.py")
            ray_rsync_up(launcher.ray_yaml_path, script_path, remote_tmp_dir)

            # Optionally sync user code up to the cluster before running.
            if launcher.sync_up.source_dir:
                source_dir = _get_abs_code_dir(launcher.sync_up.source_dir)
                target_dir = (
                    launcher.sync_up.target_dir
                    if launcher.sync_up.target_dir
                    else remote_tmp_dir
                )
                rsync(
                    launcher.ray_yaml_path,
                    launcher.sync_up.include,
                    launcher.sync_up.exclude,
                    os.path.join(source_dir, ""),
                    target_dir,
                )

            # Run the jobs on the cluster, then fetch the pickled results.
            ray_exec(
                launcher.ray_yaml_path,
                launcher.docker_enabled,
                os.path.join(remote_tmp_dir, "_remote_invoke.py"),
                remote_tmp_dir,
            )
            ray_rsync_down(
                launcher.ray_yaml_path,
                os.path.join(remote_tmp_dir, JOB_RETURN_PICKLE),
                local_tmp_download_dir,
            )

        # Optionally sync job outputs back down to the local machine.
        sync_down_cfg = launcher.sync_down
        if (
            sync_down_cfg.target_dir
            or sync_down_cfg.source_dir
            or sync_down_cfg.include
            or sync_down_cfg.exclude
        ):
            source_dir = (
                sync_down_cfg.source_dir if sync_down_cfg.source_dir else sweep_dir
            )
            target_dir = (
                sync_down_cfg.target_dir if sync_down_cfg.target_dir else sweep_dir
            )
            target_dir = Path(_get_abs_code_dir(target_dir))
            target_dir.mkdir(parents=True, exist_ok=True)
            log.info(
                f"Syncing outputs from remote dir: {source_dir} "
                f"to local dir: {target_dir.absolute()}"
            )
            rsync(
                launcher.ray_yaml_path,
                launcher.sync_down.include,
                launcher.sync_down.exclude,
                str(source_dir),
                str(target_dir),
                up=False,
            )

        if launcher.stop_cluster:
            log.info("Stopping cluster now. (stop_cluster=true)")
            if launcher.ray_cfg.cluster.provider.cache_stopped_nodes:
                log.info("NOT deleting the cluster (provider.cache_stopped_nodes=true)")
            else:
                log.info("Deleting the cluster (provider.cache_stopped_nodes=false)")
            ray_down(launcher.ray_yaml_path)
        else:
            log.warning(
                "NOT stopping cluster, this may incur extra cost for you. (stop_cluster=false)"
            )

        with open(os.path.join(local_tmp_download_dir, JOB_RETURN_PICKLE), "rb") as f:
            job_returns = pickle.load(f)  # nosec
        assert isinstance(job_returns, List)
        for run in job_returns:
            assert isinstance(run, JobReturn)
        return job_returns
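# A minimal calling sketch, assuming the surrounding plugin wiring: Hydra
# constructs the RayAWSLauncher from config, the caller stages the pickled
# sweep under a local temp dir, and both are handed to launch_jobs. The
# function name `run_sweep_example`, the `my_launcher` parameter, and the
# `config.hydra.sweep.dir` lookup below are illustrative assumptions, not
# code taken from the plugin.
def run_sweep_example(my_launcher: "RayAWSLauncher") -> None:
    with tempfile.TemporaryDirectory() as local_tmp_dir:
        # ... pickle the sweep's job specs into local_tmp_dir here ...
        sweep_dir = Path(str(my_launcher.config.hydra.sweep.dir))
        returns = launch_jobs(my_launcher, local_tmp_dir, sweep_dir)
        for ret in returns:
            # Each JobReturn carries the return value and config produced
            # by the corresponding job on the cluster.
            log.info(f"job return value: {ret.return_value}")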