Beispiel #1
0
def manage_cluster() -> Generator[None, None, None]:
    """Bring up a temporary Ray cluster on AWS, yield, then tear it down.

    Setup builds the launcher-plugin and core wheels into a scratch
    directory, writes the cluster config to a temporary YAML file, starts
    the cluster with ``ray_up``, creates the remote directories, uploads and
    installs the wheels and validates the installed library versions.

    Yields:
        None. Control is handed to the consumer while the cluster is up.

    Once ``ray_up`` has returned, ``ray_down`` is always invoked, even if a
    later setup step fails or an exception is raised at the ``yield`` point,
    so a failing run does not leave the cluster running. The temporary YAML
    file and the wheel scratch directory are removed on exit as well.
    """
    # Imported locally so this function does not rely on module-level imports.
    import os
    import shutil

    # NOTE: the original intent was to first assert that the SHA of the
    # requirements has not changed (a change means the test AMI must be
    # updated). No such check is implemented here.

    # build all the wheels
    tmpdir = tempfile.mkdtemp()
    temp_yaml = None
    try:
        plugin_wheels = build_ray_launcher_wheel(tmpdir)
        core_wheel = build_core_wheel(tmpdir)
        connect_config = {
            "cluster_name": cluster_name,
            "provider": {
                "type": "aws",
                "region": "us-west-2",
                "availability_zone": "us-west-2a,us-west-2b",
                "cache_stopped_nodes": False,
                "key_pair": {"key_name": f"hydra_test_{cluster_name}"},
            },
            "auth": {"ssh_user": "******"},
            "setup_commands": [
                f"echo 'export PATH=\"$HOME/anaconda3/envs/hydra_{cur_py_version}/bin:$PATH\"' >> ~/.bashrc"
            ],
            "head_setup_commands": [],
            "head_node": ray_nodes_conf,
            "worker_nodes": ray_nodes_conf,
        }

        # Reserve a unique file name, then close the handle before writing so
        # the file is never open twice at the same time.
        with tempfile.NamedTemporaryFile(suffix=".yaml", delete=False) as f:
            temp_yaml = f.name
        OmegaConf.save(config=connect_config, f=temp_yaml, resolve=True)

        ray_up(temp_yaml)
        try:
            ray_new_dir(temp_yaml, temp_remote_dir, False)
            ray_new_dir(temp_yaml, temp_remote_wheel_dir, False)
            upload_and_install_wheels(tmpdir, temp_yaml, core_wheel, plugin_wheels)
            validate_lib_version(temp_yaml)
            yield
        finally:
            # Always tear the cluster down once it has been started.
            ray_down(temp_yaml)
    finally:
        # Clean up local scratch artifacts created by this function.
        if temp_yaml is not None and os.path.exists(temp_yaml):
            os.remove(temp_yaml)
        shutil.rmtree(tmpdir, ignore_errors=True)
Beispiel #2
0
def launch_jobs(launcher: RayAWSLauncher, local_tmp_dir: str,
                sweep_dir: Path) -> Sequence[JobReturn]:
    """Run the prepared jobs on the Ray cluster and return their results.

    Steps, in order:
      1. Start the cluster described by ``launcher.ray_yaml_path``.
      2. Upload the contents of ``local_tmp_dir`` and ``_remote_invoke.py``
         to a remote temporary directory.
      3. Optionally sync user code up (``launcher.sync_up``).
      4. Execute ``_remote_invoke.py`` remotely.
      5. Download the pickled job returns, and optionally sync outputs down
         (``launcher.sync_down``).
      6. Stop the cluster if ``launcher.stop_cluster`` is set.
      7. Load and return the downloaded job returns.

    Args:
        launcher: Launcher providing the Ray YAML path, docker flag,
            sync-up / sync-down settings and cluster stop policy.
        local_tmp_dir: Local directory whose contents are uploaded to the
            remote temporary directory.
        sweep_dir: Fallback used for both the remote source and the local
            target of the sync-down step when not configured explicitly.

    Returns:
        The list of ``JobReturn`` objects unpickled from the downloaded
        ``JOB_RETURN_PICKLE`` file.

    Raises:
        AssertionError: If the unpickled payload is not a list of
            ``JobReturn`` instances.
    """
    ray_up(launcher.ray_yaml_path)
    with tempfile.TemporaryDirectory() as local_tmp_download_dir:

        with ray_tmp_dir(launcher.ray_yaml_path,
                         launcher.docker_enabled) as remote_tmp_dir:

            # The trailing separator added by join(..., "") makes the source
            # path end in "/" (presumably so the directory *contents* are
            # synced rather than the directory itself -- rsync convention).
            ray_rsync_up(launcher.ray_yaml_path,
                         os.path.join(local_tmp_dir, ""), remote_tmp_dir)

            script_path = os.path.join(os.path.dirname(__file__),
                                       "_remote_invoke.py")
            ray_rsync_up(launcher.ray_yaml_path, script_path, remote_tmp_dir)

            if launcher.sync_up.source_dir:
                source_dir = _get_abs_code_dir(launcher.sync_up.source_dir)
                target_dir = (launcher.sync_up.target_dir if
                              launcher.sync_up.target_dir else remote_tmp_dir)
                rsync(
                    launcher.ray_yaml_path,
                    launcher.sync_up.include,
                    launcher.sync_up.exclude,
                    os.path.join(source_dir, ""),
                    target_dir,
                )

            ray_exec(
                launcher.ray_yaml_path,
                launcher.docker_enabled,
                os.path.join(remote_tmp_dir, "_remote_invoke.py"),
                remote_tmp_dir,
            )

            ray_rsync_down(
                launcher.ray_yaml_path,
                os.path.join(remote_tmp_dir, JOB_RETURN_PICKLE),
                local_tmp_download_dir,
            )

            sync_down_cfg = launcher.sync_down

            if (sync_down_cfg.target_dir or sync_down_cfg.source_dir
                    or sync_down_cfg.include or sync_down_cfg.exclude):
                source_dir = (sync_down_cfg.source_dir
                              if sync_down_cfg.source_dir else sweep_dir)
                # Fix: the local target must come from ``target_dir``; it was
                # previously derived from ``source_dir`` (copy-paste error),
                # so a configured ``sync_down.target_dir`` was ignored.
                target_dir = (sync_down_cfg.target_dir
                              if sync_down_cfg.target_dir else sweep_dir)
                target_dir = Path(_get_abs_code_dir(target_dir))
                target_dir.mkdir(parents=True, exist_ok=True)

                rsync(
                    launcher.ray_yaml_path,
                    launcher.sync_down.include,
                    launcher.sync_down.exclude,
                    # Single-argument join converts a Path (e.g. sweep_dir)
                    # to a plain string and leaves a string unchanged.
                    os.path.join(source_dir),
                    str(target_dir),
                    up=False,
                )
                log.info(
                    f"Syncing outputs from remote dir: {source_dir} to local dir: {target_dir.absolute()} "
                )

        if launcher.stop_cluster:
            log.info("Stopping cluster now. (stop_cluster=true)")
            if launcher.ray_cfg.cluster.provider.cache_stopped_nodes:
                log.info(
                    "NOT deleting the cluster (provider.cache_stopped_nodes=true)"
                )
            else:
                log.info(
                    "Deleted the cluster (provider.cache_stopped_nodes=false)")
            ray_down(launcher.ray_yaml_path)
        else:
            log.warning(
                "NOT stopping cluster, this may incur extra cost for you. (stop_cluster=false)"
            )

        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # payload. The file is produced by our own remote invocation and
        # downloaded from the cluster; it must never come from an untrusted
        # source.
        with open(os.path.join(local_tmp_download_dir, JOB_RETURN_PICKLE),
                  "rb") as f:
            job_returns = pickle.load(f)  # nosec
            assert isinstance(job_returns, List)
            for run in job_returns:
                assert isinstance(run, JobReturn)
            return job_returns