Example #1
def install_matching_ray_locally(ray_wheels: Optional[str]):
    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return
    assert "manylinux2014_x86_64" in ray_wheels, ray_wheels
    if sys.platform == "darwin":
        platform = "macosx_10_15_intel"
    elif sys.platform == "win32":
        platform = "win_amd64"
    else:
        platform = "manylinux2014_x86_64"
    ray_wheels = ray_wheels.replace("manylinux2014_x86_64", platform)
    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")
    subprocess.check_output("pip uninstall -y ray",
                            shell=True,
                            env=os.environ,
                            text=True)
    subprocess.check_output(
        f"pip install -U {shlex.quote(ray_wheels)}",
        shell=True,
        env=os.environ,
        text=True,
    )
    for module_name in RELOAD_MODULES:
        if module_name in sys.modules:
            importlib.reload(sys.modules[module_name])
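A standalone sketch of the platform-tag rewrite this helper performs; the wheel URL below is invented purely for illustration:

import sys

# Map the local platform to the matching wheel tag (same mapping as above) and
# rewrite a made-up manylinux wheel URL accordingly.
ray_wheels = ("https://example-bucket.s3.amazonaws.com/"
              "ray-3.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl")
platform = {"darwin": "macosx_10_15_intel", "win32": "win_amd64"}.get(
    sys.platform, "manylinux2014_x86_64")
print(ray_wheels.replace("manylinux2014_x86_64", platform))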
Example #2
    def _push_local_dir(self):
        remote_upload_to = self._generate_tmp_s3_path()
        # pack local dir
        fd, local_path = tempfile.mkstemp()
        os.close(fd)
        shutil.make_archive(local_path, "gztar", os.getcwd())
        # local source -> s3
        self._run_with_retry(lambda: self.s3_client.upload_file(
            Filename=local_path + ".tar.gz",
            Bucket=self.bucket,
            Key=remote_upload_to,
        ))
        # remove the local archive and the temporary base file
        os.unlink(local_path + ".tar.gz")
        os.unlink(local_path)

        bucket_address = f"s3://{self.bucket}/{remote_upload_to}"
        # s3 -> remote target
        retcode, _ = self.job_manager.run_and_wait(
            f"pip install -q awscli && "
            f"aws s3 cp {bucket_address} archive.tar.gz && "
            f"tar xf archive.tar.gz ",
            {},
        )
        if retcode != 0:
            raise FileUploadError(f"Error uploading local dir to session "
                                  f"{self.cluster_manager.cluster_name}.")
        try:
            self._run_with_retry(
                lambda: self.s3_client.delete_object(Bucket=self.bucket,
                                                     Key=remote_upload_to),
                initial_retry_delay_s=2,
            )
        except Exception as e:
            logger.warning(f"Could not remove temporary S3 object: {e}")
Example #3
def maybe_rewrite_wheels_url(ray_wheels_url: str,
                             python_version: Tuple[int, int]) -> str:
    full_url = resolve_url(ray_wheels_url)

    # If the version matches, just return the full URL
    if is_wheels_url_matching_ray_verison(ray_wheels_url=full_url,
                                          python_version=python_version):
        return full_url

    # Try to parse the version from the filename / URL
    parsed_ray_version, parsed_python_version = parse_wheels_filename(full_url)
    if not parsed_ray_version or not parsed_python_version:
        # If we can't parse, we don't know the version, so we raise a warning
        logger.warning(
            f"The passed Ray wheels URL may not work with the python version "
            f"used in this test! Got python version {python_version} and "
            f"wheels URL: {ray_wheels_url}.")
        return full_url

    # If the parsed python version differs from the requested one,
    # try to rewrite the URL
    current_filename = get_wheels_filename(parsed_ray_version,
                                           parsed_python_version)
    rewritten_filename = get_wheels_filename(parsed_ray_version,
                                             python_version)

    new_url = full_url.replace(current_filename, rewritten_filename)
    if new_url != full_url:
        logger.warning(
            f"The passed Ray wheels URL were for a different python version than "
            f"used in this test! Found python version {parsed_python_version} "
            f"but expected {python_version}. The wheels URL was re-written to "
            f"{new_url}.")

    return new_url
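A standalone sketch of the rewrite idea, assuming a simple cp{major}{minor} tag scheme; the real parsing and formatting live in parse_wheels_filename and get_wheels_filename, which are not shown here, and the URL is made up:

from typing import Tuple

def _wheels_filename(ray_version: str, python_version: Tuple[int, int]) -> str:
    # Hypothetical wheel filename layout for illustration only.
    major, minor = python_version
    return (f"ray-{ray_version}-cp{major}{minor}-cp{major}{minor}"
            f"-manylinux2014_x86_64.whl")

url = "https://example.com/ray-2.0.0-cp37-cp37-manylinux2014_x86_64.whl"
print(url.replace(_wheels_filename("2.0.0", (3, 7)),
                  _wheels_filename("2.0.0", (3, 8))))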
Example #4
    def create_cluster_env(self, _repeat: bool = True):
        assert self.cluster_env_id is None

        if self.cluster_env:
            assert self.cluster_env_name

            logger.info(
                f"Test uses a cluster env with name "
                f"{self.cluster_env_name}. Looking up existing "
                f"cluster envs with this name."
            )

            paging_token = None
            while not self.cluster_env_id:
                result = self.sdk.search_cluster_environments(
                    dict(
                        project_id=self.project_id,
                        name=dict(equals=self.cluster_env_name),
                        paging=dict(count=50, token=paging_token),
                    )
                )
                paging_token = result.metadata.next_paging_token

                for res in result.results:
                    if res.name == self.cluster_env_name:
                        self.cluster_env_id = res.id
                        logger.info(
                            f"Cluster env already exists with ID "
                            f"{self.cluster_env_id}"
                        )
                        break

                if not paging_token or self.cluster_env_id:
                    break

            if not self.cluster_env_id:
                logger.info("Cluster env not found. Creating new one.")
                try:
                    result = self.sdk.create_cluster_environment(
                        dict(
                            name=self.cluster_env_name,
                            project_id=self.project_id,
                            config_json=self.cluster_env,
                        )
                    )
                    self.cluster_env_id = result.result.id
                except Exception as e:
                    if _repeat:
                        logger.warning(
                            f"Got exception when trying to create cluster "
                            f"env: {e}. Sleeping for 10 seconds and then "
                            f"try again once..."
                        )
                        time.sleep(10)
                        return self.create_cluster_env(_repeat=False)

                    raise ClusterEnvCreateError("Could not create cluster env.") from e

                logger.info(f"Cluster env created with ID {self.cluster_env_id}")
Example #5
    def create_cluster_compute(self, _repeat: bool = True):
        assert self.cluster_compute_id is None

        if self.cluster_compute:
            assert self.cluster_compute_name

            logger.info(f"Tests uses compute template "
                        f"with name {self.cluster_compute_name}. "
                        f"Looking up existing cluster computes.")

            paging_token = None
            while not self.cluster_compute_id:
                result = self.sdk.search_cluster_computes(
                    dict(
                        project_id=self.project_id,
                        name=dict(equals=self.cluster_compute_name),
                        include_anonymous=True,
                        paging=dict(token=paging_token),
                    ))
                paging_token = result.metadata.next_paging_token

                for res in result.results:
                    if res.name == self.cluster_compute_name:
                        self.cluster_compute_id = res.id
                        logger.info(f"Cluster compute already exists "
                                    f"with ID {self.cluster_compute_id}")
                        break

                if not paging_token:
                    break

            if not self.cluster_compute_id:
                logger.info(f"Cluster compute not found. "
                            f"Creating with name {self.cluster_compute_name}.")
                try:
                    result = self.sdk.create_cluster_compute(
                        dict(
                            name=self.cluster_compute_name,
                            project_id=self.project_id,
                            config=self.cluster_compute,
                        ))
                    self.cluster_compute_id = result.result.id
                except Exception as e:
                    if _repeat:
                        logger.warning(
                            f"Got exception when trying to create cluster "
                            f"compute: {e}. Sleeping for 10 seconds and then "
                            f"try again once...")
                        time.sleep(10)
                        return self.create_cluster_compute(_repeat=False)

                    raise ClusterComputeCreateError(
                        "Could not create cluster compute") from e

                logger.info(f"Cluster compute template created with "
                            f"name {self.cluster_compute_name} and "
                            f"ID {self.cluster_compute_id}")
Example #6
def get_buildkite_prompt_value(key: str) -> Optional[str]:
    try:
        value = subprocess.check_output(
            ["buildkite-agent", "meta-data", "get", key], text=True)
    except Exception as e:
        logger.warning(f"Could not fetch metadata for {key}: {e}")
        return None
    logger.debug(f"Got Buildkite prompt value for {key}: {value}")
    return value
Example #7
def as_smoke_test(test: Test) -> Test:
    if "smoke_test" not in test:
        logger.warning(
            f"Requested smoke test, but test with name {test['name']} does "
            f"not have any smoke test configuration.")
        return test

    smoke_test_config = test.pop("smoke_test")
    new_test = deep_update(test, smoke_test_config)
    return new_test
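A minimal sketch of the nested update assumed here (deep_update itself comes from the release tooling and is not shown): smoke test settings override the base test config key by key, recursing into nested dicts.

def _deep_update(base: dict, update: dict) -> dict:
    # Recursively merge `update` into `base`, overriding scalar values.
    for key, value in update.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            _deep_update(base[key], value)
        else:
            base[key] = value
    return base

# Made-up test definition for illustration.
test = {
    "name": "example_test",
    "run": {"timeout": 3600, "script": "run.py"},
    "smoke_test": {"run": {"timeout": 600}},
}
smoke_test_config = test.pop("smoke_test")
print(_deep_update(test, smoke_test_config))  # run.timeout becomes 600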
Example #8
    def build_configs(self, timeout: float = 30.0):
        try:
            self.create_cluster_compute()
        except AssertionError as e:
            # If already exists, ignore
            logger.warning(str(e))
        except ClusterComputeCreateError as e:
            raise e
        except Exception as e:
            raise ClusterComputeCreateError(
                f"Unexpected cluster compute build error: {e}") from e

        try:
            self.create_cluster_env()
        except AssertionError as e:
            # If already exists, ignore
            logger.warning(str(e))
        except ClusterEnvCreateError as e:
            raise e
        except Exception as e:
            raise ClusterEnvCreateError(
                f"Unexpected cluster env create error: {e}") from e

        try:
            self.build_cluster_env(timeout=timeout)
        except AssertionError as e:
            # If already exists, ignore
            logger.warning(str(e))
        except (ClusterEnvBuildError, ClusterEnvBuildTimeout) as e:
            raise e
        except Exception as e:
            raise ClusterEnvBuildError(
                f"Unexpected cluster env build error: {e}") from e
Example #9
def get_concurrency_group(test: Test) -> Tuple[str, int]:
    try:
        test_cpus, test_gpus = get_test_resources(test)
    except Exception as e:
        logger.warning(
            f"Couldn't get test resources for test {test['name']}: {e}")
        return "small", CONCURRENY_GROUPS["small"]

    for condition in gpu_cpu_to_concurrency_groups:
        min_gpu = parse_condition(condition.min_gpu, float("-inf"))
        max_gpu = parse_condition(condition.max_gpu, float("inf"))
        min_cpu = parse_condition(condition.min_cpu, float("-inf"))
        max_cpu = parse_condition(condition.max_cpu, float("inf"))

        if min_cpu <= test_cpus <= max_cpu and min_gpu <= test_gpus <= max_gpu:
            group = condition.group
            return group, CONCURRENY_GROUPS[group]

    # Return default
    logger.warning(f"Could not find concurrency group for test {test['name']} "
                   f"based on used resources.")
    return "small", CONCURRENY_GROUPS["small"]
Example #10
def install_matching_ray(ray_wheels: Optional[str]):
    if not ray_wheels:
        logger.warning(
            "No Ray wheels found - can't install matching Ray wheels locally!")
        return
    assert "manylinux2014_x86_64" in ray_wheels, ray_wheels
    if sys.platform == "darwin":
        platform = "macosx_10_15_intel"
    elif sys.platform == "win32":
        platform = "win_amd64"
    else:
        platform = "manylinux2014_x86_64"
    ray_wheels = ray_wheels.replace("manylinux2014_x86_64", platform)
    logger.info(f"Installing matching Ray wheels locally: {ray_wheels}")
    subprocess.check_output("pip uninstall -y ray",
                            shell=True,
                            env=os.environ,
                            text=True)
    subprocess.check_output(f"pip install -U {ray_wheels}",
                            shell=True,
                            env=os.environ,
                            text=True)
Example #11
    def upload(self, source: Optional[str] = None, target: Optional[str] = None):
        if source is None and target is None:
            self._push_local_dir()
            return

        assert isinstance(source, str)
        assert isinstance(target, str)

        remote_upload_to = self._generate_tmp_s3_path()

        # local source -> s3
        self._run_with_retry(
            lambda: self.s3_client.upload_file(
                Filename=source,
                Bucket=self.bucket,
                Key=remote_upload_to,
            )
        )

        # s3 -> remote target
        bucket_address = f"s3://{self.bucket}/{remote_upload_to}"
        retcode, _ = self.job_manager.run_and_wait(
            "pip install -q awscli && " f"aws s3 cp {bucket_address} {target}",
            {},
        )

        if retcode != 0:
            raise FileUploadError(f"Error uploading file {source} to {target}")

        try:
            self._run_with_retry(
                lambda: self.s3_client.delete_object(
                    Bucket=self.bucket, Key=remote_upload_to
                ),
                initial_retry_delay_s=2,
            )
        except Exception as e:
            logger.warning(f"Could not remove temporary S3 object: {e}")
Example #12
    def run_command(
        self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
    ) -> float:
        logger.info(
            f"Running command using Ray client on cluster "
            f"{self.cluster_manager.cluster_name}: {command}"
        )

        env = env or {}
        full_env = self.get_full_command_env(
            {
                **os.environ,
                **env,
                "RAY_ADDRESS": self.cluster_manager.get_cluster_address(),
                "RAY_JOB_NAME": "test_job",
                "PYTHONUNBUFFERED": "1",
            }
        )

        kill_event = threading.Event()

        def _kill_after(
            proc: subprocess.Popen,
            timeout: int = 30,
            kill_event: Optional[threading.Event] = None,
        ):
            timeout_at = time.monotonic() + timeout
            while time.monotonic() < timeout_at:
                if proc.poll() is not None:
                    return
                time.sleep(1)
            logger.info(
                f"Client command timed out after {timeout} seconds, "
                f"killing subprocess."
            )
            if kill_event:
                kill_event.set()
            proc.terminate()

        start_time = time.monotonic()
        proc = subprocess.Popen(
            command,
            env=full_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
            text=True,
        )

        kill_thread = threading.Thread(
            target=_kill_after, args=(proc, timeout, kill_event)
        )
        kill_thread.start()

        proc.stdout.reconfigure(line_buffering=True)
        sys.stdout.reconfigure(line_buffering=True)
        logs = deque(maxlen=LAST_LOGS_LENGTH)
        for line in proc.stdout:
            logs.append(line)
            sys.stdout.write(line)
        proc.wait()
        sys.stdout.reconfigure(line_buffering=False)
        time_taken = time.monotonic() - start_time
        self.last_logs = "\n".join(logs)

        return_code = proc.poll()
        if return_code == -15 or return_code == 15 or kill_event.is_set():
            # Process has been terminated
            raise CommandTimeout(f"Cluster command timed out after {timeout} seconds.")
        if return_code != 0:
            raise CommandError(f"Command returned non-success status: {return_code}")

        logger.warning(f"WE GOT RETURN CODE {return_code} AFTER {time_taken}")

        return time_taken
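A standalone sketch of the watchdog pattern used above: a helper thread terminates the subprocess once the deadline passes. The command and timeout are made up, and "sleep" assumes a POSIX system.

import subprocess
import threading
import time

def _kill_after(proc: subprocess.Popen, timeout: float):
    # Poll until the deadline, then terminate the process if still running.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if proc.poll() is not None:
            return
        time.sleep(0.1)
    proc.terminate()

proc = subprocess.Popen(["sleep", "10"])
threading.Thread(target=_kill_after, args=(proc, 1.0)).start()
proc.wait()
print(proc.returncode)  # -15 (SIGTERM) on POSIX when the watchdog fires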
Example #13
def main(test_collection_file: Optional[str] = None):
    settings = get_pipeline_settings()

    repo = settings["ray_test_repo"]
    branch = settings["ray_test_branch"]
    tmpdir = None

    env = {}
    if repo:
        # If the Ray test repo is set, we clone that repo to fetch
        # the test configuration file. Otherwise we might be missing newly
        # added tests.
        tmpdir = tempfile.mkdtemp()

        clone_cmd = f"git clone --depth 1 --branch {branch} {repo} {tmpdir}"
        try:
            subprocess.check_output(clone_cmd, shell=True)
        except Exception as e:
            raise ReleaseTestCLIError(f"Could not clone test repository "
                                      f"{repo} (branch {branch}): {e}") from e
        test_collection_file = os.path.join(tmpdir, "release",
                                            "release_tests.yaml")
        env = {
            "RAY_TEST_REPO": repo,
            "RAY_TEST_BRANCH": branch,
        }
    else:
        test_collection_file = test_collection_file or os.path.join(
            os.path.dirname(__file__), "..", "..", "release_tests.yaml")
    test_collection = read_and_validate_release_test_collection(
        test_collection_file)

    if tmpdir:
        shutil.rmtree(tmpdir, ignore_errors=True)

    frequency = settings["frequency"]
    prefer_smoke_tests = settings["prefer_smoke_tests"]
    test_attr_regex_filters = settings["test_attr_regex_filters"]
    ray_wheels = settings["ray_wheels"]
    priority = settings["priority"]

    logger.info(
        f"Found the following buildkite pipeline settings:\n\n"
        f"  frequency =               {settings['frequency']}\n"
        f"  prefer_smoke_tests =      {settings['prefer_smoke_tests']}\n"
        f"  test_attr_regex_filters = {settings['test_attr_regex_filters']}\n"
        f"  ray_wheels =              {settings['ray_wheels']}\n"
        f"  ray_test_repo =           {settings['ray_test_repo']}\n"
        f"  ray_test_branch =         {settings['ray_test_branch']}\n"
        f"  priority =                {settings['priority']}\n"
        f"  no_concurrency_limit =    {settings['no_concurrency_limit']}\n")

    filtered_tests = filter_tests(
        test_collection,
        frequency=frequency,
        test_attr_regex_filters=test_attr_regex_filters,
        prefer_smoke_tests=prefer_smoke_tests,
    )
    logger.info(f"Found {len(filtered_tests)} tests to run.")
    if len(filtered_tests) == 0:
        raise ReleaseTestCLIError(
            "Empty test collection. The selected frequency or filter did "
            "not return any tests to run. Adjust your filters.")
    grouped_tests = group_tests(filtered_tests)

    group_str = ""
    for group, tests in grouped_tests.items():
        group_str += f"\n{group}:\n"
        for test, smoke in tests:
            group_str += f"  {test['name']}"
            if smoke:
                group_str += " [smoke test]"
            group_str += "\n"

    logger.info(f"Tests to run:\n{group_str}")

    # Wait for wheels here so we have them ready before we kick off
    # the other workers
    ray_wheels_url = find_and_wait_for_ray_wheels_url(
        ray_wheels, timeout=DEFAULT_WHEEL_WAIT_TIMEOUT)
    logger.info(f"Starting pipeline for Ray wheel: {ray_wheels_url}")

    no_concurrency_limit = settings["no_concurrency_limit"]
    if no_concurrency_limit:
        logger.warning("Concurrency is not limited for this run!")

    # Report if REPORT=1 or BUILDKITE_SOURCE=schedule
    report = (bool(int(os.environ.get("REPORT", "0")))
              or os.environ.get("BUILDKITE_SOURCE", "manual") == "schedule")

    steps = []
    for group in sorted(grouped_tests):
        tests = grouped_tests[group]
        group_steps = []
        for test, smoke_test in tests:
            # If the python version is defined, we need a different Ray wheels URL
            if "python" in test:
                python_version = parse_python_version(test["python"])
                this_ray_wheels_url = find_ray_wheels_url(
                    ray_wheels, python_version=python_version)
            else:
                this_ray_wheels_url = ray_wheels_url

            step = get_step(
                test,
                report=report,
                smoke_test=smoke_test,
                ray_wheels=this_ray_wheels_url,
                env=env,
                priority_val=priority.value,
            )

            if no_concurrency_limit:
                step.pop("concurrency", None)
                step.pop("concurrency_group", None)

            group_steps.append(step)

        group_step = {"group": group, "steps": group_steps}
        steps.append(group_step)

    if "BUILDKITE" in os.environ:
        if os.path.exists(PIPELINE_ARTIFACT_PATH):
            shutil.rmtree(PIPELINE_ARTIFACT_PATH)

        os.makedirs(PIPELINE_ARTIFACT_PATH, exist_ok=True, mode=0o755)

        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "pipeline.json"),
                  "wt") as fp:
            json.dump(steps, fp)

        settings["frequency"] = settings["frequency"].value
        settings["priority"] = settings["priority"].value
        with open(os.path.join(PIPELINE_ARTIFACT_PATH, "settings.json"),
                  "wt") as fp:
            json.dump(settings, fp)

    steps_str = json.dumps(steps)
    print(steps_str)
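A minimal sketch of the grouped pipeline layout that main() prints at the end; the step contents below are invented for illustration only.

import json

steps = [
    {
        "group": "core",
        "steps": [
            {
                "label": "example_test [smoke test]",
                "command": "./run_release_test.sh example_test",
                "concurrency": 5,
                "concurrency_group": "small",
            }
        ],
    }
]
print(json.dumps(steps, indent=2))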