Ejemplo n.º 1
0
    def build_configs(self, timeout: float = 30.0):
        """Create the cluster compute and cluster env, then build the env.

        The three steps run in order. For every step:

        * an ``AssertionError`` is only logged as a warning (the step is
          treated as already done) and execution continues;
        * the step's own error type(s) propagate unchanged;
        * any other exception is wrapped in the step's error type, with the
          original exception attached as its cause.

        Args:
            timeout: Forwarded to ``build_cluster_env`` as its ``timeout``.

        Raises:
            ClusterComputeCreateError: Creating the cluster compute failed.
            ClusterEnvCreateError: Creating the cluster env failed.
            ClusterEnvBuildError: Building the cluster env failed.
            ClusterEnvBuildTimeout: Raised by ``build_cluster_env`` and
                passed through untouched.
        """
        # Step 1: cluster compute.
        try:
            self.create_cluster_compute()
        except AssertionError as already_exists:
            # Already present -- nothing to create, just note it.
            logger.warning(str(already_exists))
        except ClusterComputeCreateError:
            raise
        except Exception as cause:
            message = f"Unexpected cluster compute build error: {cause}"
            raise ClusterComputeCreateError(message) from cause

        # Step 2: cluster environment.
        try:
            self.create_cluster_env()
        except AssertionError as already_exists:
            # Already present -- nothing to create, just note it.
            logger.warning(str(already_exists))
        except ClusterEnvCreateError:
            raise
        except Exception as cause:
            message = f"Unexpected cluster env create error: {cause}"
            raise ClusterEnvCreateError(message) from cause

        # Step 3: build of the cluster environment.
        try:
            self.build_cluster_env(timeout=timeout)
        except AssertionError as already_exists:
            # Already present -- nothing to build, just note it.
            logger.warning(str(already_exists))
        except (ClusterEnvBuildError, ClusterEnvBuildTimeout):
            raise
        except Exception as cause:
            message = f"Unexpected cluster env build error: {cause}"
            raise ClusterEnvBuildError(message) from cause
Ejemplo n.º 2
0
    def build_cluster_env(self, timeout: float = 600.0):
        """Make sure the cluster env has a succeeded build and record its id.

        The most recently created build of ``self.cluster_env_id`` is looked
        up. If it already succeeded, its id is stored right away. If it
        failed, a new build is started with the same ``config_json``. In all
        remaining cases the build is polled once per second until it
        succeeds, fails, reports an unknown status, or ``timeout`` expires.

        On success the build id is stored in ``self.cluster_env_build_id``.

        Args:
            timeout: Maximum number of seconds to spend polling the build.

        Raises:
            AssertionError: ``self.cluster_env_id`` is not set, or
                ``self.cluster_env_build_id`` is already set.
            ClusterEnvBuildError: No build exists for the cluster env, the
                build failed, or it reported an unknown status.
            ClusterEnvBuildTimeout: The build did not finish within
                ``timeout`` seconds.
        """
        # Preconditions are raised explicitly instead of using ``assert`` so
        # they are still enforced under ``python -O`` and carry a message.
        # The exception type stays AssertionError because build_configs
        # catches exactly that type and logs str() of it.
        if not self.cluster_env_id:
            raise AssertionError(
                "Cluster env ID is not set; cannot build cluster env."
            )
        if self.cluster_env_build_id is not None:
            raise AssertionError(
                f"Cluster env build ID is already set: "
                f"{self.cluster_env_build_id}"
            )

        # Fetch build
        result = self.sdk.list_cluster_environment_builds(self.cluster_env_id)
        if not result or not result.results:
            raise ClusterEnvBuildError(f"No build found for cluster env: {result}")

        # Pick the build with the latest creation time.
        build = sorted(result.results, key=lambda b: b.created_at)[-1]
        build_id = build.id
        last_status = build.status

        if last_status == "succeeded":
            logger.info(
                f"Link to succeeded cluster env build: "
                f"{format_link(anyscale_cluster_env_build_url(build_id))}"
            )
            self.cluster_env_build_id = build_id
            return

        if last_status == "failed":
            logger.info(f"Previous cluster env build failed: {build.error_message}")
            logger.info("Starting new cluster env build...")

            # Retry build with the configuration of the failed one
            result = self.sdk.create_cluster_environment_build(
                dict(
                    cluster_environment_id=self.cluster_env_id,
                    config_json=build.config_json,
                )
            )
            build_id = result.result.id

            logger.info(
                f"Link to created cluster env build: "
                f"{format_link(anyscale_cluster_env_build_url(build_id))}"
            )

        # Build found (or just created) but not failed/finished yet.
        # A single monotonic clock drives both the progress reports and the
        # timeout, so wall-clock adjustments cannot distort either.
        start_wait = time.monotonic()
        next_report = start_wait + REPORT_S
        timeout_at = start_wait + timeout
        logger.info(f"Waiting for build {build_id} to finish...")
        logger.info(
            f"Track progress here: "
            f"{format_link(anyscale_cluster_env_build_url(build_id))}"
        )
        # Every exit from this loop is a return (success) or a raise.
        while True:
            now = time.monotonic()
            if now > next_report:
                logger.info(
                    f"... still waiting for build {build_id} to finish "
                    f"({int(now - start_wait)} seconds) ..."
                )
                next_report += REPORT_S

            build = self.sdk.get_build(build_id).result

            if build.status == "failed":
                raise ClusterEnvBuildError(
                    f"Cluster env build failed. Please see "
                    f"{anyscale_cluster_env_build_url(build_id)} for details. "
                    f"Error message: {build.error_message}"
                )

            if build.status == "succeeded":
                logger.info("Build succeeded.")
                self.cluster_env_build_id = build_id
                return

            # Only these two statuses mean "keep waiting".
            if build.status not in ("in_progress", "pending"):
                raise ClusterEnvBuildError(
                    f"Unknown build status: {build.status}. Please see "
                    f"{anyscale_cluster_env_build_url(build_id)} for details"
                )

            if time.monotonic() > timeout_at:
                raise ClusterEnvBuildTimeout(
                    f"Time out when building cluster env {self.cluster_env_name}"
                )

            time.sleep(1)