Example #1
0
 def get_cluster_url(self) -> Optional[str]:
     """Return the URL of this cluster, or None if it cannot be built.

     The URL is produced by ``anyscale_cluster_url`` from the project id
     and the cluster id. When either of the two is unset (falsy), no URL
     is generated and None is returned instead.
     """
     if self.project_id and self.cluster_id:
         return anyscale_cluster_url(self.project_id, self.cluster_id)
     return None
Example #2
0
    def start_cluster(self, timeout: float = 600.0):
        """Create the cluster, request its start and wait until it is up.

        The method runs four steps in order: it creates the cluster through
        the SDK (storing the returned id in ``self.cluster_id``), asks the
        SDK to start it, polls the resulting cluster operation until it is
        reported as completed, and finally verifies that the cluster state
        is ``"Running"``.

        Args:
            timeout: Maximum time, in seconds, to wait for the start
                operation to complete once it has been requested.

        Raises:
            ClusterCreationError: If creating the cluster fails.
            ClusterStartupError: If requesting the cluster start fails.
            ClusterStartupTimeout: If the start operation has not completed
                when ``timeout`` seconds have elapsed.
            ClusterStartupFailed: If the operation completed but the cluster
                is not in the ``"Running"`` state.
        """
        logger.info(f"Creating cluster {self.cluster_name}")
        logger.info(f"Autosuspend time: {self.autosuspend_minutes} minutes")

        # Step 1: register the cluster. Any failure here (including while
        # assembling the request) is reported as a creation error.
        try:
            creation_request = {
                "name": self.cluster_name,
                "project_id": self.project_id,
                "cluster_environment_build_id": self.cluster_env_build_id,
                "cluster_compute_id": self.cluster_compute_id,
                "idle_timeout_minutes": self.autosuspend_minutes,
            }
            created = self.sdk.create_cluster(creation_request)
            self.cluster_id = created.result.id
        except Exception as exc:
            raise ClusterCreationError(
                f"Error creating cluster: {exc}") from exc

        # Step 2: kick off the start operation.
        logger.info(
            f"Starting cluster {self.cluster_name} ({self.cluster_id})")
        cluster_url = anyscale_cluster_url(
            project_id=self.project_id, session_id=self.cluster_id)
        logger.info(f"Link to cluster: {format_link(cluster_url)}")

        try:
            start_response = self.sdk.start_cluster(
                self.cluster_id, start_cluster_options={})
            cop_id = start_response.result.id
            is_done = start_response.result.completed
        except Exception as exc:
            raise ClusterStartupError(
                f"Error starting cluster with name "
                f"{self.cluster_name} and {self.cluster_id} ({cluster_url}): "
                f"{exc}") from exc

        # Step 3: poll the operation until it completes or we run out of time.
        logger.info(f"Waiting for cluster {self.cluster_name}...")

        began = time.monotonic()
        deadline = began + timeout
        next_report = began + 30
        while not is_done:
            current = time.monotonic()
            if current >= deadline:
                raise ClusterStartupTimeout(
                    f"Time out when creating cluster {self.cluster_name}")

            # Emit a progress line every 30 seconds so the wait is visible.
            if current >= next_report:
                logger.info(
                    f"... still waiting for cluster {self.cluster_name} "
                    f"({int(current - began)} seconds) ...")
                next_report += 30

            # Pause for one second between polls.
            time.sleep(1)

            operation = exponential_backoff_retry(
                lambda: self.sdk.get_cluster_operation(cop_id,
                                                       _request_timeout=30),
                retry_exceptions=Exception,
                initial_retry_delay_s=2,
                max_retries=3,
            )
            is_done = operation.result.completed

        # Step 4: a completed operation does not imply a running cluster,
        # so check the reported state explicitly.
        cluster_info = self.sdk.get_cluster(self.cluster_id)
        if cluster_info.result.state == "Running":
            return

        raise ClusterStartupFailed(
            f"Cluster did not come up - most likely the nodes are currently "
            f"not available. Please check the cluster startup logs: "
            f"{cluster_url} (cluster state: {cluster_info.result.state})")