def run_command(
    self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
) -> float:
    """Run ``command`` through the job manager and return its duration.

    The environment produced by ``self.get_full_command_env(env)`` is
    rendered as space-separated ``KEY=value`` pairs in front of the command
    and is also passed on to the job manager.

    Args:
        command: Command line to execute.
        env: Optional extra environment, forwarded to
            ``get_full_command_env``.
        timeout: Timeout forwarded to ``job_manager.run_and_wait``.

    Returns:
        The time value reported by ``job_manager.run_and_wait``.

    Raises:
        CommandError: If the reported status code is not ``0``.
    """
    full_env = self.get_full_command_env(env)

    # Each pair carries its own trailing space, so a non-empty environment
    # ends in exactly one separator before the command itself.
    env_prefix = (
        "".join(f"{key}={value} " for key, value in full_env.items())
        if full_env
        else ""
    )
    full_command = f"{env_prefix}{command}"

    cluster = self.cluster_manager
    logger.info(f"Running command in cluster {cluster.cluster_name}: {full_command}")

    cluster_link = format_link(cluster.get_cluster_url())
    logger.info(f"Link to cluster: {cluster_link}")

    status_code, time_taken = self.job_manager.run_and_wait(
        full_command, full_env, timeout=timeout
    )

    if status_code == 0:
        return time_taken

    raise CommandError(f"Command returned non-success status: {status_code}")
def run_command(
    self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
) -> float:
    """Submit ``command`` as a session command via the SDK and wait for it.

    The environment produced by ``self.get_full_command_env(env)`` is
    rendered as space-separated ``KEY=value`` pairs in front of the command.
    After submission the command is polled about once per second until its
    ``finished_at`` field is set; a progress message is logged every
    30 seconds. The id of the submitted command is stored on
    ``self.last_command_scd_id``.

    Args:
        command: Shell command to execute.
        env: Optional extra environment, forwarded to
            ``get_full_command_env``.
        timeout: Maximum number of seconds to wait, counted from just after
            the command was submitted.

    Returns:
        Seconds elapsed between submission and the end of the wait loop.

    Raises:
        CommandTimeout: If the command has not finished within ``timeout``.
        CommandError: If the command finished with a non-zero status code.
    """
    full_env = self.get_full_command_env(env)
    env_prefix = (
        "".join(f"{key}={value} " for key, value in full_env.items())
        if full_env
        else ""
    )
    full_command = f"{env_prefix}{command}"

    cluster = self.cluster_manager
    logger.info(f"Running command in cluster {cluster.cluster_name}: {full_command}")

    cluster_link = format_link(cluster.get_cluster_url())
    logger.info(f"Link to cluster: {cluster_link}")

    response = self.sdk.create_session_command(
        {"session_id": cluster.cluster_id, "shell_command": full_command}
    )
    scd_id = response.result.id
    self.last_command_scd_id = scd_id

    def _fetch_status():
        # Single status lookup; retried by exponential_backoff_retry below.
        return self.sdk.get_session_command(session_command_id=scd_id)

    finished = response.result.finished_at is not None

    started = time.monotonic()
    deadline = started + timeout
    next_report = started + 30

    while not finished:
        now = time.monotonic()

        if now >= deadline:
            raise CommandTimeout(
                f"Cluster command timed out after {timeout} seconds."
            )

        if now >= next_report:
            logger.info(
                f"... command still running ...({int(now - started)} seconds) ..."
            )
            next_report += 30

        # Sleep 1 sec before next check.
        time.sleep(1)

        response = exponential_backoff_retry(
            _fetch_status,
            retry_exceptions=Exception,
            initial_retry_delay_s=10,
            max_retries=3,
        )
        # Any truthy ``finished_at`` value ends the loop.
        finished = response.result.finished_at

    status_code = response.result.status_code
    time_taken = time.monotonic() - started

    if status_code == 0:
        return time_taken

    raise CommandError(f"Command returned non-success status: {status_code}")
def run_command(
    self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
) -> float:
    """Run ``command`` as a local subprocess pointed at the cluster.

    The subprocess environment is built from ``os.environ``, the supplied
    ``env`` and the ``RAY_ADDRESS`` / ``RAY_JOB_NAME`` / ``PYTHONUNBUFFERED``
    entries, passed through ``self.get_full_command_env``. Output (stdout
    and stderr combined) is echoed to ``sys.stdout`` line by line, and the
    last ``LAST_LOGS_LENGTH`` lines are stored on ``self.last_logs``.

    A watchdog thread terminates the subprocess once ``timeout`` seconds
    have passed.

    Args:
        command: Shell command line to execute (run with ``shell=True``).
        env: Optional extra environment variables; they override values
            from ``os.environ``.
        timeout: Number of seconds after which the subprocess is terminated.

    Returns:
        Wall-clock seconds between starting the subprocess and its exit.

    Raises:
        CommandTimeout: If the watchdog terminated the subprocess, or the
            return code is ``15`` / ``-15``.
        CommandError: If the subprocess exited with any other non-zero code.
    """
    logger.info(
        f"Running command using Ray client on cluster "
        f"{self.cluster_manager.cluster_name}: {command}"
    )

    env = env or {}
    full_env = self.get_full_command_env(
        {
            **os.environ,
            **env,
            "RAY_ADDRESS": self.cluster_manager.get_cluster_address(),
            "RAY_JOB_NAME": "test_job",
            "PYTHONUNBUFFERED": "1",
        }
    )

    kill_event = threading.Event()

    def _kill_after(
        proc: subprocess.Popen,
        timeout: float = 30,
        kill_event: Optional[threading.Event] = None,
    ):
        """Terminate ``proc`` if it is still running after ``timeout`` seconds."""
        timeout_at = time.monotonic() + timeout
        while time.monotonic() < timeout_at:
            if proc.poll() is not None:
                return
            time.sleep(1)

        # The process may have exited during the final sleep; do not report
        # a timeout for a command that actually finished on its own.
        if proc.poll() is not None:
            return

        logger.info(
            f"Client command timed out after {timeout} seconds, "
            f"killing subprocess."
        )
        if kill_event is not None:
            kill_event.set()
        # NOTE(review): with shell=True this signals the shell process;
        # confirm that the command's own child processes exit as well.
        proc.terminate()

    start_time = time.monotonic()
    proc = subprocess.Popen(
        command,
        env=full_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        shell=True,
        text=True,
    )

    kill_thread = threading.Thread(
        target=_kill_after, args=(proc, timeout, kill_event)
    )
    kill_thread.start()

    proc.stdout.reconfigure(line_buffering=True)
    # Remember the current setting so it can be restored afterwards instead
    # of unconditionally switching line buffering off.
    previous_line_buffering = getattr(sys.stdout, "line_buffering", False)
    sys.stdout.reconfigure(line_buffering=True)

    logs = deque(maxlen=LAST_LOGS_LENGTH)
    try:
        for line in proc.stdout:
            logs.append(line)
            sys.stdout.write(line)
        proc.wait()
    finally:
        # Restore stdout even if streaming the output was interrupted.
        sys.stdout.reconfigure(line_buffering=previous_line_buffering)

    time_taken = time.monotonic() - start_time
    # Lines read from the pipe keep their trailing newline, so they are
    # concatenated directly; joining with "\n" would double-space the logs.
    self.last_logs = "".join(logs)

    return_code = proc.poll()
    if return_code in (-15, 15) or kill_event.is_set():
        # Process has been terminated
        raise CommandTimeout(f"Cluster command timed out after {timeout} seconds.")

    if return_code != 0:
        raise CommandError(f"Command returned non-success status: {return_code}")

    # NOTE(review): this looks like leftover debug output emitted at warning
    # level on every successful run; consider rewording or lowering the level.
    logger.warning(f"WE GOT RETURN CODE {return_code} AFTER {time_taken}")

    return time_taken