コード例 #1
0
ファイル: job_runner.py プロジェクト: vishalbelsare/ray
    def run_command(self,
                    command: str,
                    env: Optional[Dict] = None,
                    timeout: float = 3600.0) -> float:
        """Run a shell command on the cluster through the job manager.

        The environment produced by ``get_full_command_env`` is used twice:
        it is rendered as a ``KEY=value`` prefix in front of ``command`` and
        it is also handed to ``job_manager.run_and_wait`` as a mapping.

        Args:
            command: Shell command to execute.
            env: Optional extra environment variables, forwarded to
                ``get_full_command_env``.
            timeout: Forwarded unchanged to ``job_manager.run_and_wait``
                (presumably seconds -- confirm against the job manager).

        Returns:
            The time-taken value reported by ``job_manager.run_and_wait``.

        Raises:
            CommandError: If ``run_and_wait`` reports a status code other
                than 0.
        """
        merged_env = self.get_full_command_env(env)

        # Build the "K1=v1 K2=v2 " prefix; stays empty when there is no env.
        env_prefix = ""
        if merged_env:
            assignments = [
                f"{name}={value}" for name, value in merged_env.items()
            ]
            env_prefix = " ".join(assignments) + " "

        shell_line = f"{env_prefix}{command}"

        cluster_name = self.cluster_manager.cluster_name
        logger.info(
            f"Running command in cluster {cluster_name}: "
            f"{shell_line}")

        cluster_link = format_link(self.cluster_manager.get_cluster_url())
        logger.info(f"Link to cluster: "
                    f"{cluster_link}")

        exit_status, elapsed = self.job_manager.run_and_wait(
            shell_line, merged_env, timeout=timeout)

        # Guard clause: the success path returns straight away.
        if exit_status == 0:
            return elapsed

        raise CommandError(
            f"Command returned non-success status: {exit_status}")
コード例 #2
0
    def run_command(
        self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
    ) -> float:
        """Run a shell command as a session command and wait for it to finish.

        The environment returned by ``get_full_command_env`` is rendered as a
        ``KEY=value`` prefix in front of ``command``. The resulting string is
        submitted with ``sdk.create_session_command`` and the command is then
        polled once per second with ``sdk.get_session_command`` until its
        ``finished_at`` field is set.

        Args:
            command: Shell command to execute.
            env: Optional extra environment variables, forwarded to
                ``get_full_command_env``.
            timeout: Maximum number of seconds to wait, measured with
                ``time.monotonic()`` from the moment the command has been
                created.

        Returns:
            Seconds elapsed (monotonic clock) between command creation and
            the moment completion was observed.

        Raises:
            CommandTimeout: If the command has not finished within
                ``timeout`` seconds.
            CommandError: If the finished command reports a status code
                other than 0.
        """
        full_env = self.get_full_command_env(env)

        if full_env:
            env_str = " ".join(f"{k}={v}" for k, v in full_env.items()) + " "
        else:
            env_str = ""

        full_command = f"{env_str}{command}"
        logger.info(
            f"Running command in cluster {self.cluster_manager.cluster_name}: "
            f"{full_command}"
        )

        logger.info(
            f"Link to cluster: "
            f"{format_link(self.cluster_manager.get_cluster_url())}"
        )

        result = self.sdk.create_session_command(
            dict(session_id=self.cluster_manager.cluster_id, shell_command=full_command)
        )

        # Remember the command id on the instance before polling starts.
        scd_id = result.result.id
        self.last_command_scd_id = scd_id

        completed = result.result.finished_at is not None

        start_time = time.monotonic()
        timeout_at = start_time + timeout
        next_status = start_time + 30

        while not completed:
            now = time.monotonic()
            if now >= timeout_at:
                raise CommandTimeout(
                    f"Cluster command timed out after {timeout} seconds."
                )

            # Emit a progress line every 30 seconds while waiting.
            if now >= next_status:
                logger.info(
                    f"... command still running ..."
                    f"({int(now - start_time)} seconds) ..."
                )
                next_status += 30

            # Sleep 1 sec before next check.
            time.sleep(1)

            result = exponential_backoff_retry(
                lambda: self.sdk.get_session_command(session_command_id=scd_id),
                retry_exceptions=Exception,
                initial_retry_delay_s=10,
                max_retries=3,
            )
            # Same completion test as the initial check above, so the flag
            # stays a bool and both checks agree on what "finished" means.
            completed = result.result.finished_at is not None

        status_code = result.result.status_code
        time_taken = time.monotonic() - start_time

        if status_code != 0:
            raise CommandError(f"Command returned non-success status: {status_code}")

        return time_taken
コード例 #3
0
ファイル: client_runner.py プロジェクト: vishalbelsare/ray
    def run_command(
        self, command: str, env: Optional[Dict] = None, timeout: float = 3600.0
    ) -> float:
        """Run a command in a local subprocess and stream its output.

        The subprocess environment is built from ``os.environ``, the
        caller-supplied ``env`` and a few fixed variables (``RAY_ADDRESS``,
        ``RAY_JOB_NAME``, ``PYTHONUNBUFFERED``), passed through
        ``get_full_command_env``. Combined stdout/stderr is echoed to
        ``sys.stdout`` line by line, and the last ``LAST_LOGS_LENGTH`` lines
        are stored in ``self.last_logs``. A watchdog thread terminates the
        subprocess once ``timeout`` seconds have passed.

        Args:
            command: Shell command line, executed with ``shell=True``.
            env: Optional extra environment variables; these override
                ``os.environ`` but are overridden by the fixed variables.
            timeout: Seconds after which the subprocess is terminated.

        Returns:
            Seconds elapsed (monotonic clock) from just before the subprocess
            was started until it exited.

        Raises:
            CommandTimeout: If the watchdog terminated the subprocess, or the
                return code is -15 or 15.
            CommandError: If the subprocess exited with any other non-zero
                return code.
        """
        logger.info(
            f"Running command using Ray client on cluster "
            f"{self.cluster_manager.cluster_name}: {command}"
        )

        env = env or {}
        full_env = self.get_full_command_env(
            {
                **os.environ,
                **env,
                "RAY_ADDRESS": self.cluster_manager.get_cluster_address(),
                "RAY_JOB_NAME": "test_job",
                "PYTHONUNBUFFERED": "1",
            }
        )

        kill_event = threading.Event()

        def _kill_after(
            proc: subprocess.Popen,
            timeout: float = 30,
            kill_event: Optional[threading.Event] = None,
        ):
            # Watchdog: poll once per second, terminate once the deadline hits.
            timeout_at = time.monotonic() + timeout
            while time.monotonic() < timeout_at:
                if proc.poll() is not None:
                    return
                time.sleep(1)
            # The process may have exited during the final sleep; do not
            # report a timeout for a command that actually finished.
            if proc.poll() is not None:
                return
            logger.info(
                f"Client command timed out after {timeout} seconds, "
                f"killing subprocess."
            )
            if kill_event:
                kill_event.set()
            proc.terminate()

        start_time = time.monotonic()
        proc = subprocess.Popen(
            command,
            env=full_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
            text=True,
        )

        kill_thread = threading.Thread(
            target=_kill_after, args=(proc, timeout, kill_event)
        )
        kill_thread.start()

        proc.stdout.reconfigure(line_buffering=True)
        # Remember the current setting so it can be restored afterwards
        # instead of being forced to False.
        was_line_buffered = getattr(sys.stdout, "line_buffering", False)
        sys.stdout.reconfigure(line_buffering=True)
        logs = deque(maxlen=LAST_LOGS_LENGTH)
        try:
            for line in proc.stdout:
                logs.append(line)
                sys.stdout.write(line)
            proc.wait()
        finally:
            # Restore stdout buffering even if streaming raised.
            sys.stdout.reconfigure(line_buffering=was_line_buffered)
        time_taken = time.monotonic() - start_time
        # Lines read from the pipe keep their trailing newline, so join
        # without a separator to avoid doubling every line break.
        self.last_logs = "".join(logs)

        return_code = proc.poll()
        # A return code of -15 is how subprocess reports termination by
        # signal 15 (SIGTERM) on POSIX.
        # NOTE(review): the +15 check presumably covers a wrapper reporting
        # the signal as a positive exit status -- confirm.
        if return_code == -15 or return_code == 15 or kill_event.is_set():
            # Process has been terminated
            raise CommandTimeout(f"Cluster command timed out after {timeout} seconds.")
        if return_code != 0:
            raise CommandError(f"Command returned non-success status: {return_code}")

        logger.warning(f"WE GOT RETURN CODE {return_code} AFTER {time_taken}")

        return time_taken