Ejemplo n.º 1
0
    def __init__(self, cluster_manager: ClusterManager):
        """Set up the S3 client, job manager, and SDK used for file transfer."""
        super(JobFileManager, self).__init__(cluster_manager=cluster_manager)

        self.job_manager = JobManager(cluster_manager)
        self.bucket = RELEASE_AWS_BUCKET
        self.s3_client = boto3.client("s3")
        self.sdk = self.cluster_manager.sdk

        # Make the Anyscale-bundled Ray binaries resolvable via sys.path.
        sys.path.insert(0, f"{anyscale.ANYSCALE_RAY_DIR}/bin")
Ejemplo n.º 2
0
    def __init__(
        self,
        cluster_manager: ClusterManager,
        file_manager: FileManager,
        working_dir: str,
        sdk: Optional[AnyscaleSDK] = None,
    ):
        """Initialize the runner; creates a default SDK when none is given."""
        super(JobRunner, self).__init__(
            cluster_manager=cluster_manager,
            file_manager=file_manager,
            working_dir=working_dir,
        )
        self.job_manager = JobManager(cluster_manager)
        # Fall back to a freshly constructed SDK if the caller passed none.
        self.sdk = sdk or get_anyscale_sdk()

        # ID of the last session command run; not yet assigned by this class.
        self.last_command_scd_id = None
Ejemplo n.º 3
0
class JobRunner(CommandRunner):
    """Command runner that executes release-test commands via Anyscale jobs.

    Prepares the local and remote environments, waits for cluster nodes,
    runs commands through a ``JobManager``, and fetches JSON results via
    the file manager.
    """

    def __init__(
        self,
        cluster_manager: ClusterManager,
        file_manager: FileManager,
        working_dir: str,
        sdk: Optional[AnyscaleSDK] = None,
    ):
        super(JobRunner, self).__init__(
            cluster_manager=cluster_manager,
            file_manager=file_manager,
            working_dir=working_dir,
        )
        self.sdk = sdk or get_anyscale_sdk()
        self.job_manager = JobManager(cluster_manager)

        # ID of the last session command run; not assigned by this class.
        self.last_command_scd_id = None

    def prepare_local_env(self, ray_wheels_url: Optional[str] = None):
        """Install a matching Ray locally so job submission works.

        Args:
            ray_wheels_url: Wheel URL to install; falls back to the
                ``RAY_WHEELS`` environment variable when not given.

        Raises:
            LocalEnvSetupError: If the local Ray installation fails.
        """
        try:
            install_matching_ray_locally(ray_wheels_url
                                         or os.environ.get("RAY_WHEELS", None))
        except Exception as e:
            raise LocalEnvSetupError(
                f"Error setting up local environment: {e}") from e

    def prepare_remote_env(self):
        """Stage the wait script and upload the working dir to the cluster.

        Raises:
            RemoteEnvSetupError: If uploading the working dir fails.
        """
        # Hard-link the wait script into the working dir so it gets uploaded
        # together with the rest of the files.
        wait_script = os.path.join(os.path.dirname(__file__),
                                   "_wait_cluster.py")
        if os.path.exists("wait_cluster.py"):
            os.unlink("wait_cluster.py")
        os.link(wait_script, "wait_cluster.py")

        try:
            self.file_manager.upload()
        except Exception as e:
            logger.exception(e)
            raise RemoteEnvSetupError(
                f"Error setting up remote environment: {e}") from e

    def wait_for_nodes(self, num_nodes: int, timeout: float = 900):
        """Block until ``num_nodes`` nodes are up or ``timeout`` expires.

        Raises:
            ClusterNodesWaitTimeout: If the nodes do not come up in time.
        """
        # Wait script should be uploaded already. Kick off command.
        try:
            # Give 30 seconds more to account for communication overhead.
            self.run_prepare_command(
                f"python wait_cluster.py {num_nodes} {timeout}",
                timeout=timeout + 30)
        except (CommandError, CommandTimeout) as e:
            raise ClusterNodesWaitTimeout(
                f"Not all {num_nodes} nodes came up within {timeout} seconds."
            ) from e

    def run_command(self,
                    command: str,
                    env: Optional[Dict] = None,
                    timeout: float = 3600.0) -> float:
        """Run ``command`` on the cluster as a job and return its duration.

        Args:
            command: Shell command to execute on the cluster.
            env: Extra environment variables for the command.
            timeout: Seconds to wait for the command to finish.

        Returns:
            Time taken in seconds, as reported by the job manager.

        Raises:
            CommandError: If the command exits with a non-zero status.
        """
        full_env = self.get_full_command_env(env)

        # Prefix the command with KEY=VALUE pairs so the environment travels
        # with the command string itself.
        if full_env:
            env_str = " ".join(f"{k}={v}" for k, v in full_env.items()) + " "
        else:
            env_str = ""

        full_command = f"{env_str}{command}"
        logger.info(
            f"Running command in cluster {self.cluster_manager.cluster_name}: "
            f"{full_command}")

        logger.info(f"Link to cluster: "
                    f"{format_link(self.cluster_manager.get_cluster_url())}")

        status_code, time_taken = self.job_manager.run_and_wait(
            full_command, full_env, timeout=timeout)

        if status_code != 0:
            raise CommandError(
                f"Command returned non-success status: {status_code}")

        return time_taken

    def get_last_logs(self, scd_id: Optional[str] = None):
        """Return the logs of the last job run.

        Args:
            scd_id: Unused; kept for interface compatibility with other
                runners.

        Raises:
            LogsError: If the logs could not be retrieved.
        """
        try:
            return self.job_manager.get_last_logs()
        except Exception as e:
            raise LogsError(f"Could not get last logs: {e}") from e

    def fetch_results(self) -> Dict[str, Any]:
        """Download and parse the JSON results file from the session.

        Returns:
            Parsed result dictionary.

        Raises:
            ResultsError: If the results could not be fetched or parsed.
        """
        try:
            # mkstemp returns an open file descriptor; close it immediately
            # so it does not leak (we re-open the file by path below).
            fd, tmpfile = tempfile.mkstemp(suffix=".json")
            os.close(fd)
            logger.info(tmpfile)
            try:
                self.file_manager.download(self.result_output_json, tmpfile)

                with open(tmpfile, "rt") as f:
                    data = json.load(f)
            finally:
                # Remove the temp file even when download/parsing fails.
                os.unlink(tmpfile)

            return data
        except Exception as e:
            raise ResultsError(
                f"Could not fetch results from session: {e}") from e
Ejemplo n.º 4
0
class JobFileManager(FileManager):
    """File manager that moves files between the local machine and a cluster
    through a temporary S3 object, using Anyscale jobs for the cluster-side
    copy (via ``aws s3 cp`` run on the cluster).
    """

    def __init__(self, cluster_manager: ClusterManager):
        import anyscale

        super(JobFileManager, self).__init__(cluster_manager=cluster_manager)

        self.sdk = self.cluster_manager.sdk
        self.s3_client = boto3.client("s3")
        self.bucket = str(RELEASE_AWS_BUCKET)
        self.job_manager = JobManager(cluster_manager)

        # Make the Anyscale-bundled Ray binaries resolvable via sys.path.
        sys.path.insert(0, f"{anyscale.ANYSCALE_RAY_DIR}/bin")

    def _run_with_retry(self, f, initial_retry_delay_s: int = 10):
        """Run callable ``f`` with exponential backoff (up to 3 retries)."""
        assert callable(f)
        return exponential_backoff_retry(
            f,
            retry_exceptions=Exception,
            initial_retry_delay_s=initial_retry_delay_s,
            max_retries=3,
        )

    def _generate_tmp_s3_path(self):
        """Return a random ``tmp/<10 lowercase letters>`` S3 key."""
        fn = "".join(random.choice(string.ascii_lowercase) for _ in range(10))
        location = f"tmp/{fn}"
        return location

    def _delete_tmp_s3_object(self, key: str):
        """Best-effort removal of a temporary S3 object; warn on failure."""
        try:
            self._run_with_retry(
                lambda: self.s3_client.delete_object(
                    Bucket=self.bucket, Key=key
                ),
                initial_retry_delay_s=2,
            )
        except Exception as e:
            logger.warning(f"Could not remove temporary S3 object: {e}")

    def download(self, source: str, target: str):
        """Download a single remote file to a local path via S3.

        The remote file is first copied from the cluster to a temporary S3
        object, then downloaded locally; the temporary object is removed
        afterwards (best effort).

        Args:
            source: Remote (cluster-side) file path.
            target: Local destination path.

        Raises:
            FileDownloadError: If the cluster-side copy to S3 fails.
        """
        # Attention: Only works for single files at the moment
        remote_upload_to = self._generate_tmp_s3_path()
        # remote source -> s3
        bucket_address = f"s3://{self.bucket}/{remote_upload_to}"
        retcode, _ = self._run_with_retry(
            lambda: self.job_manager.run_and_wait(
                (
                    f"pip install -q awscli && "
                    f"aws s3 cp {source} {bucket_address} "
                    "--acl bucket-owner-full-control"
                ),
                {},
            )
        )

        if retcode != 0:
            raise FileDownloadError(f"Error downloading file {source} to {target}")

        # s3 -> local target
        self._run_with_retry(
            lambda: self.s3_client.download_file(
                Bucket=self.bucket,
                Key=remote_upload_to,
                Filename=target,
            )
        )

        # Cleanup failures must not fail an otherwise successful download
        # (consistent with upload() and _push_local_dir()).
        self._delete_tmp_s3_object(remote_upload_to)

    def _push_local_dir(self):
        """Archive the CWD, upload it via S3, and unpack it on the cluster.

        Raises:
            FileUploadError: If the cluster-side download/unpack fails.
        """
        remote_upload_to = self._generate_tmp_s3_path()
        # Pack the local dir. mkstemp returns an open fd; close it so it
        # does not leak (make_archive writes by path).
        fd, local_path = tempfile.mkstemp()
        os.close(fd)
        shutil.make_archive(local_path, "gztar", os.getcwd())
        # local source -> s3
        self._run_with_retry(
            lambda: self.s3_client.upload_file(
                Filename=local_path + ".tar.gz",
                Bucket=self.bucket,
                Key=remote_upload_to,
            )
        )
        # Remove both the local archive and the bare mkstemp file
        # (previously only the latter was removed, leaking the archive).
        os.unlink(local_path + ".tar.gz")
        os.unlink(local_path)

        bucket_address = f"s3://{self.bucket}/{remote_upload_to}"
        # s3 -> remote target
        retcode, _ = self.job_manager.run_and_wait(
            f"pip install -q awscli && "
            f"aws s3 cp {bucket_address} archive.tar.gz && "
            f"tar xf archive.tar.gz ",
            {},
        )
        if retcode != 0:
            raise FileUploadError(
                f"Error uploading local dir to session "
                f"{self.cluster_manager.cluster_name}."
            )
        self._delete_tmp_s3_object(remote_upload_to)

    def upload(self, source: Optional[str] = None, target: Optional[str] = None):
        """Upload a file (or the whole working dir) to the cluster via S3.

        Args:
            source: Local file path. If both ``source`` and ``target`` are
                None, the whole current working directory is pushed instead.
            target: Remote (cluster-side) destination path.

        Raises:
            FileUploadError: If the cluster-side copy fails.
        """
        if source is None and target is None:
            self._push_local_dir()
            return

        assert isinstance(source, str)
        assert isinstance(target, str)

        remote_upload_to = self._generate_tmp_s3_path()

        # local source -> s3
        self._run_with_retry(
            lambda: self.s3_client.upload_file(
                Filename=source,
                Bucket=self.bucket,
                Key=remote_upload_to,
            )
        )

        # s3 -> remote target
        bucket_address = f"s3://{self.bucket}/{remote_upload_to}"
        retcode, _ = self.job_manager.run_and_wait(
            "pip install -q awscli && " f"aws s3 cp {bucket_address} {target}",
            {},
        )

        if retcode != 0:
            raise FileUploadError(f"Error uploading file {source} to {target}")

        self._delete_tmp_s3_object(remote_upload_to)