Code example #1
File: kv_store.py  Project: stjordanis/ray
    def get(self, key: str) -> Optional[bytes]:
        """Get the value associated with the given key from the store.

        Args:
            key (str): The key to look up.

        Returns:
            The bytes value. If the key wasn't found, returns None.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))

        try:
            response = self._s3.get_object(Bucket=self._bucket,
                                           Key=self.get_storage_key(key))
            return response["Body"].read()
        except ClientError as e:
            if e.response["Error"]["Code"] == "NoSuchKey":
                logger.warning(f"No such key in s3 for key = {key}")
                return None
            else:
                message = e.response["Error"]["Message"]
                logger.error(f"Encountered ClientError while calling get() "
                             f"in RayExternalKVStore: {message}")
                raise e
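
For context, here is a minimal, hypothetical sketch of the class such a get() might live on; the constructor, bucket wiring, and key prefix below are assumptions for illustration, not the actual RayExternalKVStore API.

import logging

import boto3
from botocore.exceptions import ClientError

logger = logging.getLogger(__name__)


class S3KVStoreSketch:
    """Hypothetical stand-in for the store that owns the get() above."""

    def __init__(self, bucket: str, prefix: str = "serve-checkpoints"):
        self._bucket = bucket
        self._prefix = prefix
        # Assumes default AWS credentials are configured in the environment.
        self._s3 = boto3.client("s3")

    def get_storage_key(self, key: str) -> str:
        # Namespace keys under a prefix, as get() above implies.
        return f"{self._prefix}/{key}"


store = S3KVStoreSketch(bucket="my-serve-bucket")
print(store.get_storage_key("checkpoint"))  # serve-checkpoints/checkpoint
# With the get() above attached to this class, store.get("checkpoint")
# would return the stored bytes, or None if the S3 object is missing.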
Code example #2
File: long_poll.py  Project: zhang12574/ray
    def _process_update(self, updates: Dict[str, UpdatedObject]):
        if isinstance(updates, (ray.exceptions.RayActorError)):
            # This can happen during shutdown where the controller is
            # intentionally killed, the client should just gracefully
            # exit.
            logger.debug("LongPollClient failed to connect to host. "
                         "Shutting down.")
            return

        if isinstance(updates, (ray.exceptions.RayTaskError)):
            # This can happen during shutdown where the controller doesn't
            # contain this key, we will just repull.
            # NOTE(simon): should we repull or just wait in the long poll
            # host?
            if not isinstance(updates.as_instanceof_cause(), ValueError):
                logger.error("LongPollHost errored\n" + updates.traceback_str)
            self._poll_next()
            return

        # Before we process the updates and calling callbacks, kick off
        # another poll so we can pipeline the polling and processing.
        self._poll_next()
        logger.debug("LongPollClient received updates for keys: "
                     f"{list(updates.keys())}.")
        for key, update in updates.items():
            self.object_snapshots[key] = update.object_snapshot
            self.snapshot_ids[key] = update.snapshot_id
            callback = self.key_listeners[key]
            if self.event_loop is None:
                callback(update.object_snapshot)
            else:
                # Bind the loop variables via default arguments: closures are
                # late-binding, so a bare lambda would otherwise capture only
                # the final values of `callback` and `update` by the time the
                # event loop runs it.
                self.event_loop.call_soon_threadsafe(
                    lambda callback=callback, snapshot=update.object_snapshot:
                    callback(snapshot))
Code example #3
File: serve_test_utils.py  Project: wuisawesome/ray
def run_one_wrk_trial(
    trial_length: str,
    num_connections: int,
    http_host: str,
    http_port: str,
    endpoint: str = "",
) -> str:
    proc = subprocess.Popen(
        [
            "wrk",
            "-c",
            str(num_connections),
            "-t",
            str(NUM_CPU_PER_NODE),
            "-d",
            trial_length,
            "--latency",
            f"http://{http_host}:{http_port}/{endpoint}",
        ],
        stdout=PIPE,
        stderr=PIPE,
    )
    # communicate() waits for the process to exit; calling wait() first with
    # piped stdout/stderr risks deadlock once a pipe buffer fills.
    out, err = proc.communicate()

    if err.decode() != "":
        logger.error(err.decode())

    return out.decode()
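
A hypothetical invocation of the helper above, assuming wrk is installed and a Serve HTTP proxy is listening locally; NUM_CPU_PER_NODE and logger come from the surrounding module.

# Run a 10-second trial with 100 connections against a local endpoint.
# wrk's -d flag accepts duration strings such as "10s" or "1m".
report = run_one_wrk_trial(
    trial_length="10s",
    num_connections=100,
    http_host="127.0.0.1",
    http_port="8000",
)
print(report)  # wrk's latency and throughput summary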
Code example #4
    def _process_update(self, updates: Dict[str, UpdatedObject]):
        if isinstance(updates, (ray.exceptions.RayActorError)):
            # This can happen during shutdown where the controller is
            # intentionally killed, the client should just gracefully
            # exit.
            logger.debug("LongPollClient failed to connect to host. "
                         "Shutting down.")
            return

        if isinstance(updates, (ray.exceptions.RayTaskError)):
            # Some error happened in the controller. It could be a bug or some
            # undesired state.
            logger.error("LongPollHost errored\n" + updates.traceback_str)
            self._poll_next()
            return

        logger.debug(f"LongPollClient {self} received updates for keys: "
                     f"{list(updates.keys())}.")
        for key, update in updates.items():
            self.object_snapshots[key] = update.object_snapshot
            self.snapshot_ids[key] = update.snapshot_id
            callback = self.key_listeners[key]

            # Bind the parameters because closures are late-binding.
            # https://docs.python-guide.org/writing/gotchas/#late-binding-closures # noqa: E501
            def chained(callback=callback, arg=update.object_snapshot):
                callback(arg)
                self._on_callback_completed(trigger_at=len(updates))

            if self.event_loop is None:
                chained()
            else:
                self.event_loop.call_soon_threadsafe(chained)
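
A self-contained sketch of the late-binding gotcha the comment above guards against: without default-argument binding, every deferred closure sees the loop variable's final value.

callbacks_bad = [lambda: print(i) for i in range(3)]
callbacks_good = [lambda i=i: print(i) for i in range(3)]

for cb in callbacks_bad:
    cb()  # prints 2, 2, 2 -- all closures share the final value of i
for cb in callbacks_good:
    cb()  # prints 0, 1, 2 -- defaults are evaluated at definition time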
Code example #5
File: long_poll.py  Project: miqdigital/ray
    def _schedule_to_event_loop(self, callback):
        # Schedule the next iteration only if the loop is running.
        # The event loop might not be running if users used a cached
        # version across loops.
        if self.event_loop.is_running():
            self.event_loop.call_soon_threadsafe(callback)
        else:
            logger.error("The event loop is closed, shutting down long poll "
                         "client.")
            self.is_running = False
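
One failure mode this guard avoids, shown in a small self-contained sketch: scheduling a callback onto a loop that has already been closed raises RuntimeError instead of silently dropping the work.

import asyncio

loop = asyncio.new_event_loop()
loop.close()
try:
    loop.call_soon_threadsafe(lambda: None)
except RuntimeError as e:
    print(f"Scheduling on a closed loop fails: {e}")  # Event loop is closed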
Code example #6
    def _process_update(self, updates: Dict[str, UpdatedObject]):
        if isinstance(updates, (ray.exceptions.RayActorError)):
            # This can happen during shutdown where the controller is
            # intentionally killed, the client should just gracefully
            # exit.
            logger.debug("LongPollClient failed to connect to host. "
                         "Shutting down.")
            self.is_running = False
            return

        if isinstance(updates, ConnectionError):
            logger.warning("LongPollClient connection failed, shutting down.")
            self.is_running = False
            return

        if isinstance(updates, (ray.exceptions.RayTaskError)):
            if isinstance(updates.as_instanceof_cause(),
                          (asyncio.TimeoutError)):
                logger.debug("LongPollClient polling timed out. Retrying.")
            else:
                # Some error happened in the controller. It could be a bug or
                # some undesired state.
                logger.error("LongPollHost errored\n" + updates.traceback_str)
            self._poll_next()
            return

        logger.debug(f"LongPollClient {self} received updates for keys: "
                     f"{list(updates.keys())}.")
        for key, update in updates.items():
            self.object_snapshots[key] = update.object_snapshot
            self.snapshot_ids[key] = update.snapshot_id
            callback = self.key_listeners[key]

            # Bind the parameters because closures are late-binding.
            # https://docs.python-guide.org/writing/gotchas/#late-binding-closures # noqa: E501
            def chained(callback=callback, arg=update.object_snapshot):
                callback(arg)
                self._on_callback_completed(trigger_at=len(updates))

            if self.event_loop is None:
                chained()
            else:
                # Schedule the next iteration only if the loop is running.
                # The event loop might not be running if users used a cached
                # version across loops.
                if self.event_loop.is_running():
                    self.event_loop.call_soon_threadsafe(chained)
                else:
                    logger.error(
                        "The event loop is closed, shutting down long poll "
                        "client.")
                    self.is_running = False
Code example #7
File: controller.py  Project: weileze/ray
    async def run_control_loop(self) -> None:
        while True:
            async with self.write_lock:
                try:
                    self.http_state.update()
                except Exception as e:
                    logger.error(f"Exception updating HTTP state: {e}")
                try:
                    self.backend_state.update()
                except Exception as e:
                    logger.error(f"Exception updating backend state: {e}")

            await asyncio.sleep(CONTROL_LOOP_PERIOD_S)
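
A minimal, self-contained sketch of the same pattern with Serve's state objects stubbed out: each subsystem update gets its own try/except so a failure in one neither kills the loop nor starves the others. The names here are illustrative, not Serve's API.

import asyncio
import logging

logger = logging.getLogger(__name__)
CONTROL_LOOP_PERIOD_S = 0.1  # assumption; Serve defines its own constant


async def control_loop(subsystems):
    """subsystems: iterable of (name, update_fn) pairs polled every period."""
    while True:
        for name, update in subsystems:
            try:
                update()
            except Exception as e:
                # Log and keep going: a bug in one subsystem must not
                # prevent the remaining subsystems from being updated.
                logger.error(f"Exception updating {name} state: {e}")
        await asyncio.sleep(CONTROL_LOOP_PERIOD_S)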
Code example #8
File: kv_store.py  Project: stjordanis/ray
    def delete(self, key: str):
        """Delete the value associated with the given key from the store.

        Args:
            key (str): The key to delete.
        """

        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))

        try:
            self._s3.delete_object(Bucket=self._bucket,
                                   Key=self.get_storage_key(key))
        except ClientError as e:
            message = e.response["Error"]["Message"]
            logger.error(f"Encountered ClientError while calling delete() "
                         f"in RayExternalKVStore: {message}")
            raise e
Code example #9
File: ray_gcs_kv_store.py  Project: stjordanis/ray
    def delete(self, key: str):
        """Delete the value associated with the given key from the store.

        Args:
            key (str): The key to delete.
        """

        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))

        try:
            blob_name = self.get_storage_key(key)
            blob = self._bucket.blob(blob_name=blob_name)
            blob.delete()
        except NotFound:
            logger.error(f"Encountered ClientError while calling delete() "
                         f"in RayExternalKVStore - "
                         f"Blob {blob_name} was not found!")
Code example #10
File: long_poll.py  Project: miqdigital/ray
    def _process_update(self, updates: Dict[str, UpdatedObject]):
        if isinstance(updates, (ray.exceptions.RayActorError)):
            # This can happen during shutdown where the controller is
            # intentionally killed, the client should just gracefully
            # exit.
            logger.debug("LongPollClient failed to connect to host. Shutting down.")
            self.is_running = False
            return

        if isinstance(updates, ConnectionError):
            logger.warning("LongPollClient connection failed, shutting down.")
            self.is_running = False
            return

        if isinstance(updates, (ray.exceptions.RayTaskError)):
            if isinstance(updates.as_instanceof_cause(), (asyncio.TimeoutError)):
                logger.debug("LongPollClient polling timed out. Retrying.")
            else:
                # Some error happened in the controller. It could be a bug or
                # some undesired state.
                logger.error("LongPollHost errored\n" + updates.traceback_str)
            # We must call this in event loop so it works in Ray Client.
            # See https://github.com/ray-project/ray/issues/20971
            self._schedule_to_event_loop(self._poll_next)
            return

        logger.debug(
            f"LongPollClient {self} received updates for keys: "
            f"{list(updates.keys())}."
        )
        for key, update in updates.items():
            self.object_snapshots[key] = update.object_snapshot
            self.snapshot_ids[key] = update.snapshot_id
            callback = self.key_listeners[key]

            # Bind the parameters because closures are late-binding.
            # https://docs.python-guide.org/writing/gotchas/#late-binding-closures # noqa: E501
            def chained(callback=callback, arg=update.object_snapshot):
                callback(arg)
                self._on_callback_completed(trigger_at=len(updates))

            self._schedule_to_event_loop(chained)
Code example #11
    def put(self, key: str, val: bytes) -> bool:
        """Put the key-value pair into the store.

        Args:
            key (str): The key to write.
            val (bytes): The value to store.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))
        if not isinstance(val, bytes):
            raise TypeError("val must be bytes, got: {}.".format(type(val)))

        try:
            self._s3.put_object(
                Body=val, Bucket=self._bucket, Key=self.get_storage_key(key))
        except ClientError as e:
            message = e.response["Error"]["Message"]
            logger.error(f"Encountered ClientError while calling put() "
                         f"in RayExternalKVStore: {message}")
            raise e
Code example #12
File: ray_gcs_kv_store.py  Project: stjordanis/ray
    def put(self, key: str, val: bytes) -> bool:
        """Put the key-value pair into the store.

        Args:
            key (str): The key to write.
            val (bytes): The value to store.
        """
        if not isinstance(key, str):
            raise TypeError("key must be a string, got: {}.".format(type(key)))
        if not isinstance(val, bytes):
            raise TypeError("val must be bytes, got: {}.".format(type(val)))

        try:
            blob = self._bucket.blob(blob_name=self.get_storage_key(key))
            f = io.BytesIO(val)
            blob.upload_from_file(f, num_retries=5)
        except Exception as e:
            message = str(e)
            logger.error(f"Encountered ClientError while calling put() "
                         f"in RayExternalKVStore: {message}")
            raise e
Code example #13
File: backend_state.py  Project: running-mars/ray
    def _scale_backend_replicas(
        self,
        backend_tag: BackendTag,
        num_replicas: int,
    ) -> bool:
        """Scale the given backend to the number of replicas.

        NOTE: this does not actually start or stop the replicas, but instead
        adds them to ReplicaState.SHOULD_START or ReplicaState.SHOULD_STOP.
        The caller is responsible for then first writing a checkpoint and then
        actually starting/stopping the intended replicas. This avoids
        inconsistencies with starting/stopping a replica and then crashing
        before writing a checkpoint.
        """
        logger.debug("Scaling backend '{}' to {} replicas".format(
            backend_tag, num_replicas))
        assert (backend_tag in self._backend_metadata
                ), "Backend {} is not registered.".format(backend_tag)
        assert num_replicas >= 0, ("Number of replicas must be"
                                   " greater than or equal to 0.")

        current_num_replicas = sum([
            len(self._replicas[backend_tag][ReplicaState.SHOULD_START]),
            len(self._replicas[backend_tag][ReplicaState.STARTING]),
            len(self._replicas[backend_tag][ReplicaState.RUNNING]),
        ])

        delta_num_replicas = num_replicas - current_num_replicas

        backend_info: BackendInfo = self._backend_metadata[backend_tag]
        if delta_num_replicas == 0:
            return False

        elif delta_num_replicas > 0:
            can_schedule = try_schedule_resources_on_nodes(requirements=[
                backend_info.replica_config.resource_dict
                for _ in range(delta_num_replicas)
            ])

            if _RESOURCE_CHECK_ENABLED and not all(can_schedule):
                num_possible = sum(can_schedule)
                logger.error(
                    "Cannot scale backend {} to {} replicas. Ray Serve tried "
                    "to add {} replicas but the available resources only "
                    "allow {} to be added. This is not a problem if the "
                    "cluster is autoscaling. To fix this, consider scaling "
                    "the backend to {} replicas or adding more resources to "
                    "the cluster. You can check available resources with "
                    "ray.nodes().".format(
                        backend_tag, num_replicas, delta_num_replicas,
                        num_possible, current_num_replicas + num_possible))

            logger.debug("Adding {} replicas to backend {}".format(
                delta_num_replicas, backend_tag))
            for _ in range(delta_num_replicas):
                replica_tag = "{}#{}".format(backend_tag, get_random_letters())
                self._replicas[backend_tag][ReplicaState.SHOULD_START].append(
                    BackendReplica(self._controller_name, self._detached,
                                   replica_tag, backend_tag))

        elif delta_num_replicas < 0:
            logger.debug("Removing {} replicas from backend '{}'".format(
                -delta_num_replicas, backend_tag))
            assert self._target_replicas[backend_tag] >= delta_num_replicas

            for _ in range(-delta_num_replicas):
                replica_state_dict = self._replicas[backend_tag]
                list_to_use = replica_state_dict[ReplicaState.SHOULD_START] \
                    or replica_state_dict[ReplicaState.STARTING] \
                    or replica_state_dict[ReplicaState.RUNNING]

                assert len(list_to_use), replica_state_dict
                replica_to_stop = list_to_use.pop()

                graceful_timeout_s = (backend_info.backend_config.
                                      experimental_graceful_shutdown_timeout_s)

                replica_to_stop.set_should_stop(graceful_timeout_s)
                self._replicas[backend_tag][ReplicaState.SHOULD_STOP].append(
                    replica_to_stop)

        return True
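
The heart of the method is plain arithmetic on replica counts; a minimal sketch, with Serve's replica state machinery stubbed out, of how the delta chooses between adding replicas, removing them, and doing nothing:

def replica_delta(target: int, should_start: int, starting: int,
                  running: int) -> int:
    """Positive: replicas to add; negative: replicas to remove; zero: no-op."""
    assert target >= 0, "Number of replicas must be greater than or equal to 0."
    current = should_start + starting + running
    return target - current


# With 1 replica pending, 0 starting, and 2 running, scaling to 5 adds 2
# replicas; scaling to 1 removes 2.
print(replica_delta(5, 1, 0, 2))  # 2
print(replica_delta(1, 1, 0, 2))  # -2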