Example #1
 def actor_handle(self) -> ActorHandle:
     return ray.get_actor(self._actor_name)
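For context, a hedged sketch of the wrapper-class pattern this property implies; the `NamedActorRef` class is hypothetical, and only `ray.get_actor` and the `ActorHandle` return type come from the snippet:

import ray
from ray.actor import ActorHandle

class NamedActorRef:
    """Hypothetical wrapper that resolves a named actor lazily."""

    def __init__(self, actor_name: str):
        self._actor_name = actor_name

    @property
    def actor_handle(self) -> ActorHandle:
        # Re-resolve the name on each access so the handle stays valid
        # even if the actor is restarted under the same name.
        return ray.get_actor(self._actor_name)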
Example #2
def test_whether_worker_leaked_when_task_finished_with_errors(
        ray_start_regular):

    driver_template = """
import ray
import os
import numpy as np
import time

ray.init(address="{address}", namespace="test")

# A utility actor to store the pid across jobs.
@ray.remote
class PidStoreActor:
    def __init__(self):
        self._pid = None

    def put(self, pid):
        self._pid = pid
        return True

    def get(self):
        return self._pid

def _store_pid_helper():
    try:
        pid_store_actor = ray.get_actor("pid-store", "test")
    except Exception:
        pid_store_actor = PidStoreActor.options(
            name="pid-store", lifetime="detached").remote()
    assert ray.get(pid_store_actor.put.remote(os.getpid()))

@ray.remote
def normal_task(large1, large2):
    # Record the pid of this normal task.
    _store_pid_helper()
    time.sleep(60 * 60)
    return "normaltask"

large = ray.put(np.zeros(100 * 2**10, dtype=np.int8))
obj = normal_task.remote(large, large)
print(ray.get(obj))
"""
    driver_script = driver_template.format(
        address=ray_start_regular["address"])
    driver_proc = run_string_as_driver_nonblocking(driver_script)
    try:
        driver_proc.wait(10)
    except Exception:
        pass

    def get_normal_task_pid():
        try:
            pid_store_actor = ray.get_actor("pid-store", "test")
            return ray.get(pid_store_actor.get.remote())
        except Exception:
            return None

    wait_for_condition(lambda: get_normal_task_pid() is not None, 10)
    pid_store_actor = ray.get_actor("pid-store", "test")
    normal_task_pid = ray.get(pid_store_actor.get.remote())
    assert normal_task_pid is not None
    normal_task_proc = psutil.Process(normal_task_pid)
    print("killing normal task process, pid =", normal_task_pid)
    normal_task_proc.send_signal(signal.SIGTERM)

    def normal_task_was_reconstructed():
        curr_pid = get_normal_task_pid()
        return curr_pid is not None and curr_pid != normal_task_pid

    wait_for_condition(lambda: normal_task_was_reconstructed(), 10)
    driver_proc.send_signal(signal.SIGTERM)
    # Make sure the raylet has cleaned up the leaked worker after the
    # driver exits.
    wait_for_condition(lambda: not psutil.pid_exists(normal_task_pid), 10)
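This test leans on two helpers that aren't shown: run_string_as_driver_nonblocking launches the script as a separate driver process, and wait_for_condition polls a predicate. A minimal sketch of the latter, under the assumption that it mirrors Ray's test utility of the same name:

import time

def wait_for_condition(condition_predictor, timeout=10, retry_interval_ms=100):
    # Poll the predicate until it returns True or the timeout expires.
    start = time.time()
    while time.time() - start <= timeout:
        if condition_predictor():
            return
        time.sleep(retry_interval_ms / 1000.0)
    raise RuntimeError("The condition wasn't met before the timeout expired.")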
Example #3
def ray_dask_get(dsk, keys, **kwargs):
    """
    A Dask-Ray scheduler. This scheduler will send top-level (non-inlined) Dask
    tasks to a Ray cluster for execution. The scheduler will wait for the
    tasks to finish executing, fetch the results, and repackage them into the
    appropriate Dask collections. This particular scheduler uses a threadpool
    to submit Ray tasks.

    This can be passed directly to `dask.compute()`, as the scheduler:

    >>> dask.compute(obj, scheduler=ray_dask_get)

    You can override the currently active global Dask-Ray callbacks (e.g.
    supplied via a context manager), the number of threads to use when
    submitting the Ray tasks, or the threadpool used to submit Ray tasks:

    >>> dask.compute(
    ...     obj,
    ...     scheduler=ray_dask_get,
    ...     ray_callbacks=some_ray_dask_callbacks,
    ...     num_workers=8,
    ...     pool=some_cool_pool,
    ... )

    Args:
        dsk (Dict): Dask graph, represented as a task DAG dictionary.
        keys (List[str]): List of Dask graph keys whose values we wish to
            compute and return.
        ray_callbacks (Optional[list[callable]]): Dask-Ray callbacks.
        num_workers (Optional[int]): The number of worker threads to use in
            the Ray task submission traversal of the Dask graph.
        pool (Optional[ThreadPool]): A multiprocessing threadpool to use to
            submit Ray tasks.

    Returns:
        Computed values corresponding to the provided keys.
    """
    num_workers = kwargs.pop("num_workers", None)
    pool = kwargs.pop("pool", None)
    # We attempt to reuse any other thread pools that have been created within
    # this thread and with the given number of workers. We reuse a global
    # thread pool if num_workers is not given and we're in the main thread.
    global default_pool
    thread = threading.current_thread()
    if pool is None:
        with pools_lock:
            if num_workers is None and thread is main_thread:
                if default_pool is None:
                    default_pool = ThreadPool(CPU_COUNT)
                    atexit.register(default_pool.close)
                pool = default_pool
            elif thread in pools and num_workers in pools[thread]:
                pool = pools[thread][num_workers]
            else:
                pool = ThreadPool(num_workers)
                atexit.register(pool.close)
                pools[thread][num_workers] = pool

    ray_callbacks = kwargs.pop("ray_callbacks", None)
    persist = kwargs.pop("ray_persist", False)
    enable_progress_bar = kwargs.pop("_ray_enable_progress_bar", None)

    with local_ray_callbacks(ray_callbacks) as ray_callbacks:
        # Unpack the Ray-specific callbacks.
        (
            ray_presubmit_cbs,
            ray_postsubmit_cbs,
            ray_pretask_cbs,
            ray_posttask_cbs,
            ray_postsubmit_all_cbs,
            ray_finish_cbs,
        ) = unpack_ray_callbacks(ray_callbacks)
        # NOTE: We hijack Dask's `get_async` function, injecting a different
        # task executor.
        object_refs = get_async(
            _apply_async_wrapper(
                pool.apply_async,
                _rayify_task_wrapper,
                ray_presubmit_cbs,
                ray_postsubmit_cbs,
                ray_pretask_cbs,
                ray_posttask_cbs,
            ),
            len(pool._pool),
            dsk,
            keys,
            get_id=_thread_get_id,
            pack_exception=pack_exception,
            **kwargs,
        )
        if ray_postsubmit_all_cbs is not None:
            for cb in ray_postsubmit_all_cbs:
                cb(object_refs, dsk)
        # NOTE: We explicitly delete the Dask graph here so object references
        # are garbage-collected before this function returns, i.e. before all
        # Ray tasks are done. Otherwise, no intermediate objects will be
        # cleaned up until all Ray tasks are done.
        del dsk
        if persist:
            result = object_refs
        else:
            pb_actor = None
            if enable_progress_bar:
                pb_actor = ray.get_actor("_dask_on_ray_pb")
            result = ray_get_unpack(object_refs, progress_bar_actor=pb_actor)
        if ray_finish_cbs is not None:
            for cb in ray_finish_cbs:
                cb(result)

    # Clean up pools associated with dead threads.
    with pools_lock:
        active_threads = set(threading.enumerate())
        if thread is not main_thread:
            for t in list(pools):
                if t not in active_threads:
                    for p in pools.pop(t).values():
                        p.close()
    return result
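As a hedged end-to-end sketch of how this scheduler is driven (assuming a running Ray cluster and that dask.array is installed; ray_dask_get is the function above):

import dask
import dask.array as da
import ray

ray.init()
# Build a lazy Dask graph, then execute it on Ray via the scheduler above.
x = da.ones((1000, 1000), chunks=(100, 100))
(total,) = dask.compute(x.sum(), scheduler=ray_dask_get)
print(total)  # 1000000.0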
Example #4
def test_get_non_existing_named_actor(ray_start_regular_shared):
    with pytest.raises(ValueError):
        _ = ray.get_actor("non_existing_actor")
Example #5
def get_management_actor() -> "ActorHandle":
    return ray.get_actor(common.MANAGEMENT_ACTOR_NAME,
                         namespace=common.MANAGEMENT_ACTOR_NAMESPACE)
Example #6
def test_detached_actor(ray_start_regular):
    @ray.remote
    class DetachedActor:
        def ping(self):
            return "pong"

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name=1)

    with pytest.raises(ValueError, match="Actor name cannot be an empty string"):
        DetachedActor._remote(lifetime="detached", name="")

    with pytest.raises(ValueError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace="")

    with pytest.raises(TypeError):
        DetachedActor._remote(lifetime="detached", name="hi", namespace=2)

    d = DetachedActor._remote(lifetime="detached", name="d_actor")
    assert ray.get(d.ping.remote()) == "pong"

    with pytest.raises(ValueError, match="Please use a different name"):
        DetachedActor._remote(lifetime="detached", name="d_actor")

    address = ray_start_regular["address"]

    get_actor_name = "d_actor"
    create_actor_name = "DetachedActor"
    driver_script = """
import ray
ray.init(address="{}", namespace="default_test_namespace")

name = "{}"
assert ray.util.list_named_actors() == [name]
existing_actor = ray.get_actor(name)
assert ray.get(existing_actor.ping.remote()) == "pong"

@ray.remote
def foo():
    return "bar"

@ray.remote
class NonDetachedActor:
    def foo(self):
        return "bar"

@ray.remote
class DetachedActor:
    def ping(self):
        return "pong"

    def foobar(self):
        actor = NonDetachedActor.remote()
        return ray.get([foo.remote(), actor.foo.remote()])

actor = DetachedActor._remote(lifetime="detached", name="{}")
ray.get(actor.ping.remote())
""".format(
        address, get_actor_name, create_actor_name
    )

    run_string_as_driver(driver_script)
    assert len(ray.util.list_named_actors()) == 2
    assert get_actor_name in ray.util.list_named_actors()
    assert create_actor_name in ray.util.list_named_actors()
    detached_actor = ray.get_actor(create_actor_name)
    assert ray.get(detached_actor.ping.remote()) == "pong"
    # Verify that a detached actor is able to create tasks/actors
    # even if the driver of the detached actor has exited.
    assert ray.get(detached_actor.foobar.remote()) == ["bar", "bar"]
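The embedded driver can see d_actor only because both drivers share the "default_test_namespace" namespace. A hedged sketch of how a third process would attach to the same detached actor (the address value is assumed):

import ray

# Detached actors are resolved by (name, namespace).
ray.init(address="auto", namespace="default_test_namespace")
handle = ray.get_actor("d_actor")
assert ray.get(handle.ping.remote()) == "pong"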
Example #7
def test_multiple_routers(ray_cluster):
    cluster = ray_cluster
    head_node = cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)

    ray.init(head_node.address)
    node_ids = ray.state.node_ids()
    assert len(node_ids) == 2
    serve.start(http_options=dict(port=8005, location="EveryNode"))

    def get_proxy_names():
        proxy_names = []
        for node_id, _ in get_all_node_ids():
            proxy_names.append(
                format_actor_name(SERVE_PROXY_NAME,
                                  serve.api._global_client._controller_name,
                                  node_id))
        return proxy_names

    wait_for_condition(lambda: len(get_proxy_names()) == 2)
    proxy_names = get_proxy_names()

    # Two actors should be started.
    def get_first_two_actors():
        try:
            ray.get_actor(proxy_names[0])
            ray.get_actor(proxy_names[1])
            return True
        except ValueError:
            return False

    wait_for_condition(get_first_two_actors)

    # Wait for the actors to come up.
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Kill one of the servers, the HTTP server should still function.
    ray.kill(ray.get_actor(get_proxy_names()[0]), no_restart=True)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))

    # Add a new node to the cluster. This should trigger a new router to get
    # started.
    new_node = cluster.add_node()

    wait_for_condition(lambda: len(get_proxy_names()) == 3)
    third_proxy = get_proxy_names()[2]

    def get_third_actor():
        try:
            ray.get_actor(third_proxy)
            return True
        # IndexError covers the case where cluster resources aren't updated yet.
        except (IndexError, ValueError):
            return False

    wait_for_condition(get_third_actor)

    # Remove the newly-added node from the cluster. The corresponding actor
    # should be removed as well.
    cluster.remove_node(new_node)

    def third_actor_removed():
        try:
            ray.get_actor(third_proxy)
            return False
        except ValueError:
            return True

    # Check that the actor is gone and the HTTP server still functions.
    wait_for_condition(third_actor_removed)
    ray.get(block_until_http_ready.remote("http://127.0.0.1:8005/-/routes"))
Example #8
def start(
        detached: bool = False,
        http_host: Optional[str] = DEFAULT_HTTP_HOST,
        http_port: int = DEFAULT_HTTP_PORT,
        http_middlewares: List[Any] = [],
        http_options: Optional[Union[dict, HTTPOptions]] = None,
) -> Client:
    """Initialize a serve instance.

    By default, the instance will be scoped to the lifetime of the returned
    Client object (or when the script exits). If detached is set to True, the
    instance will instead persist until client.shutdown() is called and clients
    to it can be connected using serve.connect(). This is only relevant if
    connecting to a long-running Ray cluster (e.g., with address="auto").

    Args:
        detached (bool): Whether or not the instance should be detached from
          this script.
        http_host (Optional[str]): Deprecated, use http_options instead.
        http_port (int): Deprecated, use http_options instead.
        http_middlewares (list): Deprecated, use http_options instead.
        http_options (Optional[Dict, serve.HTTPOptions]): Configuration options
          for HTTP proxy. You can pass in a dictionary or HTTPOptions object
          with fields:

            - host(str, None): Host for HTTP servers to listen on. Defaults to
              "127.0.0.1". To expose Serve publicly, you probably want to set
              this to "0.0.0.0".
            - port(int): Port for HTTP server. Defaults to 8000.
            - middlewares(list): A list of Starlette middlewares that will be
              applied to the HTTP servers in the cluster.
            - location(str, serve.config.DeploymentMode): The deployment
              location of HTTP servers:

                - "HeadOnly": start one HTTP server on the head node. Serve
                  assumes the head node is the node you executed serve.start
                  on. This is the default.
                - "EveryNode": start one HTTP server per node.
                - "NoServer" or None: disable HTTP server.
    """
    if ((http_host != DEFAULT_HTTP_HOST) or (http_port != DEFAULT_HTTP_PORT)
            or (len(http_middlewares) != 0)):
        if http_options is not None:
            raise ValueError(
                "You cannot specify both `http_options` and any of the "
                "`http_host`, `http_port`, and `http_middlewares` arguments. "
                "`http_options` is preferred.")
        else:
            warn(
                "`http_host`, `http_port`, `http_middlewares` are deprecated. "
                "Please use serve.start(http_options={'host': ..., "
                "'port': ..., 'middlewares': ...}) instead.",
                DeprecationWarning,
            )

    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    register_custom_serializers()

    # Try to get serve controller if it exists
    if detached:
        controller_name = SERVE_CONTROLLER_NAME
        try:
            ray.get_actor(controller_name)
            raise RayServeException("Called serve.start(detached=True) but a "
                                    "detached instance is already running. "
                                    "Please use serve.connect() to connect to "
                                    "the running instance instead.")
        except ValueError:
            pass
    else:
        controller_name = format_actor_name(SERVE_CONTROLLER_NAME,
                                            get_random_letters())

    if isinstance(http_options, dict):
        http_options = HTTPOptions.parse_obj(http_options)
    if http_options is None:
        http_options = HTTPOptions(
            host=http_host, port=http_port, middlewares=http_middlewares)

    controller = ServeController.options(
        name=controller_name,
        lifetime="detached" if detached else None,
        max_restarts=-1,
        max_task_retries=-1,
        # Pin Serve controller on the head node.
        resources={
            get_current_node_resource_key(): 0.01
        },
    ).remote(
        controller_name,
        http_options,
        detached=detached,
    )

    proxy_handles = ray.get(controller.get_http_proxies.remote())
    if len(proxy_handles) > 0:
        try:
            ray.get(
                [handle.ready.remote() for handle in proxy_handles.values()],
                timeout=HTTP_PROXY_TIMEOUT,
            )
        except ray.exceptions.GetTimeoutError:
            raise TimeoutError(
                f"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")

    client = Client(controller, controller_name, detached=detached)
    _set_global_client(client)
    return client
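A hedged usage sketch of the http_options path this signature steers callers toward (the option values here are illustrative):

from ray import serve

# Start a detached instance that listens publicly on every node.
client = serve.start(
    detached=True,
    http_options={"host": "0.0.0.0", "port": 8000, "location": "EveryNode"},
)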
Example #9
        async def handler(self, block: bool):
            if block:
                signal = ray.get_actor(signal_name)
                await signal.wait.remote()

            return f"1|{os.getpid()}"
Example #10
 async def v1(request):
     if request.query_params["block"] == "True":
         signal = ray.get_actor(signal_name)
         await signal.wait.remote()
     return f"1|{os.getpid()}"
Example #11
 def get_actor_count(self, name):
     actor = ray.get_actor(name)
     return ray.get(actor.inc_and_get.remote())
Example #12
def test_serializing_exceptions(ray_start_regular_shared):
    with ray_start_client_server() as ray:
        with pytest.raises(ValueError):
            ray.get_actor("abc")
Example #13
from seesaw.memory_cache import ReferenceCache
import ray
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="control the data cache. calling it again will restart the cache"
    )
    # parser.add_argument('--restart', type=int, action='store_true',  help='restart the cache')
    args = parser.parse_args()

    ray.init("auto", namespace="seesaw")

    actor_name = "actor#cache"
    try:
        oldh = ray.get_actor(actor_name)
        print("found old cache actor, destroying it")
        ray.kill(oldh)
    except ValueError:
        # No pre-existing actor to kill.
        pass

    print("starting new cache actor")
    h = (
        ray.remote(ReferenceCache)
        .options(name=actor_name, num_cpus=1, lifetime="detached")
        .remote()
    )
    r = h.ready.remote()
    ray.get(r)
    print("new cache actor ready")
Example #14
 def __init__(self):
     self.actor = ray.get_actor("a")
Example #15
 def do_run(name, concurrency=4):
     name = "actor_" + str(name)
     tasks = [getter.remote(name) for _ in range(concurrency)]
     result = ray.get(tasks)
     ray.kill(ray.get_actor(name))  # Cleanup
     return result
Example #16
 async def reconfigure(self, config):
     # Don't block when the replica is first created.
     if self.config is not None:
         signal = ray.get_actor(signal_name)
         ray.get(signal.wait.remote())
     self.config = config
Example #17
 def check_name_available(name):
     try:
         ray.get_actor(name)
         return False
     except ValueError:
         return True
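check_name_available is one half of the usual get-or-create idiom for named actors; a hedged sketch of the full pattern (the Counter actor is hypothetical):

import ray

@ray.remote
class Counter:  # Hypothetical actor, used only for illustration.
    def __init__(self):
        self._value = 0

    def inc_and_get(self):
        self._value += 1
        return self._value

def get_or_create(name):
    try:
        return ray.get_actor(name)
    except ValueError:
        # The name is free, so create a detached actor under it. Note that
        # this lookup-then-create sequence is not atomic: two concurrent
        # callers can race between the two steps.
        return Counter.options(name=name, lifetime="detached").remote()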
Example #18
def _record_step_status(step_id: "StepID", status: "WorkflowStatus") -> None:
    workflow_id = workflow_context.get_current_workflow_id()
    workflow_manager = ray.get_actor(MANAGEMENT_ACTOR_NAME)
    ray.get(
        workflow_manager.update_step_status.remote(workflow_id, step_id,
                                                   status))
Example #19
    async def _recover_from_checkpoint(self, checkpoint_bytes: bytes) -> None:
        """Recover the instance state from the provided checkpoint.

        Performs the following operations:
            1) Deserializes the internal state from the checkpoint.
            2) Pushes the latest configuration to the routers
               in case we crashed before updating them.
            3) Starts/stops any worker replicas that are pending creation or
               deletion.

        NOTE: this requires that self.write_lock is already acquired and will
        release it before returning.
        """
        assert self.write_lock.locked()

        start = time.time()
        logger.info("Recovering from checkpoint")

        # Load internal state from the checkpoint data.
        (
            self.routes,
            router_node_ids,
            self.backends,
            self.traffic_policies,
            self.replicas,
            self.replicas_to_start,
            self.replicas_to_stop,
            self.backends_to_remove,
            self.endpoints_to_remove,
        ) = pickle.loads(checkpoint_bytes)

        for node_id in router_node_ids:
            router_name = format_actor_name(SERVE_PROXY_NAME,
                                            self.controller_name, node_id)
            self.routers[node_id] = ray.get_actor(router_name)

        # Fetch actor handles for all of the backend replicas in the system.
        # All of these workers are guaranteed to already exist because they
        # would not be written to a checkpoint in self.workers until they
        # were created.
        for backend_tag, replica_tags in self.replicas.items():
            for replica_tag in replica_tags:
                replica_name = format_actor_name(replica_tag,
                                                 self.controller_name)
                self.workers[backend_tag][replica_tag] = ray.get_actor(
                    replica_name)

        # Push configuration state to the router.
        # TODO(edoakes): should we make this a pull-only model for simplicity?
        for endpoint, traffic_policy in self.traffic_policies.items():
            await asyncio.gather(*[
                router.set_traffic.remote(endpoint, traffic_policy)
                for router in self.routers.values()
            ])

        for backend_tag, replica_dict in self.workers.items():
            for replica_tag, worker in replica_dict.items():
                await asyncio.gather(*[
                    router.add_new_worker.remote(backend_tag, replica_tag,
                                                 worker)
                    for router in self.routers.values()
                ])

        for backend, info in self.backends.items():
            await asyncio.gather(*[
                router.set_backend_config.remote(backend, info.backend_config)
                for router in self.routers.values()
            ])
            await self.broadcast_backend_config(backend)
            metadata = info.backend_config.internal_metadata
            if metadata.autoscaling_config is not None:
                self.autoscaling_policies[backend] = BasicAutoscalingPolicy(
                    backend, metadata.autoscaling_config)

        # Push configuration state to the routers.
        await asyncio.gather(*[
            router.set_route_table.remote(self.routes)
            for router in self.routers.values()
        ])

        # Start/stop any pending backend replicas.
        await self._start_pending_replicas()
        await self._stop_pending_replicas()

        # Remove any pending backends and endpoints.
        await self._remove_pending_backends()
        await self._remove_pending_endpoints()

        logger.info("Recovered from checkpoint in {:.3f}s".format(time.time() -
                                                                  start))

        self.write_lock.release()
Example #20
def main(args=None, model=None) -> GenerativeQAModule:

    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser = GenerativeQAModule.add_model_specific_args(parser, os.getcwd())
    parser = GenerativeQAModule.add_retriever_specific_args(parser)

    args = args or parser.parse_args()

    Path(args.output_dir).mkdir(exist_ok=True)

    named_actors = []
    if args.distributed_retriever == "ray" and args.gpus > 1:
        if not is_ray_available():
            raise RuntimeError("Please install Ray to use the Ray " "distributed retriever.")
        # Connect to an existing Ray cluster.
        try:
            ray.init(address=args.ray_address)
        except (ConnectionError, ValueError):
            logger.warning(
                "Connection to Ray cluster failed. Make sure a Ray "
                "cluster is running by either using Ray's cluster "
                "launcher (`ray up`) or by manually starting Ray on "
                "each node via `ray start --head` for the head node "
                "and `ray start --address='<ip address>:6379'` for "
                "additional nodes. See "
                "https://docs.ray.io/en/master/cluster/index.html "
                "for more info."
            )
            raise

        # Create Ray actors only for rank 0.
        # NOTE: environment variables are strings, so compare against "0".
        if ("LOCAL_RANK" not in os.environ or os.environ["LOCAL_RANK"] == "0") and (
            "NODE_RANK" not in os.environ or os.environ["NODE_RANK"] == "0"
        ):
            remote_cls = ray.remote(RayRetriever)
            named_actors = [
                remote_cls.options(name="retrieval_worker_{}".format(i)).remote()
                for i in range(args.num_retrieval_workers)
            ]
        else:
            logger.info(
                "Getting named actors for NODE_RANK {}, LOCAL_RANK {}".format(
                    os.environ["NODE_RANK"], os.environ["LOCAL_RANK"]
                )
            )
            named_actors = [ray.get_actor("retrieval_worker_{}".format(i)) for i in range(args.num_retrieval_workers)]
    args.actor_handles = named_actors
    assert args.actor_handles == named_actors

    if model is None:
        model: GenerativeQAModule = GenerativeQAModule(args)

    dataset = Path(args.data_dir).name
    if (
        args.logger_name == "default"
        or args.fast_dev_run
        or str(args.output_dir).startswith("/tmp")
        or str(args.output_dir).startswith("/var")
    ):
        training_logger = True  # don't pollute wandb logs unnecessarily
    elif args.logger_name == "wandb":
        from pytorch_lightning.loggers import WandbLogger

        project = os.environ.get("WANDB_PROJECT", dataset)
        training_logger = WandbLogger(name=model.output_dir.name, project=project)

    elif args.logger_name == "wandb_shared":
        from pytorch_lightning.loggers import WandbLogger

        training_logger = WandbLogger(name=model.output_dir.name, project=f"hf_{dataset}")

    es_callback = (
        get_early_stopping_callback(model.val_metric, args.early_stopping_patience)
        if args.early_stopping_patience >= 0
        else False
    )

    trainer: pl.Trainer = generic_train(
        model,
        args,
        logging_callback=Seq2SeqLoggingCallback(),
        checkpoint_callback=get_checkpoint_callback(args.output_dir, model.val_metric),
        early_stopping_callback=es_callback,
        logger=training_logger,
        accelerator=CustomAccel() if args.gpus > 1 else None,
        profiler=pl.profiler.AdvancedProfiler() if args.profile else None,
    )
    pickle_save(model.hparams, model.output_dir / "hparams.pkl")

    if not args.do_predict:
        return model

    # test() without a model tests using the best checkpoint automatically
    trainer.test()
    return model
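Distilled from the block above, the rank-0-creates, everyone-else-attaches pattern for named actors looks roughly like this (Worker is a hypothetical stand-in for RayRetriever):

import os
import ray

@ray.remote
class Worker:
    def ping(self):
        return "ok"

def get_workers(num_workers):
    # Rank 0 creates the named workers; all other ranks attach by name.
    is_rank_zero = (os.environ.get("LOCAL_RANK", "0") == "0"
                    and os.environ.get("NODE_RANK", "0") == "0")
    if is_rank_zero:
        return [Worker.options(name="retrieval_worker_{}".format(i)).remote()
                for i in range(num_workers)]
    return [ray.get_actor("retrieval_worker_{}".format(i))
            for i in range(num_workers)]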
Example #21
 def third_actor_removed():
     try:
         ray.get_actor(third_proxy)
         return False
     except ValueError:
         return True
Example #22
    async def setup(self, name, controller_name):
        # Note: Several queues are used in the router.
        # - When a request comes in, it's placed inside its corresponding
        #   endpoint_queue.
        # - The endpoint_queue is dequeued during the flush operation, which
        #   moves the queries to the backend buffer_queue. Here we match a
        #   request for an endpoint to a backend given some policy.
        # - The worker_queue is used to collect idle actor handles. These
        #   handles are dequeued during the second stage of the flush
        #   operation, which assigns queries in the buffer_queue to actor
        #   handles.

        self.name = name

        # -- Queues -- #

        # endpoint_name -> request queue
        # We use FIFO (left to right) ordering. The new items should be added
        # using appendleft. Old items should be removed via pop().
        self.endpoint_queues: DefaultDict[str, Deque[Query]] = defaultdict(
            deque)
        # backend_name -> worker replica tag queue
        self.worker_queues: DefaultDict[str, Deque[str]] = defaultdict(deque)
        # backend_name -> worker payload queue
        self.backend_queues = defaultdict(deque)

        # -- Metadata -- #

        # endpoint_name -> traffic_policy
        self.traffic = dict()
        # backend_name -> backend_config
        self.backend_info = dict()
        # replica tag -> worker_handle
        self.replicas = dict()
        # backend_name -> replica_tag -> concurrent queries counter
        self.queries_counter = defaultdict(lambda: defaultdict(int))

        # -- Synchronization -- #

        # This lock guarantees that only one flush operation can happen at a
        # time. Without the lock, multiple flush operations could pop from the
        # same buffer_queue and worker_queue and deadlock: for example, one
        # operation holding the only query and the other holding the only
        # idle replica. Additionally, allowing only one flush operation at a
        # time simplifies the design overhead for custom queuing and batching
        # policies.
        self.flush_lock = asyncio.Lock()

        # -- State Restoration -- #
        # Fetch the worker handles, traffic policies, and backend configs from
        # the controller. We use a "pull-based" approach instead of pushing
        # them from the controller so that the router can transparently recover
        # from failure.
        self.controller = ray.get_actor(controller_name)

        traffic_policies = ray.get(
            self.controller.get_traffic_policies.remote())
        for endpoint, traffic_policy in traffic_policies.items():
            await self.set_traffic(endpoint, traffic_policy)

        backend_dict = ray.get(self.controller.get_all_worker_handles.remote())
        for backend_tag, replica_dict in backend_dict.items():
            for replica_tag, worker in replica_dict.items():
                await self.add_new_worker(backend_tag, replica_tag, worker)

        backend_configs = ray.get(self.controller.get_backend_configs.remote())
        for backend, backend_config in backend_configs.items():
            await self.set_backend_config(backend, backend_config)

        # -- Metrics Registration -- #
        self.num_router_requests = metrics.Count(
            "num_router_requests",
            "Number of requests processed by the router.", "requests",
            ["endpoint"])
        self.num_error_endpoint_requests = metrics.Count(
            "num_error_endpoint_requests",
            ("Number of requests that errored when getting results "
             "for the endpoint."), "requests", ["endpoint"])
        self.num_error_backend_requests = metrics.Count(
            "num_error_backend_requests",
            ("Number of requests that errored when getting result "
             "from the backend."), "requests", ["backend"])

        self.backend_queue_size = metrics.Gauge(
            "backend_queued_queries",
            "Current number of queries queued in the router for a backend",
            "requests", ["backend"])

        asyncio.get_event_loop().create_task(self.report_queue_lengths())
Example #23
 def actor_removed():
     try:
         ray.get_actor("hi")
         return False
     except ValueError:
         return True
Example #24
def get_from_ray(idx, redis_address, redis_password, idx_to_store_name):
    init_ray_if_not(redis_address, redis_password)
    local_store_handle = ray.get_actor(idx_to_store_name[idx])
    partition = ray.get(local_store_handle.get_partition.remote(idx))
    return partition
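init_ray_if_not isn't shown in the snippet; a hedged sketch of what such a helper presumably does (the _redis_password keyword matches the older Ray releases this code targets):

import ray

def init_ray_if_not(redis_address, redis_password):
    # Connect to the existing cluster at most once per process.
    if not ray.is_initialized():
        ray.init(address=redis_address, _redis_password=redis_password)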
Example #25
File: api.py Project: zjureel/ray
def start(detached: bool = False,
          http_host: str = DEFAULT_HTTP_HOST,
          http_port: int = DEFAULT_HTTP_PORT,
          http_middlewares: List[Any] = []) -> Client:
    """Initialize a serve instance.

    By default, the instance will be scoped to the lifetime of the returned
    Client object (or when the script exits). If detached is set to True, the
    instance will instead persist until client.shutdown() is called and clients
    to it can be connected using serve.connect(). This is only relevant if
    connecting to a long-running Ray cluster (e.g., with address="auto").

    Args:
        detached (bool): Whether or not the instance should be detached from
            this script.
        http_host (str): Host for HTTP servers to listen on. Defaults to
            "127.0.0.1". To expose Serve publicly, you probably want to set
            this to "0.0.0.0". One HTTP server will be started on each node in
            the Ray cluster. To not start HTTP servers, set this to None.
        http_port (int): Port for HTTP server. Defaults to 8000.
        http_middlewares (list): A list of Starlette middlewares that will be
            applied to the HTTP servers in the cluster.
    """
    # Initialize ray if needed.
    if not ray.is_initialized():
        ray.init()

    # Try to get serve controller if it exists
    if detached:
        controller_name = SERVE_CONTROLLER_NAME
        try:
            ray.get_actor(controller_name)
            raise RayServeException("Called serve.start(detached=True) but a "
                                    "detached instance is already running. "
                                    "Please use serve.connect() to connect to "
                                    "the running instance instead.")
        except ValueError:
            pass
    else:
        controller_name = format_actor_name(SERVE_CONTROLLER_NAME,
                                            get_random_letters())

    controller = ServeController.options(
        name=controller_name,
        lifetime="detached" if detached else None,
        max_restarts=-1,
        max_task_retries=-1,
    ).remote(controller_name,
             HTTPConfig(http_host, http_port, http_middlewares),
             detached=detached)

    if http_host is not None:
        futures = []
        for node_id in ray.state.node_ids():
            future = block_until_http_ready.options(
                num_cpus=0, resources={
                    node_id: 0.01
                }).remote("http://{}:{}/-/routes".format(http_host, http_port),
                          timeout=HTTP_PROXY_TIMEOUT)
            futures.append(future)
        try:
            ray.get(futures)
        except ray.exceptions.RayTaskError:
            raise TimeoutError(
                f"HTTP proxies not available after {HTTP_PROXY_TIMEOUT}s.")

    return Client(controller, controller_name, detached=detached)
Example #26
def test_serializing_exceptions(ray_start_regular_shared):
    with ray_start_client_server() as ray:
        with pytest.raises(ValueError,
                           match="Failed to look up actor with name 'abc'"):
            ray.get_actor("abc")
Example #27
 def get_normal_task_pid():
     try:
         pid_store_actor = ray.get_actor("pid-store", "test")
         return ray.get(pid_store_actor.get.remote())
     except Exception:
         return None
Example #28
def test_get_actor_no_input(ray_start_regular_shared):
    for bad_name in [None, "", "    "]:
        with pytest.raises(ValueError):
            ray.get_actor(bad_name)
Example #29

def process_incremental(sum, result):
    time.sleep(1)  # Replace this with some processing code.
    return sum + result


start_pipeline = timer()

start = timer()
'''Register Actors if not registered already'''
flow1_actors = {}
actor_names = ['flow1_actor1', 'flow1_actor2', 'flow1_actor3']
for actor_name in actor_names:
    try:
        flow1_actors[actor_name] = ray.get_actor(actor_name)
        print('Actor already registered: {}'.format(actor_name))
    except ValueError:
        flow1_actors[actor_name] = Pipeline.options(
            name=actor_name, lifetime="detached").remote()

print("duration =", timer() - start, " seconds for registering actors")
'''
for actor_name in actor_names:
    flow1_actors[actor_name] = ray.get_actor(actor_name)
'''

df = readtextfile.ReadTextFile(
    ipfile='/tmp/data/5m_Sales_Records.csv',
    ipschemafile=
Example #30
 def force_stop(self):
     """Force the actor to exit without shutting down gracefully."""
     try:
         ray.kill(ray.get_actor(self._actor_name))
     except ValueError:
         pass