Code example #1
File: internal_api.py  Project: fcardoso75/ray
def node_stats(node_manager_address=None,
               node_manager_port=None,
               include_memory_info=True):
    """Returns NodeStats object describing memory usage in the cluster."""

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    assert (node_manager_address is not None and node_manager_port is not None)
    raylet_address = "{}:{}".format(node_manager_address, node_manager_port)
    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    node_stats = stub.GetNodeStats(
        node_manager_pb2.GetNodeStatsRequest(
            include_memory_info=include_memory_info),
        timeout=30.0)
    return node_stats
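As a usage note (not part of the original file): node_stats requires both address arguments and a module-level MAX_MESSAGE_LENGTH constant. A minimal sketch of how it might be called against the local Raylet, using the ray.nodes() lookup seen in the later examples:

import ray

ray.init()  # assumes a local Ray instance is available
raylet = ray.nodes()[0]
stats = node_stats(
    node_manager_address=raylet["NodeManagerAddress"],
    node_manager_port=raylet["NodeManagerPort"])
print(stats)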
Code example #2
def memory_summary(node_manager_address=None,
                   node_manager_port=None,
                   stats_only=False):
    """Returns a formatted string describing memory usage in the cluster."""

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info, that Raylet internally
    # asks all nodes in the cluster for memory stats.
    if (node_manager_address is None or node_manager_port is None):
        raylet = ray.nodes()[0]
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)
    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
    if stats_only:
        return store_stats_summary(reply)
    return reply.memory_summary + "\n" + store_stats_summary(reply, stats_only)
Code example #3
File: conftest.py  Project: RuofanKong/ray
def kill_raylet(ip, port, graceful=False):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    print(f"Sending a shutdown request to {ip}:{port}")
    stub.ShutdownRaylet(
        node_manager_pb2.ShutdownRayletRequest(graceful=graceful))
Code example #4
File: internal_api.py  Project: fcardoso75/ray
def get_store_stats(state, node_manager_address=None, node_manager_port=None):
    """Returns a formatted string describing memory usage in the cluster."""

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info, that Raylet internally
    # asks all nodes in the cluster for memory stats.
    if (node_manager_address is None or node_manager_port is None):
        # We should ask for a raylet that is alive.
        raylet = None
        for node in state.node_table():
            if node["Alive"]:
                raylet = node
                break
        assert raylet is not None, "Every raylet is dead"
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)
    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(
            include_memory_info=False),
        timeout=30.0)
    return store_stats_summary(reply)
Code example #5
File: internal_api.py  Project: tuyulers5/jav44
def memory_summary(node_manager_address=None,
                   node_manager_port=None,
                   stats_only=False):
    """Returns a formatted string describing memory usage in the cluster."""

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info, that Raylet internally
    # asks all nodes in the cluster for memory stats.
    if (node_manager_address is None or node_manager_port is None):
        raylet = ray.nodes()[0]
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
    else:
        raylet_address = "{}:{}".format(node_manager_address,
                                        node_manager_port)
    channel = grpc.insecure_channel(
        raylet_address,
        options=[
            ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
            ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
        ],
    )
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
    store_summary = "--- Aggregate object store stats across all nodes ---\n"
    store_summary += (
        "Plasma memory usage {} MiB, {} objects, {}% full\n".format(
            int(reply.store_stats.object_store_bytes_used / (1024 * 1024)),
            reply.store_stats.num_local_objects,
            round(
                100 * reply.store_stats.object_store_bytes_used /
                reply.store_stats.object_store_bytes_avail, 2)))
    if reply.store_stats.spill_time_total_s > 0:
        store_summary += (
            "Spilled {} MiB, {} objects, avg write throughput {} MiB/s\n".
            format(
                int(reply.store_stats.spilled_bytes_total / (1024 * 1024)),
                reply.store_stats.spilled_objects_total,
                int(reply.store_stats.spilled_bytes_total / (1024 * 1024) /
                    reply.store_stats.spill_time_total_s)))
    if reply.store_stats.restore_time_total_s > 0:
        store_summary += (
            "Restored {} MiB, {} objects, avg read throughput {} MiB/s\n".
            format(
                int(reply.store_stats.restored_bytes_total / (1024 * 1024)),
                reply.store_stats.restored_objects_total,
                int(reply.store_stats.restored_bytes_total / (1024 * 1024) /
                    reply.store_stats.restore_time_total_s)))
    if reply.store_stats.consumed_bytes > 0:
        store_summary += ("Objects consumed by Ray tasks: {} MiB.".format(
            int(reply.store_stats.consumed_bytes / (1024 * 1024))))
    if stats_only:
        return store_summary
    return reply.memory_summary + "\n" + store_summary
Code example #6
File: test_utils.py  Project: parasj/ray
def _kill_raylet(self, ip, port, graceful=False):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    try:
        stub.ShutdownRaylet(
            node_manager_pb2.ShutdownRayletRequest(graceful=graceful))
    except _InactiveRpcError:
        assert not graceful
Code example #7
File: dashboard.py  Project: sytelus/ray
    def update_nodes(self):
        with self.nodes_lock:
            self.nodes = ray.nodes()
            self.stubs = []

            for node in self.nodes:
                channel = grpc.insecure_channel("{}:{}".format(
                    node["NodeManagerAddress"], node["NodeManagerPort"]))
                stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
                self.stubs.append(stub)
Code example #8
def kill_raylet(ip, port, graceful=True):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    print(f"Sending a shutdown request to {ip}:{port}")
    try:
        stub.ShutdownRaylet(
            node_manager_pb2.ShutdownRayletRequest(graceful=graceful))
    except _InactiveRpcError:
        assert not graceful
Code example #9
File: stats_collector_head.py  Project: zachkeer/ray
async def _update_stubs(self, change):
    if change.old:
        ip, port = change.old
        self._stubs.pop(ip)
    if change.new:
        ip, node_info = change.new
        address = "{}:{}".format(ip, int(node_info["nodeManagerPort"]))
        channel = aiogrpc.insecure_channel(address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        self._stubs[ip] = stub
Code example #10
File: test_multi_tenancy.py  Project: wangziyuruc/ray
def get_num_workers():
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return len([
        worker for worker in stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest()).workers_stats
        if not worker.is_driver
    ])
Code example #11
File: test_multi_tenancy.py  Project: holdenk/ray
def get_workers():
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return [
        worker for worker in stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest()).core_workers_stats
        if worker.worker_type != common_pb2.DRIVER
    ]
Code example #12
File: stats_collector_head.py  Project: yncxcw/ray
async def _update_stubs(self, change):
    if change.old:
        node_id, node_info = change.old
        self._stubs.pop(node_id)
    if change.new:
        node_id, node_info = change.new
        address = "{}:{}".format(node_info["nodeManagerAddress"],
                                 int(node_info["nodeManagerPort"]))
        options = (("grpc.enable_http_proxy", 0), )
        channel = aiogrpc.insecure_channel(address, options=options)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        self._stubs[node_id] = stub
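Since the channel above is created with aiogrpc, the stub's RPC methods return awaitables. A rough sketch of how such a stub might be used from another coroutine (the method name _get_node_stats and the 2.0 second timeout are illustrative assumptions, not part of the original file):

async def _get_node_stats(self, node_id):
    # Assumes the stub was registered by _update_stubs above.
    stub = self._stubs[node_id]
    return await stub.GetNodeStats(
        node_manager_pb2.GetNodeStatsRequest(), timeout=2.0)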
Code example #13
File: actor_head.py  Project: stefanbschneider/ray
async def _update_stubs(self, change):
    if change.old:
        node_id, node_info = change.old
        self._stubs.pop(node_id)
    if change.new:
        # TODO(fyrestone): Handle exceptions.
        node_id, node_info = change.new
        address = "{}:{}".format(node_info["nodeManagerAddress"],
                                 int(node_info["nodeManagerPort"]))
        options = (("grpc.enable_http_proxy", 0), )
        channel = ray._private.utils.init_grpc_channel(
            address, options, asynchronous=True)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        self._stubs[node_id] = stub
Code example #14
def _get_raylet_stub():
    global _STUB
    if _STUB is None:
        import grpc
        from ray.core.generated import node_manager_pb2_grpc
        raylet_address = _get_raylet_address()
        channel = grpc.insecure_channel(
            raylet_address,
            options=[
                ("grpc.max_send_message_length", MAX_MESSAGE_LENGTH),
                ("grpc.max_receive_message_length", MAX_MESSAGE_LENGTH),
            ],
        )
        _STUB = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    return _STUB
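The helper _get_raylet_address() and the global _STUB are not shown in this excerpt; caching the stub in a module-level variable avoids re-creating the gRPC channel on every call. A plausible, purely hypothetical definition of the helper, modeled on the ray.nodes()[0] pattern used throughout these examples:

def _get_raylet_address():
    # Hypothetical helper: return "<ip>:<port>" for the local Raylet.
    raylet = ray.nodes()[0]
    return "{}:{}".format(raylet["NodeManagerAddress"],
                          raylet["NodeManagerPort"])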
Code example #15
def test_initial_workers(shutdown_only):
    # `num_cpus` should be <=2 because a Travis CI machine only has 2 CPU cores
    ray.init(num_cpus=1,
             include_dashboard=True,
             _internal_config=json.dumps({"enable_multi_tenancy": True}))
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    wait_for_condition(lambda: len([
        worker for worker in stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest()).workers_stats
        if not worker.is_driver
    ]) == 1,
                       timeout=10)
Code example #16
def memory_summary():
    """Returns a formatted string describing memory usage in the cluster."""

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    # We can ask any Raylet for the global memory info.
    raylet = ray.nodes()[0]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.FormatGlobalMemoryInfo(
        node_manager_pb2.FormatGlobalMemoryInfoRequest(), timeout=30.0)
    return reply.memory_summary
Code example #17
def test_worker_stats(ray_start_regular):
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest())
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        print(reply)
        assert len(reply.workers_stats) >= num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "redis_latency" in views
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl): what is with travis/mi when running in Travis?
            assert ("python" in process or "ray" in process
                    or "travis/mi" in process)
        break
Code example #18
File: dashboard.py  Project: mloperator/ray
    def update_nodes(self):
        with self.nodes_lock:
            self.nodes = ray.nodes()
            node_ids = [node["NodeID"] for node in self.nodes]

            # First remove node connections of disconnected nodes.
            for node_id in self.stubs.keys():
                if node_id not in node_ids:
                    stub = self.stubs.pop(node_id)
                    stub.close()

            # Now add node connections of new nodes.
            for node in self.nodes:
                node_id = node["NodeID"]
                if node_id not in self.stubs:
                    channel = grpc.insecure_channel("{}:{}".format(
                        node["NodeManagerAddress"], node["NodeManagerPort"]))
                    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
                        channel)
                    self.stubs[node_id] = stub
Code example #19
def stat(address):
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    for raylet in ray.nodes():
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))

        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(node_manager_pb2.GetNodeStatsRequest(),
                                  timeout=2.0)
        print(reply)
Code example #20
def stat(address):
    """Get the current metrics protobuf from a Ray cluster (developer tool)."""
    if not address:
        address = services.find_redis_address_or_die()
    logger.info("Connecting to Ray instance at {}.".format(address))
    ray.init(address=address)

    import grpc
    from ray.core.generated import node_manager_pb2
    from ray.core.generated import node_manager_pb2_grpc

    for raylet in ray.nodes():
        raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                        raylet["NodeManagerPort"])
        logger.info("Querying raylet {}".format(raylet_address))

        channel = grpc.insecure_channel(raylet_address)
        stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
        reply = stub.GetNodeStats(
            node_manager_pb2.GetNodeStatsRequest(include_memory_info=False),
            timeout=2.0)
        print(reply)
Code example #21
    def _update_nodes(self):
        with self.nodes_lock:
            self.nodes = ray.nodes()
            node_ids = [node["NodeID"] for node in self.nodes]

            # First remove node connections of disconnected nodes.
            for node_id in self.stubs.keys():
                if node_id not in node_ids:
                    stub = self.stubs.pop(node_id)
                    stub.close()
                    reporter_stub = self.reporter_stubs.pop(node_id)
                    reporter_stub.close()

            # Now add node connections of new nodes.
            for node in self.nodes:
                node_id = node["NodeID"]
                if node_id not in self.stubs:
                    node_ip = node["NodeManagerAddress"]
                    channel = grpc.insecure_channel("{}:{}".format(
                        node_ip, node["NodeManagerPort"]))
                    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
                        channel)
                    self.stubs[node_id] = stub
                    # Block wait until the reporter for the node starts.
                    while True:
                        reporter_port = self.redis_client.get(
                            "REPORTER_PORT:{}".format(node_ip))
                        if reporter_port:
                            break
                    reporter_channel = grpc.insecure_channel("{}:{}".format(
                        node_ip, int(reporter_port)))
                    reporter_stub = reporter_pb2_grpc.ReporterServiceStub(
                        reporter_channel)
                    self.reporter_stubs[node_id] = reporter_stub

            assert len(self.stubs) == len(
                self.reporter_stubs), (self.stubs.keys(),
                                       self.reporter_stubs.keys())
Code example #22
def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_webui=False)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest(),
                                          timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor(object):
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "redis_latency" in views
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "ray" in process
                    or "travis" in process)
        break
Code example #23
def _kill_raylet(self, ip, port, graceful=False):
    raylet_address = f"{ip}:{port}"
    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
    stub.ShutdownRaylet(
        node_manager_pb2.ShutdownRayletRequest(graceful=graceful))
Code example #24
def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_dashboard=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])

    channel = init_grpc_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [
        worker for worker in reply.core_workers_stats
        if worker.worker_type == common_pb2.DRIVER
    ]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.worker.show_in_dashboard("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.worker.show_in_dashboard("test")
            return os.getpid()

    # Test show_in_dashboard for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert stats.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    # Test show_in_dashboard for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for stats in reply.core_workers_stats:
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    if _WIN32:
        timeout_seconds = 40
    else:
        timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.core_workers_stats) < num_cpus + 2:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.core_workers_stats) == num_cpus + 2
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.core_workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "mini" in process
                    or "conda" in process or "travis" in process
                    or "runner" in process or "pytest" in process
                    or "ray" in process), process
        break
Code example #25
def test_worker_stats(shutdown_only):
    addresses = ray.init(num_cpus=1, include_webui=True)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    raylet["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.GetNodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor:
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display[""] == '{"message": "test", "dtype": "text"}':
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display[""] == ""  # Empty proto
    assert target_worker_present

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "ray" in process
                    or "travis" in process)
        break

    # Test kill_actor.
    def actor_killed(PID):
        """Check For the existence of a unix pid."""
        try:
            os.kill(PID, 0)
        except OSError:
            return True
        else:
            return False

    assert (wait_until_server_available(addresses["webui_url"]) is True)

    webui_url = addresses["webui_url"]
    webui_url = webui_url.replace("localhost", "http://127.0.0.1")
    for worker in reply.workers_stats:
        if worker.is_driver:
            continue
        requests.get(webui_url + "/api/kill_actor",
                     params={
                         "actor_id":
                         ray.utils.binary_to_hex(
                             worker.core_worker_stats.actor_id),
                         "ip_address":
                         worker.core_worker_stats.ip_address,
                         "port":
                         worker.core_worker_stats.port
                     })
    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException("Timed out while killing actors")
        if all(
                actor_killed(worker.pid) for worker in reply.workers_stats
                if not worker.is_driver):
            break