Example 1
def test_cached_object(ray_start_cluster):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1,
                                    resources={"node1": 1},
                                    object_store_memory=10**8)
    cluster.add_node(num_cpus=1,
                     resources={"node2": 1},
                     object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(num_cpus=1,
                     resources={"node1": 1},
                     object_store_memory=10**8)
    wait_for_condition(lambda: not all(node["Alive"] for node in ray.nodes()),
                       timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
Example 2
def test_scheduling_class_depth(ray_start_regular):

    node_info = ray.nodes()[0]
    metrics_export_port = node_info["MetricsExportPort"]
    addr = node_info["NodeManagerAddress"]
    prom_addr = f"{addr}:{metrics_export_port}"

    @ray.remote(num_cpus=1000)
    def infeasible():
        pass

    @ray.remote(num_cpus=0)
    def start_infeasible(n):
        if n == 1:
            ray.get(infeasible.remote())
        ray.get(start_infeasible.remote(n - 1))

    start_infeasible.remote(1)
    infeasible.remote()

    # We expect the 2 calls to `infeasible` to be separate scheduling classes
    # because one has depth=1, and the other has depth=2.

    metric_name = "ray_internal_num_infeasible_scheduling_classes"

    def make_condition(n):
        def condition():
            _, metric_names, metric_samples = fetch_prometheus([prom_addr])
            if metric_name in metric_names:
                for sample in metric_samples:
                    if sample.name == metric_name and sample.value == n:
                        return True
            return False

        return condition

    wait_for_condition(make_condition(2))
    start_infeasible.remote(2)
    wait_for_condition(make_condition(3))
    start_infeasible.remote(4)
    wait_for_condition(make_condition(4))
Example 3
    def init(self):
        if ray.is_initialized():
            self._manage_ray = False
        if self._manage_ray:
            ray.init()
        # Compute available nodes, based on CPU resource.
        if settings.head_ip is None:
            # TODO (hme): Have this be a class argument vs. using what's set in settings directly.
            logging.getLogger(__name__).info(
                "Using driver node ip as head node.")
            head_ip = get_private_ip()
        else:
            head_ip = settings.head_ip
        total_cpus = 0
        nodes = ray.nodes()
        for node in nodes:
            node_ip = self._node_ip(node)
            if head_ip == node_ip:
                logging.getLogger(__name__).info("head node %s", node_ip)
                self._head_node = node
            elif self._has_cpu_resources(node):
                logging.getLogger(__name__).info("worker node %s", node_ip)
                total_cpus += node["Resources"]["CPU"]
                self._worker_nodes.append(node)
                self._available_nodes.append(node)
        if self._head_node is None:
            if self._use_head:
                logging.getLogger(__name__).warning(
                    "Failed to determine which node is the head."
                    " The head node will not be used even though"
                    " nums.core.settings.use_head = True.")
        elif self._use_head and self._has_cpu_resources(self._head_node):
            total_cpus += self._head_node["Resources"]["CPU"]
            self._available_nodes.append(self._head_node)
        logging.getLogger(__name__).info("total cpus %s", total_cpus)

        if self._num_nodes is None:
            self._num_nodes = len(self._available_nodes)
        assert self._num_nodes <= len(self._available_nodes)

        self.init_devices()
Example 4
        async def run(self):
            self.is_running = True
            while self.is_running:
                node_to_kill_ip = None
                node_to_kill_port = None
                while node_to_kill_port is None and self.is_running:
                    nodes = ray.nodes()
                    alive_nodes = self._get_alive_nodes(nodes)
                    for node in nodes:
                        node_id = node["NodeID"]
                        # make sure at least 1 worker node is alive.
                        if (node["Alive"] and node_id != self.head_node_id
                                and node_id not in self.killed_nodes
                                and alive_nodes > 2):
                            node_to_kill_ip = node["NodeManagerAddress"]
                            node_to_kill_port = node["NodeManagerPort"]
                            break
                    # Give the cluster some time to start.
                    await asyncio.sleep(0.1)

                if not self.is_running:
                    break

                sleep_interval = np.random.rand() * self.node_kill_interval_s
                time.sleep(sleep_interval)

                if node_to_kill_port is not None:
                    try:
                        self._kill_raylet(
                            node_to_kill_ip, node_to_kill_port, graceful=False)
                    except Exception:
                        pass
                    logging.info(
                        f"Killed node {node_id} at address: "
                        f"{node_to_kill_ip}, port: {node_to_kill_port}")
                    self.killed_nodes.add(node_id)
                if len(self.killed_nodes) >= self.max_nodes_to_kill:
                    break
                await asyncio.sleep(self.node_kill_interval_s - sleep_interval)

            self.done.set_result(True)
Example 5
def get_existing_files_old(flame_fitting_dir):
    @ray.remote
    def check_remote_dirs():
        return [
            int(file_.parent.name)
            for file_ in flame_fitting_dir.glob("*/flame_params.npy")
        ]

    checks = []
    for x in ray.nodes():
        if not x.get("Alive"):
            continue
        for key, item in x["Resources"].items():
            if key.startswith("node:"):
                checks.append(check_remote_dirs.options(resources={key: item}).remote())

    data = set()
    for x in ray.get(checks):
        data |= set(str(y).zfill(5) for y in x)

    return data
Example 6
def get_checkpoint_from_remote_node(
        checkpoint_path: str,
        node_ip: str,
        timeout: float = 300.0) -> Optional[Checkpoint]:
    if not any(node["NodeManagerAddress"] == node_ip for node in ray.nodes()):
        logger.warning(
            f"Could not fetch checkpoint with path {checkpoint_path} from "
            f"node with IP {node_ip} because the node is not available "
            f"anymore.")
        return None
    fut = _serialize_checkpoint.options(
        num_cpus=0, resources={f"node:{node_ip}": 0.01}
    ).remote(checkpoint_path)
    try:
        checkpoint_data = ray.get(fut, timeout=timeout)
    except Exception as e:
        logger.warning(
            f"Could not fetch checkpoint with path {checkpoint_path} from "
            f"node with IP {node_ip} because serialization failed: {e}")
        return None
    return Checkpoint.from_bytes(checkpoint_data)
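A pattern that recurs in several of these snippets (see also Examples 5 and 8) is pinning a task to a particular machine by requesting a small fraction of that node's automatic node:<ip> custom resource. Below is a minimal standalone sketch of the same idea, assuming a running cluster; the helper name probe_node is made up for illustration.

import ray

ray.init(address="auto")

@ray.remote(num_cpus=0)
def probe_node():
    # Reports which node the task actually landed on.
    return ray.util.get_node_ip_address()

for node in ray.nodes():
    if not node["Alive"]:
        continue
    ip = node["NodeManagerAddress"]
    # Requesting a tiny amount of the node's own "node:<ip>" resource forces
    # placement on that node, as get_checkpoint_from_remote_node does above.
    print(ray.get(probe_node.options(resources={f"node:{ip}": 0.01}).remote()))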
Example 7
def main():
    ray.init()

    head_node_ip = ray.util.get_node_ip_address()

    assert len([n for n in ray.nodes() if n["Alive"]]) == 1, \
        "Too many nodes available at start of script"

    node_counter = NodeCountCallback()

    tune.run(
        train,
        num_samples=3,
        config={"head_node_ip": head_node_ip},
        callbacks=[node_counter],
        resources_per_trial={"cpu": 4},
    )

    node_counts = Counter(node_counter.node_counts)
    assert node_counts[3] > 0, "Cluster never scaled to 3 nodes"
    assert node_counter.node_counts[
        -1] == 1, "Cluster didn't scale down to 1 node."
Example 8
def cleanup_remote_node_experiment_dir(experiment_name: str):
    experiment_dir = os.path.join(os.path.expanduser("~/ray_results"), experiment_name)

    @ray.remote
    def _remove_on_remove_node(path: str):
        return shutil.rmtree(path, ignore_errors=True)

    futures = []
    for node in ray.nodes():
        if not node["Alive"]:
            continue

        hostname = node["NodeManagerHostname"]
        ip = node["NodeManagerAddress"]

        if hostname == platform.node():
            # Skip on driver
            continue

        rfn = _remove_on_remove_node.options(resources={f"node:{ip}": 0.01})
        futures.append(rfn.remote(experiment_dir))
    ray.get(futures)
Example 9
    def _update_nodes(self):
        with self.nodes_lock:
            self.nodes = ray.nodes()
            node_ids = [node["NodeID"] for node in self.nodes]

            # First remove node connections of disconnected nodes.
            # Iterate over a copy since stubs are popped inside the loop.
            for node_id in list(self.stubs.keys()):
                if node_id not in node_ids:
                    stub = self.stubs.pop(node_id)
                    stub.close()
                    reporter_stub = self.reporter_stubs.pop(node_id)
                    reporter_stub.close()

            # Now add node connections of new nodes.
            for node in self.nodes:
                node_id = node["NodeID"]
                if node_id not in self.stubs:
                    node_ip = node["NodeManagerAddress"]
                    channel = grpc.insecure_channel("{}:{}".format(
                        node_ip, node["NodeManagerPort"]))
                    stub = node_manager_pb2_grpc.NodeManagerServiceStub(
                        channel)
                    self.stubs[node_id] = stub
                    # Block wait until the reporter for the node starts.
                    while True:
                        reporter_port = self.redis_client.get(
                            "REPORTER_PORT:{}".format(node_ip))
                        if reporter_port:
                            break
                    reporter_channel = grpc.insecure_channel("{}:{}".format(
                        node_ip, int(reporter_port)))
                    reporter_stub = reporter_pb2_grpc.ReporterServiceStub(
                        reporter_channel)
                    self.reporter_stubs[node_id] = reporter_stub

            assert len(self.stubs) == len(
                self.reporter_stubs), (self.stubs.keys(),
                                       self.reporter_stubs.keys())
Example 10
def ray_start_chaos_cluster(request):
    """Returns the cluster and chaos thread.
    """
    os.environ["RAY_num_heartbeats_timeout"] = "5"
    os.environ["RAY_raylet_heartbeat_period_milliseconds"] = "100"
    param = getattr(request, "param", {})
    kill_interval = param.get("kill_interval", 2)
    # Config of workers that are re-started.
    head_resources = param["head_resources"]
    worker_node_types = param["worker_node_types"]

    cluster = AutoscalingCluster(head_resources, worker_node_types)
    cluster.start()
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1
    node_killer = get_and_run_node_killer(kill_interval)
    yield node_killer
    assert ray.get(node_killer.get_total_killed_nodes.remote()) > 0
    ray.shutdown()
    cluster.shutdown()
    del os.environ["RAY_num_heartbeats_timeout"]
    del os.environ["RAY_raylet_heartbeat_period_milliseconds"]
Example 11
def test_warning_for_dead_node(ray_start_cluster_2_nodes, error_pubsub):
    cluster = ray_start_cluster_2_nodes
    cluster.wait_for_nodes()
    p = error_pubsub

    node_ids = {item["NodeID"] for item in ray.nodes()}

    # Try to make sure that the monitor has received at least one heartbeat
    # from the node.
    time.sleep(0.5)

    # Kill both raylets.
    cluster.list_all_nodes()[1].kill_raylet()
    cluster.list_all_nodes()[0].kill_raylet()

    # Check that we get warning messages for both raylets.
    errors = get_error_message(p, 2, ray_constants.REMOVED_NODE_ERROR, 40)

    # Extract the client IDs from the error messages. This will need to be
    # changed if the error message changes.
    warning_node_ids = {error.error_message.split(" ")[5] for error in errors}

    assert node_ids == warning_node_ids
Example 12
def test_profiling(shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=2)
    class Actor:
        def getpid(self):
            return os.getpid()

    c = Actor.remote()
    actor_pid = ray.get(c.getpid.remote())

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    start_time = time.time()
    launch_profiling = None
    while True:
        # Sometimes some startup time is required
        if time.time() - start_time > 15:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats, "
                f"launch_profiling: {launch_profiling}"
            )
        launch_profiling = requests.get(
            webui_url + "/api/launch_profiling",
            params={
                "ip": ray.nodes()[0]["NodeManagerAddress"],
                "pid": actor_pid,
                "duration": 5,
            },
        ).json()
        if launch_profiling["result"]:
            profiling_info = launch_profiling["data"]["profilingInfo"]
            break
        time.sleep(1)
    logger.info(profiling_info)
Example 13
def test_dynamic_res_creation_clientid_multiple(ray_start_cluster):
    # This test creates resources on multiple clients using the clientid
    # specifier
    cluster = ray_start_cluster

    res_name = "test_res"
    res_capacity = 1.0
    num_nodes = 3
    for i in range(num_nodes):
        cluster.add_node()

    ray.init(address=cluster.address)

    target_node_ids = [node["NodeID"] for node in ray.nodes()]

    @ray.remote
    def set_res(resource_name, resource_capacity, res_client_id):
        ray.experimental.set_resource(resource_name,
                                      resource_capacity,
                                      client_id=res_client_id)

    results = []
    for nid in target_node_ids:
        results.append(set_res.remote(res_name, res_capacity, nid))
    ray.get(results)

    def check_resources():
        resources_created = []
        for nid in target_node_ids:
            target_node = next(node for node in ray.nodes()
                               if node["NodeID"] == nid)
            resources = target_node["Resources"]
            resources_created.append(
                resources.get(res_name, None) == res_capacity)
        return all(resources_created)

    wait_for_condition(check_resources)
Example 14
def test_cluster_handle_affinity():
    cluster = Cluster()
    # HACK: use two different IP addresses so the placement constraint for
    # the resource check later will work.
    head_node = cluster.add_node(node_ip_address="127.0.0.1", num_cpus=4)
    cluster.add_node(node_ip_address="0.0.0.0", num_cpus=4)

    ray.init(head_node.address)

    # Make sure we have two nodes.
    node_ids = [n["NodeID"] for n in ray.nodes()]
    assert len(node_ids) == 2

    # Start the backend.
    client = serve.start(http_port=randint(10000, 30000), detached=True)
    client.create_backend("hi:v0", lambda _: "hi")
    client.create_endpoint("hi", backend="hi:v0")

    # Try to retrieve the handle from both head and worker node, check the
    # router's node id.
    @ray.remote
    def check_handle_router_id():
        client = serve.connect()
        handle = client.get_handle("hi")
        return get_node_id_for_actor(handle.router_handle)

    router_node_ids = ray.get([
        check_handle_router_id.options(resources={
            node_id: 0.01
        }).remote() for node_id in ray.state.node_ids()
    ])

    assert set(router_node_ids) == set(node_ids)

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
Example 15
def ray_wait_for_workers(min_workers: int = 1) -> None:
    """
    We don't want to dispatch any work until we have worker nodes ready to go.
    This function is a no-op when running on localhost.
    """
    global _ray_is_local
    carriage_return_needed = False

    if _ray_is_local:
        return

    while True:
        nodes = ray.nodes()
        if len(nodes) >= min_workers:
            if carriage_return_needed:
                print(".", flush=True)
            return
        else:
            if not carriage_return_needed:
                print("Waiting for Ray worker nodes.", end="", flush=True)
                carriage_return_needed = True
            else:
                print(".", end="", flush=True)
        sleep(1)
Example 16
 def get_file_discovery_content(self):
     """Return the content for Prometheus service discovery."""
     nodes = ray.nodes()
     metrics_export_addresses = [
         "{}:{}".format(node["NodeManagerAddress"],
                        node["MetricsExportPort"]) for node in nodes
         if node["alive"] is True
     ]
     if not use_gcs_for_bootstrap():
         redis_client = services.create_redis_client(
             self.redis_address, self.redis_password)
         autoscaler_addr = redis_client.get("AutoscalerMetricsAddress")
     else:
         gcs_client = GcsClient(address=self.gcs_address)
         autoscaler_addr = gcs_client.internal_kv_get(
             b"AutoscalerMetricsAddress", None)
     if autoscaler_addr:
         metrics_export_addresses.append(autoscaler_addr.decode("utf-8"))
     return json.dumps([{
         "labels": {
             "job": "ray"
         },
         "targets": metrics_export_addresses
     }])
Example 17
 def init(self):
     # Compute available nodes, based on CPU resource.
     local_ip = get_private_ip()
     total_cpus = 0
     for node in ray.nodes():
         node_key = list(
             filter(lambda key: "node" in key, node["Resources"].keys()))
         assert len(node_key) == 1
         node_ip = node_key[0].split(":")[1]
         has_cpu_resources = "CPU" in node[
             "Resources"] and node["Resources"]["CPU"] >= 1.0
         if local_ip == node_ip:
             logging.getLogger().info("head node %s", node_ip)
             self.head_node = node
             if self.use_head and has_cpu_resources:
                 total_cpus += node["Resources"]["CPU"]
                 self.available_nodes.append(node)
         elif has_cpu_resources:
             logging.getLogger().info("worker node %s", node_ip)
             total_cpus += node["Resources"]["CPU"]
             self.available_nodes.append(node)
     logging.getLogger().info("total cpus %s", total_cpus)
     # Collect compute functions.
     module_functions = extract_functions(self.compute_imp)
     function_signatures: dict = {}
     required_methods = inspect.getmembers(ComputeInterface(),
                                           predicate=inspect.ismethod)
     for name, func in required_methods:
         function_signatures[name] = func
     for name, func in module_functions.items():
         func_sig = function_signatures[name]
         try:
             remote_params = func_sig.remote_params
         except Exception as _:
             remote_params = {}
         self.remote_functions[name] = self.remote(func, remote_params)
Example 18
def wait_for_and_check_cluster_configuration(num_nodes):
    """Check that the cluster's custom resources are properly configured.

    The ith node should have a resource labeled 'i' with quantity 500.

    Args:
        num_nodes: The number of nodes that we expect to be in the cluster.

    Raises:
        RuntimeError: This exception is raised if the cluster is not configured
            properly for this test.
    """
    logger.warning("Waiting for cluster to have %s nodes.", num_nodes)
    while True:
        nodes = ray.nodes()
        if len(nodes) == num_nodes:
            break
        if len(nodes) > num_nodes:
            raise RuntimeError(
                "The cluster has {} nodes, but it should "
                "only have {}.".format(len(nodes), num_nodes))
    if not ([set(node["Resources"].keys())
             for node in ray.nodes()] == [{str(i), "CPU"}
                                          for i in range(num_nodes)]):
        raise RuntimeError(
            "The ith node in the cluster should have a "
            "custom resource called 'i' with quantity "
            "500. The nodes are\n{}".format(ray.nodes()))
    if not ([[
            resource_quantity
            for resource_name, resource_quantity in node["Resources"].items()
            if resource_name != "CPU"
    ] for node in ray.nodes()] == num_nodes * [[500.0]]):
        raise RuntimeError(
            "The ith node in the cluster should have a "
            "custom resource called 'i' with quantity "
            "500. The nodes are\n{}".format(ray.nodes()))
    for node in ray.nodes():
        if ("0" in node["Resources"] and node["ObjectStoreSocketName"] !=
                ray.worker.global_worker.plasma_client.store_socket_name):
            raise RuntimeError("The node that this driver is connected to "
                               "must have a custom resource labeled '0'.")
Example 19
def test_placement_group_reschedule_when_node_dead(ray_start_cluster,
                                                   connect_to_client):
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, namespace="default_test_namespace")

    # Make sure the head node and both worker nodes are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(name="name",
                                                   strategy="SPREAD",
                                                   bundles=[{
                                                       "CPU": 2
                                                   }, {
                                                       "CPU": 2
                                                   }, {
                                                       "CPU": 2
                                                   }])
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
        cluster.wait_for_nodes()

        actor_4 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_5 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_6 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_4.value.remote())
        ray.get(actor_5.value.remote())
        ray.get(actor_6.value.remote())
        placement_group_assert_no_leak([placement_group])
        ray.shutdown()
Example 20
def test_metrics_export_end_to_end(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
    cluster.add_node(
        _internal_config=json.dumps({"metrics_report_interval_ms": 1000}))
    # Add worker nodes.
    [cluster.add_node() for _ in range(NUM_NODES - 1)]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    signal = SignalActor.remote()

    # Generate some metrics from the actor & tasks.
    @ray.remote
    def f():
        counter = Count("test_counter", "desc", "unit", [])
        ray.get(signal.send.remote())
        while True:
            counter.record(1, {})
            time.sleep(0.1)

    @ray.remote
    class A:
        async def ready(self):
            pass

        async def ping(self):
            histogram = Histogram("test_histogram", "desc", "unit", [0, 1, 2],
                                  [])
            while True:
                histogram.record(1, {})
                await asyncio.sleep(0.1)

    obj_refs = [f.remote() for _ in range(30)]
    a = A.remote()
    obj_refs.append(a.ping.remote())

    # Make sure both histogram and counter are created
    ray.get(a.ready.remote())
    ray.get(signal.wait.remote())

    node_info_list = ray.nodes()
    prom_addresses = []
    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        addr = node_info["NodeManagerAddress"]
        prom_addresses.append(f"{addr}:{metrics_export_port}")

    # Make sure we can ping Prometheus endpoints.
    def fetch_prometheus(prom_addresses):
        components_dict = {}
        metric_names = set()
        for address in prom_addresses:
            if address not in components_dict:
                components_dict[address] = set()
            try:
                response = requests.get("http://{}".format(address))
            except requests.exceptions.ConnectionError:
                return components_dict, metric_names

            for line in response.text.split("\n"):
                for family in text_string_to_metric_families(line):
                    for sample in family.samples:
                        # print(sample)
                        metric_names.add(sample.name)
                        if "Component" in sample.labels:
                            components_dict[address].add(
                                sample.labels["Component"])
        return components_dict, metric_names

    def test_prometheus_endpoint():
        # TODO(Simon): Add a gcs_server after fixing metrics.
        components_dict, metric_names = fetch_prometheus(prom_addresses)

        # Raylet should be on every node
        expected_components = {"raylet"}
        components_found = all(
            expected_components.issubset(components)
            for components in components_dict.values())

        # Core worker should be on at least one node
        components_found = components_found and any(
            "core_worker" in components
            for components in components_dict.values())

        expected_metric_names = {"ray_test_counter", "ray_test_histogram_max"}
        metric_names_found = expected_metric_names.issubset(metric_names)

        return components_found and metric_names_found

    try:
        wait_for_condition(
            test_prometheus_endpoint,
            timeout=20,
            retry_interval_ms=1000,  # Yield resource for other processes
        )
    except RuntimeError:
        # This is for debugging when test failed.
        raise RuntimeError(
            "Not all components were visible to the "
            "Prometheus endpoints in time. "
            f"The components are {fetch_prometheus(prom_addresses)}")
    ray.shutdown()
Example 21
 def wait_until_node_dead(node):
     for n in ray.nodes():
         if (n["ObjectStoreSocketName"] == node.address_info[
                 "object_store_address"]):
             return not n["Alive"]
     return False
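This helper only makes sense when polled. Here is a short usage sketch that reuses the wait_for_condition utility and the remove_node call seen in Examples 1 and 25; the node variable stands in for whatever cluster node handle the test holds.

cluster.remove_node(node, allow_graceful=False)
wait_for_condition(lambda: wait_until_node_dead(node), timeout=10)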
Example 22
default_accuracy = sum(
    np.array(default_pred) == np.array(y_test)) / len(default_pred)

parameter_grid = {
    "n_estimators": [10, 50],
    "max_depth": [5, 50, 100],
    "ccp_alpha": [0.001, 0.01]
}

tune_search = TuneGridSearchCV(RandomForestClassifier(),
                               param_grid=parameter_grid,
                               scoring="accuracy")

start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()

best_score = tune_search.best_score_
best_params = tune_search.best_params_

print('''This cluster consists of
    {} nodes in total
    {} CPU resources in total
'''.format(len(ray.nodes()),
           ray.cluster_resources()['CPU']))

print(f"Default parameters: {default_params}")
print(f"Default accuracy: {default_accuracy}")

print("Tune GridSearch Fit Time:", end - start)
print(f"GridSearch parameters: {best_params}")
print(f"GridSearch score: {best_score}")
Example 23
def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_webui=False)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    ray.nodes()[0]["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(node_manager_pb2.NodeStatsRequest(),
                                          timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor(object):
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "redis_latency" in views
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "ray" in process
                    or "travis" in process)
        break
Example 24
def test_global_state_api(shutdown_only):

    error_message = ("The ray global state API cannot be used "
                     "before ray.init has been called.")

    with pytest.raises(Exception, match=error_message):
        ray.objects()

    with pytest.raises(Exception, match=error_message):
        ray.actors()

    with pytest.raises(Exception, match=error_message):
        ray.nodes()

    with pytest.raises(Exception, match=error_message):
        ray.jobs()

    ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1})

    assert ray.cluster_resources()["CPU"] == 5
    assert ray.cluster_resources()["GPU"] == 3
    assert ray.cluster_resources()["CustomResource"] == 1

    # A driver/worker creates a temporary object during startup. Although the
    # temporary object is freed immediately, in a rare case, we can still find
    # the object ref in GCS because Raylet removes the object ref from GCS
    # asynchronously.
    # Because we can't control when workers create the temporary objects,
    # we can't assert that `ray.objects()` returns an empty dict. Here we
    # just make sure `ray.objects()` succeeds.
    assert len(ray.objects()) >= 0

    job_id = ray.utils.compute_job_id_from_driver(
        ray.WorkerID(ray.worker.global_worker.worker_id))

    client_table = ray.nodes()
    node_ip_address = ray.worker.global_worker.node_ip_address

    assert len(client_table) == 1
    assert client_table[0]["NodeManagerAddress"] == node_ip_address

    @ray.remote
    class Actor:
        def __init__(self):
            pass

    _ = Actor.remote()  # noqa: F841
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    actor_info, = actor_table.values()
    assert actor_info["JobID"] == job_id.hex()
    assert "IPAddress" in actor_info["Address"]
    assert "IPAddress" in actor_info["OwnerAddress"]
    assert actor_info["Address"]["Port"] != actor_info["OwnerAddress"]["Port"]

    job_table = ray.jobs()

    assert len(job_table) == 1
    assert job_table[0]["JobID"] == job_id.hex()
    assert job_table[0]["DriverIPAddress"] == node_ip_address
Example 25
def test_nested(ray_start_cluster, reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
        "fetch_fail_timeout_milliseconds": 10_000,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = False

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)
    done_signal = SignalActor.remote()
    exit_signal = SignalActor.remote()
    ray.get(done_signal.wait.remote(should_wait=False))
    ray.get(exit_signal.wait.remote(should_wait=False))

    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote
    def dependent_task(x):
        return

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def nested(done_signal, exit_signal):
        ref = ray.put(np.zeros(10**7, dtype=np.uint8))
        # Flush object store.
        for _ in range(20):
            ray.put(np.zeros(10**7, dtype=np.uint8))
        dep = dependent_task.options(resources={"node": 1}).remote(ref)
        ray.get(done_signal.send.remote(clear=True))
        ray.get(dep)
        return ray.get(ref)

    ref = nested.remote(done_signal, exit_signal)
    # Wait for task to get scheduled on the node to kill.
    ray.get(done_signal.wait.remote())
    # Wait for ray.put object to get transferred to the other node.
    cluster.add_node(
        num_cpus=2, resources={"node": 10}, object_store_memory=10**8)
    ray.get(dependent_task.remote(ref))

    # Destroy the task's output.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    if reconstruction_enabled:
        # NOTE(swang): This is supposed to work because nested doesn't actually
        # return any ObjectRefs. However, currently the ray.put in `nested`
        # fails because the object already exists with a different owner.
        # See https://github.com/ray-project/ray/issues/20713.
        try:
            ray.get(ref, timeout=60)
        except ray.exceptions.RayTaskError as e:
            assert isinstance(e.cause, ray.exceptions.ObjectFetchTimedOutError)
    else:
        with pytest.raises(ray.exceptions.ObjectLostError):
            ray.get(ref, timeout=60)
Example 26
args = parser.parse_args()

tf = try_import_tf()
'''
Custom environment
'''

register_env("myenv", lambda config: shape_optimization(config))

if __name__ == "__main__":
    zero_time = time()
    #    print(args.ray_address)
    ray.init(redis_address=args.ray_address)

    with open('Resources.txt', 'w') as f:
        f.write('Nodes used: ' + str(len(ray.nodes())) + '\n')
        f.write('Available resources:' + '\n')
        f.write(str(ray.available_resources()) + '\n')
        f.flush()
        os.fsync(f.fileno())  # os.fsync expects a file descriptor.

    connect_time = time()
    register_time = time()

    #    resources = ray.get_resource_ids()
    #    cpus = [v[0] for v in resources['CPU']]

    #    config = appo.DEFAULT_CONFIG.copy()
    config = ppo.DEFAULT_CONFIG.copy()
    #    config = a3c.DEFAULT_CONFIG.copy()
Example 27
def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n
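A counter like this is typically combined with wait_for_condition (as in Examples 1 and 13) to block until the cluster reaches a given size. A small sketch, with expected_nodes standing in for whatever count the caller expects:

expected_nodes = 3  # hypothetical target cluster size
wait_for_condition(lambda: num_alive_nodes() == expected_nodes, timeout=30)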
Example 28
def test_logs_stream_and_tail(ray_start_with_dashboard):
    assert wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = list_nodes()[0]["node_id"]

    def verify_basic():
        stream_response = requests.get(
            webui_url
            + f"/api/v0/logs/file?node_id={node_id}&filename=gcs_server.out&lines=5",
            stream=True,
        )
        if stream_response.status_code != 200:
            raise ValueError(stream_response.content.decode("utf-8"))
        lines = []
        for line in stream_response.iter_lines():
            lines.append(line.decode("utf-8"))
        return len(lines) == 5 or len(lines) == 6

    wait_for_condition(verify_basic)

    @ray.remote
    class Actor:
        def write_log(self, strings):
            for s in strings:
                print(s)

        def getpid(self):
            return os.getpid()

    test_log_text = "test_log_text_日志_{}"
    actor = Actor.remote()
    ray.get(actor.write_log.remote([test_log_text.format("XXXXXX")]))

    # Test stream and fetching by actor id
    stream_response = requests.get(
        webui_url
        + "/api/v0/logs/stream?&lines=2"
        + f"&actor_id={actor._ray_actor_id.hex()}",
        stream=True,
    )
    if stream_response.status_code != 200:
        raise ValueError(stream_response.content.decode("utf-8"))
    stream_iterator = stream_response.iter_content(chunk_size=None)
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert (
        next(stream_iterator).decode("utf-8")
        == "1:actor_name:Actor\n" + test_log_text.format("XXXXXX") + "\n"
    )

    streamed_string = ""
    for i in range(5):
        strings = []
        for j in range(100):
            strings.append(test_log_text.format(f"{100*i + j:06d}"))

        ray.get(actor.write_log.remote(strings))

        string = ""
        for s in strings:
            string += s + "\n"
        streamed_string += string
        # NOTE: Prefix 1 indicates the stream has succeeded.
        assert next(stream_iterator).decode("utf-8") == "1" + string
    del stream_response

    # Test tailing log by actor id
    LINES = 150
    file_response = requests.get(
        webui_url
        + f"/api/v0/logs/file?&lines={LINES}"
        + "&actor_id="
        + actor._ray_actor_id.hex(),
    ).content.decode("utf-8")
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert file_response == "1" + "\n".join(streamed_string.split("\n")[-(LINES + 1) :])

    # Test query by pid & node_ip instead of actor id.
    node_ip = ray.nodes()[0]["NodeManagerAddress"]
    pid = ray.get(actor.getpid.remote())
    file_response = requests.get(
        webui_url
        + f"/api/v0/logs/file?node_ip={node_ip}&lines={LINES}"
        + f"&pid={pid}",
    ).content.decode("utf-8")
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert file_response == "1" + "\n".join(streamed_string.split("\n")[-(LINES + 1) :])
Example 29
def test_dynamic_res_concurrent_res_delete(ray_start_cluster):
    # This test makes sure the resource gets deleted correctly when a task
    # has already acquired it.

    cluster = ray_start_cluster

    res_name = "test_res"
    res_capacity = 5
    num_nodes = 5
    TIMEOUT_DURATION = 1

    # Create an object ID to have the task wait on
    WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")

    # Create an object ID to signal that the task is running
    TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")

    for i in range(num_nodes):
        cluster.add_node()

    ray.init(redis_address=cluster.redis_address)

    clientids = [client["ClientID"] for client in ray.nodes()]
    target_clientid = clientids[1]

    @ray.remote
    def set_res(resource_name, resource_capacity, res_client_id):
        ray.experimental.set_resource(resource_name,
                                      resource_capacity,
                                      client_id=res_client_id)

    @ray.remote
    def delete_res(resource_name, res_client_id):
        ray.experimental.set_resource(resource_name,
                                      0,
                                      client_id=res_client_id)

    # Create the resource on node 1
    ray.get(set_res.remote(res_name, res_capacity, target_clientid))
    assert ray.cluster_resources()[res_name] == res_capacity

    # Task to hold the resource till the driver signals to finish
    @ray.remote
    def wait_func(running_oid, wait_oid):
        # Signal that the task is running
        ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
        # Make the task wait till signalled by driver
        ray.get(ray.ObjectID(wait_oid))

    @ray.remote
    def test_func():
        return 1

    # Launch the task with resource requirement of 4, thus the new available
    # capacity becomes 1
    task = wait_func._remote(
        args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
        resources={res_name: 4})
    # Wait till wait_func is launched before updating resource
    ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))

    # Delete the resource
    ray.get(delete_res.remote(res_name, target_clientid))

    # Signal task to complete
    ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
    ray.get(task)

    # Check that the scheduler state is consistent by launching a task that
    # requires the deleted resource. This task should not execute.
    task_2 = test_func._remote(
        args=[], resources={res_name: 1})  # This should be infeasible
    successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
    assert unsuccessful  # The task did not complete because it's infeasible
    assert res_name not in ray.available_resources()
Example 30
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

ray.init(address="localhost:6379")

# These numbers need to correspond with the autoscaler config file.
# The number of remote nodes in the autoscaler should upper bound
# these because sometimes nodes fail to update.
num_remote_nodes = 100
head_node_cpus = 2
num_remote_cpus = num_remote_nodes * head_node_cpus

# Wait until the expected number of nodes have joined the cluster.
while True:
    num_nodes = len(ray.nodes())
    logger.info("Waiting for nodes {}/{}".format(num_nodes,
                                                 num_remote_nodes + 1))
    if num_nodes >= num_remote_nodes + 1:
        break
    time.sleep(5)
logger.info("Nodes have all joined. There are %s resources.",
            ray.cluster_resources())


# Require 1 GPU to force the tasks to be on remote machines.
@ray.remote(num_gpus=1)
def f(size, *xs):
    return np.ones(size, dtype=np.uint8)
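Most of the snippets above read the same handful of keys from each ray.nodes() entry. As a closing reference, here is a minimal sketch that summarizes the alive nodes using only fields that appear in the examples (NodeID, Alive, NodeManagerAddress, MetricsExportPort, Resources); the exact set of keys can differ between Ray versions, so treat it as illustrative rather than exhaustive.

import ray

ray.init(address="auto")

for node in ray.nodes():
    if not node["Alive"]:
        continue
    prometheus_endpoint = "{}:{}".format(
        node["NodeManagerAddress"], node["MetricsExportPort"])
    print(
        node["NodeID"][:8],                # truncated node id
        node["NodeManagerAddress"],        # raylet IP address
        prometheus_endpoint,               # metrics scrape target
        node["Resources"].get("CPU", 0),   # advertised CPUs
    )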