def test_cached_object(ray_start_cluster):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
    }
    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(num_cpus=0, _system_config=config)
    ray.init(address=cluster.address)
    # Node to place the initial object.
    node_to_kill = cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    cluster.add_node(
        num_cpus=1, resources={"node2": 1}, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def dependent_task(x):
        return

    obj = large_object.options(resources={"node1": 1}).remote()
    ray.get(dependent_task.options(resources={"node2": 1}).remote(obj))

    cluster.remove_node(node_to_kill, allow_graceful=False)
    cluster.add_node(
        num_cpus=1, resources={"node1": 1}, object_store_memory=10**8)
    wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    for _ in range(20):
        large_object.options(resources={"node2": 1}).remote()

    ray.get(dependent_task.remote(obj))
def test_scheduling_class_depth(ray_start_regular):
    node_info = ray.nodes()[0]
    metrics_export_port = node_info["MetricsExportPort"]
    addr = node_info["NodeManagerAddress"]
    prom_addr = f"{addr}:{metrics_export_port}"

    @ray.remote(num_cpus=1000)
    def infeasible():
        pass

    @ray.remote(num_cpus=0)
    def start_infeasible(n):
        if n == 1:
            ray.get(infeasible.remote())
        ray.get(start_infeasible.remote(n - 1))

    start_infeasible.remote(1)
    infeasible.remote()

    # We expect the 2 calls to `infeasible` to be separate scheduling classes
    # because one has depth=1, and the other has depth=2.
    metric_name = "ray_internal_num_infeasible_scheduling_classes"

    def make_condition(n):
        def condition():
            _, metric_names, metric_samples = fetch_prometheus([prom_addr])
            if metric_name in metric_names:
                for sample in metric_samples:
                    if sample.name == metric_name and sample.value == n:
                        return True
            return False

        return condition

    wait_for_condition(make_condition(2))
    start_infeasible.remote(2)
    wait_for_condition(make_condition(3))
    start_infeasible.remote(4)
    wait_for_condition(make_condition(4))
def init(self):
    if ray.is_initialized():
        self._manage_ray = False
    if self._manage_ray:
        ray.init()
    # Compute available nodes, based on CPU resource.
    if settings.head_ip is None:
        # TODO (hme): Have this be a class argument vs. using what's set
        #  in settings directly.
        logging.getLogger(__name__).info("Using driver node ip as head node.")
        head_ip = get_private_ip()
    else:
        head_ip = settings.head_ip
    total_cpus = 0
    nodes = ray.nodes()
    for node in nodes:
        node_ip = self._node_ip(node)
        if head_ip == node_ip:
            logging.getLogger(__name__).info("head node %s", node_ip)
            self._head_node = node
        elif self._has_cpu_resources(node):
            logging.getLogger(__name__).info("worker node %s", node_ip)
            total_cpus += node["Resources"]["CPU"]
            self._worker_nodes.append(node)
            self._available_nodes.append(node)
    if self._head_node is None:
        if self._use_head:
            logging.getLogger(__name__).warning(
                "Failed to determine which node is the head."
                " The head node will be used even though"
                " nums.core.settings.use_head = False.")
    elif self._use_head and self._has_cpu_resources(self._head_node):
        total_cpus += self._head_node["Resources"]["CPU"]
        self._available_nodes.append(self._head_node)
    logging.getLogger(__name__).info("total cpus %s", total_cpus)
    if self._num_nodes is None:
        self._num_nodes = len(self._available_nodes)
    assert self._num_nodes <= len(self._available_nodes)
    self.init_devices()
async def run(self):
    self.is_running = True
    while self.is_running:
        node_to_kill_ip = None
        node_to_kill_port = None
        while node_to_kill_port is None and self.is_running:
            nodes = ray.nodes()
            alive_nodes = self._get_alive_nodes(nodes)
            for node in nodes:
                node_id = node["NodeID"]
                # make sure at least 1 worker node is alive.
                if (node["Alive"] and node_id != self.head_node_id
                        and node_id not in self.killed_nodes
                        and alive_nodes > 2):
                    node_to_kill_ip = node["NodeManagerAddress"]
                    node_to_kill_port = node["NodeManagerPort"]
                    break
            # Give the cluster some time to start.
            await asyncio.sleep(0.1)

        if not self.is_running:
            break

        sleep_interval = np.random.rand() * self.node_kill_interval_s
        time.sleep(sleep_interval)

        if node_to_kill_port is not None:
            try:
                self._kill_raylet(
                    node_to_kill_ip, node_to_kill_port, graceful=False)
            except Exception:
                pass
            logging.info(
                f"Killed node {node_id} at address: "
                f"{node_to_kill_ip}, port: {node_to_kill_port}")
            self.killed_nodes.add(node_id)
        if len(self.killed_nodes) >= self.max_nodes_to_kill:
            break
        await asyncio.sleep(self.node_kill_interval_s - sleep_interval)

    self.done.set_result(True)
def get_existing_files_old(flame_fitting_dir):
    @ray.remote
    def check_remote_dirs():
        return [
            int(file_.parent.name)
            for file_ in flame_fitting_dir.glob("*/flame_params.npy")
        ]

    checks = []
    for x in ray.nodes():
        if not x.get("Alive"):
            continue
        for key, item in x["Resources"].items():
            if key.startswith("node:"):
                checks.append(
                    check_remote_dirs.options(resources={key: item}).remote())

    data = set()
    for x in ray.get(checks):
        data |= set(str(y).zfill(5) for y in x)
    return data
def get_checkpoint_from_remote_node(
        checkpoint_path: str, node_ip: str,
        timeout: float = 300.0) -> Optional[Checkpoint]:
    if not any(node["NodeManagerAddress"] == node_ip for node in ray.nodes()):
        logger.warning(
            f"Could not fetch checkpoint with path {checkpoint_path} from "
            f"node with IP {node_ip} because the node is not available "
            f"anymore.")
        return None
    fut = _serialize_checkpoint.options(
        resources={
            f"node:{node_ip}": 0.01
        }, num_cpus=0).remote(checkpoint_path)
    try:
        checkpoint_data = ray.get(fut, timeout=timeout)
    except Exception as e:
        logger.warning(
            f"Could not fetch checkpoint with path {checkpoint_path} from "
            f"node with IP {node_ip} because serialization failed: {e}")
        return None
    return Checkpoint.from_bytes(checkpoint_data)
def main():
    ray.init()

    head_node_ip = ray.util.get_node_ip_address()
    assert (
        len([n for n in ray.nodes() if n["Alive"]]) == 1
    ), "Too many nodes available at start of script"

    node_counter = NodeCountCallback()

    tune.run(
        train,
        num_samples=3,
        config={"head_node_ip": head_node_ip},
        callbacks=[node_counter],
        resources_per_trial={"cpu": 4},
    )

    node_counts = Counter(node_counter.node_counts)
    assert node_counts[3] > 0, "Cluster never scaled to 3 nodes"
    assert node_counter.node_counts[-1] == 1, \
        "Cluster didn't scale down to 1 node."
def cleanup_remote_node_experiment_dir(experiment_name: str):
    experiment_dir = os.path.join(
        os.path.expanduser("~/ray_results"), experiment_name)

    @ray.remote
    def _remove_on_remove_node(path: str):
        return shutil.rmtree(path, ignore_errors=True)

    futures = []
    for node in ray.nodes():
        if not node["Alive"]:
            continue

        hostname = node["NodeManagerHostname"]
        ip = node["NodeManagerAddress"]

        if hostname == platform.node():
            # Skip on driver
            continue

        rfn = _remove_on_remove_node.options(resources={f"node:{ip}": 0.01})
        futures.append(rfn.remote(experiment_dir))

    ray.get(futures)
def _update_nodes(self):
    with self.nodes_lock:
        self.nodes = ray.nodes()
        node_ids = [node["NodeID"] for node in self.nodes]

        # First remove node connections of disconnected nodes.
        # Iterate over a copy of the keys so entries can be popped safely.
        for node_id in list(self.stubs.keys()):
            if node_id not in node_ids:
                stub = self.stubs.pop(node_id)
                stub.close()
                reporter_stub = self.reporter_stubs.pop(node_id)
                reporter_stub.close()

        # Now add node connections of new nodes.
        for node in self.nodes:
            node_id = node["NodeID"]
            if node_id not in self.stubs:
                node_ip = node["NodeManagerAddress"]
                channel = grpc.insecure_channel("{}:{}".format(
                    node_ip, node["NodeManagerPort"]))
                stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)
                self.stubs[node_id] = stub
                # Block wait until the reporter for the node starts.
                while True:
                    reporter_port = self.redis_client.get(
                        "REPORTER_PORT:{}".format(node_ip))
                    if reporter_port:
                        break
                reporter_channel = grpc.insecure_channel("{}:{}".format(
                    node_ip, int(reporter_port)))
                reporter_stub = reporter_pb2_grpc.ReporterServiceStub(
                    reporter_channel)
                self.reporter_stubs[node_id] = reporter_stub

        assert len(self.stubs) == len(self.reporter_stubs), (
            self.stubs.keys(), self.reporter_stubs.keys())
def ray_start_chaos_cluster(request):
    """Returns the cluster and chaos thread."""
    os.environ["RAY_num_heartbeats_timeout"] = "5"
    os.environ["RAY_raylet_heartbeat_period_milliseconds"] = "100"
    param = getattr(request, "param", {})
    kill_interval = param.get("kill_interval", 2)
    # Config of workers that are re-started.
    head_resources = param["head_resources"]
    worker_node_types = param["worker_node_types"]

    cluster = AutoscalingCluster(head_resources, worker_node_types)
    cluster.start()
    ray.init("auto")
    nodes = ray.nodes()
    assert len(nodes) == 1

    node_killer = get_and_run_node_killer(kill_interval)
    yield node_killer
    assert ray.get(node_killer.get_total_killed_nodes.remote()) > 0
    ray.shutdown()
    cluster.shutdown()
    del os.environ["RAY_num_heartbeats_timeout"]
    del os.environ["RAY_raylet_heartbeat_period_milliseconds"]
def test_warning_for_dead_node(ray_start_cluster_2_nodes, error_pubsub):
    cluster = ray_start_cluster_2_nodes
    cluster.wait_for_nodes()
    p = error_pubsub

    node_ids = {item["NodeID"] for item in ray.nodes()}

    # Try to make sure that the monitor has received at least one heartbeat
    # from the node.
    time.sleep(0.5)

    # Kill both raylets.
    cluster.list_all_nodes()[1].kill_raylet()
    cluster.list_all_nodes()[0].kill_raylet()

    # Check that we get warning messages for both raylets.
    errors = get_error_message(p, 2, ray_constants.REMOVED_NODE_ERROR, 40)

    # Extract the client IDs from the error messages. This will need to be
    # changed if the error message changes.
    warning_node_ids = {error.error_message.split(" ")[5] for error in errors}

    assert node_ids == warning_node_ids
def test_profiling(shutdown_only):
    addresses = ray.init(include_dashboard=True, num_cpus=6)

    @ray.remote(num_cpus=2)
    class Actor:
        def getpid(self):
            return os.getpid()

    c = Actor.remote()
    actor_pid = ray.get(c.getpid.remote())

    webui_url = addresses["webui_url"]
    assert wait_until_server_available(webui_url) is True
    webui_url = format_web_url(webui_url)

    start_time = time.time()
    launch_profiling = None
    while True:
        # Sometimes some startup time is required
        if time.time() - start_time > 15:
            raise RayTestTimeoutException(
                "Timed out while collecting profiling stats, "
                f"launch_profiling: {launch_profiling}")
        launch_profiling = requests.get(
            webui_url + "/api/launch_profiling",
            params={
                "ip": ray.nodes()[0]["NodeManagerAddress"],
                "pid": actor_pid,
                "duration": 5,
            },
        ).json()
        if launch_profiling["result"]:
            profiling_info = launch_profiling["data"]["profilingInfo"]
            break
        time.sleep(1)
    logger.info(profiling_info)
def test_dynamic_res_creation_clientid_multiple(ray_start_cluster):
    # This test creates resources on multiple clients using the clientid
    # specifier
    cluster = ray_start_cluster

    res_name = "test_res"
    res_capacity = 1.0
    num_nodes = 3
    for i in range(num_nodes):
        cluster.add_node()

    ray.init(address=cluster.address)

    target_node_ids = [node["NodeID"] for node in ray.nodes()]

    @ray.remote
    def set_res(resource_name, resource_capacity, res_client_id):
        ray.experimental.set_resource(
            resource_name, resource_capacity, client_id=res_client_id)

    results = []
    for nid in target_node_ids:
        results.append(set_res.remote(res_name, res_capacity, nid))
    ray.get(results)

    def check_resources():
        resources_created = []
        for nid in target_node_ids:
            target_node = next(
                node for node in ray.nodes() if node["NodeID"] == nid)
            resources = target_node["Resources"]
            resources_created.append(
                resources.get(res_name, None) == res_capacity)
        return all(resources_created)

    wait_for_condition(check_resources)
def test_cluster_handle_affinity():
    cluster = Cluster()
    # HACK: using two different ip address so the placement constraint for
    # resource check later will work.
    head_node = cluster.add_node(node_ip_address="127.0.0.1", num_cpus=4)
    cluster.add_node(node_ip_address="0.0.0.0", num_cpus=4)

    ray.init(head_node.address)

    # Make sure we have two nodes.
    node_ids = [n["NodeID"] for n in ray.nodes()]
    assert len(node_ids) == 2

    # Start the backend.
    client = serve.start(http_port=randint(10000, 30000), detached=True)
    client.create_backend("hi:v0", lambda _: "hi")
    client.create_endpoint("hi", backend="hi:v0")

    # Try to retrieve the handle from both head and worker node, check the
    # router's node id.
    @ray.remote
    def check_handle_router_id():
        client = serve.connect()
        handle = client.get_handle("hi")
        return get_node_id_for_actor(handle.router_handle)

    router_node_ids = ray.get([
        check_handle_router_id.options(resources={
            node_id: 0.01
        }).remote() for node_id in ray.state.node_ids()
    ])

    assert set(router_node_ids) == set(node_ids)

    # Clean up the nodes (otherwise Ray will segfault).
    ray.shutdown()
    cluster.shutdown()
def ray_wait_for_workers(min_workers: int = 1) -> None:
    """
    We don't want to dispatch any work until we have worker nodes ready to
    go. This function is a no-op when running on localhost.
    """
    global _ray_is_local
    carriage_return_needed = False

    if _ray_is_local:
        return

    while True:
        nodes = ray.nodes()
        if len(nodes) >= min_workers:
            if carriage_return_needed:
                print(".", flush=True)
            return
        else:
            if not carriage_return_needed:
                print("Waiting for Ray worker nodes.", end="", flush=True)
                carriage_return_needed = True
            else:
                print(".", end="", flush=True)
        sleep(1)
def get_file_discovery_content(self):
    """Return the content for Prometheus service discovery."""
    nodes = ray.nodes()
    metrics_export_addresses = [
        "{}:{}".format(node["NodeManagerAddress"], node["MetricsExportPort"])
        for node in nodes if node["alive"] is True
    ]
    if not use_gcs_for_bootstrap():
        redis_client = services.create_redis_client(
            self.redis_address, self.redis_password)
        autoscaler_addr = redis_client.get("AutoscalerMetricsAddress")
    else:
        gcs_client = GcsClient(address=self.gcs_address)
        autoscaler_addr = gcs_client.internal_kv_get(
            b"AutoscalerMetricsAddress", None)
    if autoscaler_addr:
        metrics_export_addresses.append(autoscaler_addr.decode("utf-8"))
    return json.dumps([{
        "labels": {
            "job": "ray"
        },
        "targets": metrics_export_addresses
    }])
def init(self):
    # Compute available nodes, based on CPU resource.
    local_ip = get_private_ip()
    total_cpus = 0
    for node in ray.nodes():
        node_key = list(
            filter(lambda key: "node" in key, node["Resources"].keys()))
        assert len(node_key) == 1
        node_ip = node_key[0].split(":")[1]
        has_cpu_resources = ("CPU" in node["Resources"]
                             and node["Resources"]["CPU"] >= 1.0)
        if local_ip == node_ip:
            logging.getLogger().info("head node %s", node_ip)
            self.head_node = node
            if self.use_head and has_cpu_resources:
                total_cpus += node["Resources"]["CPU"]
                self.available_nodes.append(node)
        elif has_cpu_resources:
            logging.getLogger().info("worker node %s", node_ip)
            total_cpus += node["Resources"]["CPU"]
            self.available_nodes.append(node)
    logging.getLogger().info("total cpus %s", total_cpus)

    # Collect compute functions.
    module_functions = extract_functions(self.compute_imp)
    function_signatures: dict = {}
    required_methods = inspect.getmembers(
        ComputeInterface(), predicate=inspect.ismethod)
    for name, func in required_methods:
        function_signatures[name] = func
    for name, func in module_functions.items():
        func_sig = function_signatures[name]
        try:
            remote_params = func_sig.remote_params
        except Exception as _:
            remote_params = {}
        self.remote_functions[name] = self.remote(func, remote_params)
def wait_for_and_check_cluster_configuration(num_nodes):
    """Check that the cluster's custom resources are properly configured.

    The ith node should have a resource labeled 'i' with quantity 500.

    Args:
        num_nodes: The number of nodes that we expect to be in the cluster.

    Raises:
        RuntimeError: This exception is raised if the cluster is not
            configured properly for this test.
    """
    logger.warning("Waiting for cluster to have %s nodes.", num_nodes)
    while True:
        nodes = ray.nodes()
        if len(nodes) == num_nodes:
            break
        if len(nodes) > num_nodes:
            raise RuntimeError(
                "The cluster has %s nodes, but it should "
                "only have %s.", len(nodes), num_nodes)
    if not ([set(node["Resources"].keys()) for node in ray.nodes()] == [{
            str(i), "CPU"
    } for i in range(num_nodes)]):
        raise RuntimeError(
            "The ith node in the cluster should have a "
            "custom resource called 'i' with quantity "
            "500. The nodes are\n%s", ray.nodes())
    if not ([[
            resource_quantity
            for resource_name, resource_quantity in node["Resources"].items()
            if resource_name != "CPU"
    ] for node in ray.nodes()] == num_nodes * [[500.0]]):
        raise RuntimeError(
            "The ith node in the cluster should have a "
            "custom resource called 'i' with quantity "
            "500. The nodes are\n%s", ray.nodes())
    for node in ray.nodes():
        if ("0" in node["Resources"] and node["ObjectStoreSocketName"] !=
                ray.worker.global_worker.plasma_client.store_socket_name):
            raise RuntimeError("The node that this driver is connected to "
                               "must have a custom resource labeled '0'.")
def test_placement_group_reschedule_when_node_dead(ray_start_cluster,
                                                   connect_to_client):
    @ray.remote(num_cpus=1)
    class Actor(object):
        def __init__(self):
            self.n = 0

        def value(self):
            return self.n

    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.add_node(num_cpus=4)
    cluster.wait_for_nodes()
    ray.init(address=cluster.address, namespace="default_test_namespace")

    # Make sure both head and worker node are alive.
    nodes = ray.nodes()
    assert len(nodes) == 3
    assert nodes[0]["alive"] and nodes[1]["alive"] and nodes[2]["alive"]

    with connect_to_client_or_not(connect_to_client):
        placement_group = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[{
                "CPU": 2
            }, {
                "CPU": 2
            }, {
                "CPU": 2
            }])
        actor_1 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_2 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_3 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_1.value.remote())
        ray.get(actor_2.value.remote())
        ray.get(actor_3.value.remote())

        cluster.remove_node(get_other_nodes(cluster, exclude_head=True)[-1])
        cluster.wait_for_nodes()

        actor_4 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=0,
            lifetime="detached",
        ).remote()
        actor_5 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=1,
            lifetime="detached",
        ).remote()
        actor_6 = Actor.options(
            placement_group=placement_group,
            placement_group_bundle_index=2,
            lifetime="detached",
        ).remote()
        ray.get(actor_4.value.remote())
        ray.get(actor_5.value.remote())
        ray.get(actor_6.value.remote())

        placement_group_assert_no_leak([placement_group])
        ray.shutdown()
def test_metrics_export_end_to_end(ray_start_cluster):
    NUM_NODES = 2
    cluster = ray_start_cluster
    # Add a head node.
    cluster.add_node(
        _internal_config=json.dumps({"metrics_report_interval_ms": 1000}))
    # Add worker nodes.
    [cluster.add_node() for _ in range(NUM_NODES - 1)]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    signal = SignalActor.remote()

    # Generate some metrics around actor & tasks.
    @ray.remote
    def f():
        counter = Count("test_counter", "desc", "unit", [])
        ray.get(signal.send.remote())
        while True:
            counter.record(1, {})
            time.sleep(0.1)

    @ray.remote
    class A:
        async def ready(self):
            pass

        async def ping(self):
            histogram = Histogram("test_histogram", "desc", "unit", [0, 1, 2],
                                  [])
            while True:
                histogram.record(1, {})
                await asyncio.sleep(0.1)

    obj_refs = [f.remote() for _ in range(30)]
    a = A.remote()
    obj_refs.append(a.ping.remote())

    # Make sure both histogram and counter are created
    ray.get(a.ready.remote())
    ray.get(signal.wait.remote())

    node_info_list = ray.nodes()
    prom_addresses = []
    for node_info in node_info_list:
        metrics_export_port = node_info["MetricsExportPort"]
        addr = node_info["NodeManagerAddress"]
        prom_addresses.append(f"{addr}:{metrics_export_port}")

    # Make sure we can ping Prometheus endpoints.
    def fetch_prometheus(prom_addresses):
        components_dict = {}
        metric_names = set()
        for address in prom_addresses:
            if address not in components_dict:
                components_dict[address] = set()
            try:
                response = requests.get(
                    "http://localhost:{}".format(metrics_export_port))
            except requests.exceptions.ConnectionError:
                return components_dict, metric_names

            for line in response.text.split("\n"):
                for family in text_string_to_metric_families(line):
                    for sample in family.samples:
                        # print(sample)
                        metric_names.add(sample.name)
                        if "Component" in sample.labels:
                            components_dict[address].add(
                                sample.labels["Component"])
        return components_dict, metric_names

    def test_prometheus_endpoint():
        # TODO(Simon): Add a gcs_server after fixing metrics.
        components_dict, metric_names = fetch_prometheus(prom_addresses)

        # Raylet should be on every node
        expected_components = {"raylet"}
        components_found = all(
            expected_components.issubset(components)
            for components in components_dict.values())

        # Core worker should be on at least one node
        components_found = components_found and any(
            "core_worker" in components
            for components in components_dict.values())

        expected_metric_names = {"ray_test_counter", "ray_test_histogram_max"}
        metric_names_found = expected_metric_names.issubset(metric_names)

        return components_found and metric_names_found

    try:
        wait_for_condition(
            test_prometheus_endpoint,
            timeout=20,
            retry_interval_ms=1000,  # Yield resource for other processes
        )
    except RuntimeError:
        # This is for debugging when test failed.
        raise RuntimeError(
            "All components were not visible to "
            "prometheus endpoints on time. "
            f"The components are {fetch_prometheus(prom_addresses)}")
    ray.shutdown()
def wait_until_node_dead(node):
    for n in ray.nodes():
        if (n["ObjectStoreSocketName"] ==
                node.address_info["object_store_address"]):
            return not n["Alive"]
    return False
default_accuracy = sum(
    np.array(default_pred) == np.array(y_test)) / len(default_pred)

parameter_grid = {
    "n_estimators": [10, 50],
    "max_depth": [5, 50, 100],
    "ccp_alpha": [0.001, 0.01]
}

tune_search = TuneGridSearchCV(
    RandomForestClassifier(), param_grid=parameter_grid, scoring="accuracy")

start = time.time()
tune_search.fit(x_train, y_train)
end = time.time()

best_score = tune_search.best_score_
best_params = tune_search.best_params_

print('''This cluster consists of
    {} nodes in total
    {} CPU resources in total
'''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))
print(f"Default parameters: {default_params}")
print(f"Default accuracy: {default_accuracy}")
print("Tune GridSearch Fit Time:", end - start)
print(f"GridSearch parameters: {best_params}")
print(f"GridSearch score: {best_score}")
def test_worker_stats(shutdown_only):
    ray.init(num_cpus=1, include_webui=False)
    raylet = ray.nodes()[0]
    num_cpus = raylet["Resources"]["CPU"]
    raylet_address = "{}:{}".format(raylet["NodeManagerAddress"],
                                    ray.nodes()[0]["NodeManagerPort"])

    channel = grpc.insecure_channel(raylet_address)
    stub = node_manager_pb2_grpc.NodeManagerServiceStub(channel)

    def try_get_node_stats(num_retry=5, timeout=2):
        reply = None
        for _ in range(num_retry):
            try:
                reply = stub.GetNodeStats(
                    node_manager_pb2.NodeStatsRequest(), timeout=timeout)
                break
            except grpc.RpcError:
                continue
        assert reply is not None
        return reply

    reply = try_get_node_stats()
    # Check that there is one connected driver.
    drivers = [worker for worker in reply.workers_stats if worker.is_driver]
    assert len(drivers) == 1
    assert os.getpid() == drivers[0].pid

    @ray.remote
    def f():
        ray.show_in_webui("test")
        return os.getpid()

    @ray.remote
    class Actor(object):
        def __init__(self):
            pass

        def f(self):
            ray.show_in_webui("test")
            return os.getpid()

    # Test show_in_webui for remote functions.
    worker_pid = ray.get(f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    # Test show_in_webui for remote actors.
    a = Actor.remote()
    worker_pid = ray.get(a.f.remote())
    reply = try_get_node_stats()
    target_worker_present = False
    for worker in reply.workers_stats:
        stats = worker.core_worker_stats
        if stats.webui_display == "test":
            target_worker_present = True
            assert worker.pid == worker_pid
        else:
            assert stats.webui_display == ""
    assert target_worker_present

    timeout_seconds = 20
    start_time = time.time()
    while True:
        if time.time() - start_time > timeout_seconds:
            raise RayTestTimeoutException(
                "Timed out while waiting for worker processes")

        # Wait for the workers to start.
        if len(reply.workers_stats) < num_cpus + 1:
            time.sleep(1)
            reply = try_get_node_stats()
            continue

        # Check that the rest of the processes are workers, 1 for each CPU.
        assert len(reply.workers_stats) == num_cpus + 1
        views = [view.view_name for view in reply.view_data]
        assert "redis_latency" in views
        assert "local_available_resource" in views
        # Check that all processes are Python.
        pids = [worker.pid for worker in reply.workers_stats]
        processes = [
            p.info["name"] for p in psutil.process_iter(attrs=["pid", "name"])
            if p.info["pid"] in pids
        ]
        for process in processes:
            # TODO(ekl) why does travis/mi end up in the process list
            assert ("python" in process or "ray" in process
                    or "travis" in process)
        break
def test_global_state_api(shutdown_only):

    error_message = ("The ray global state API cannot be used "
                     "before ray.init has been called.")

    with pytest.raises(Exception, match=error_message):
        ray.objects()

    with pytest.raises(Exception, match=error_message):
        ray.actors()

    with pytest.raises(Exception, match=error_message):
        ray.nodes()

    with pytest.raises(Exception, match=error_message):
        ray.jobs()

    ray.init(num_cpus=5, num_gpus=3, resources={"CustomResource": 1})

    assert ray.cluster_resources()["CPU"] == 5
    assert ray.cluster_resources()["GPU"] == 3
    assert ray.cluster_resources()["CustomResource"] == 1

    # A driver/worker creates a temporary object during startup. Although the
    # temporary object is freed immediately, in a rare case, we can still find
    # the object ref in GCS because Raylet removes the object ref from GCS
    # asynchronously.
    # Because we can't control when workers create the temporary objects,
    # we can't assert that `ray.objects()` returns an empty dict. Here we just
    # make sure `ray.objects()` succeeds.
    assert len(ray.objects()) >= 0

    job_id = ray.utils.compute_job_id_from_driver(
        ray.WorkerID(ray.worker.global_worker.worker_id))

    client_table = ray.nodes()
    node_ip_address = ray.worker.global_worker.node_ip_address

    assert len(client_table) == 1
    assert client_table[0]["NodeManagerAddress"] == node_ip_address

    @ray.remote
    class Actor:
        def __init__(self):
            pass

    _ = Actor.remote()  # noqa: F841
    # Wait for actor to be created
    wait_for_num_actors(1)

    actor_table = ray.actors()
    assert len(actor_table) == 1

    actor_info, = actor_table.values()
    assert actor_info["JobID"] == job_id.hex()
    assert "IPAddress" in actor_info["Address"]
    assert "IPAddress" in actor_info["OwnerAddress"]
    assert actor_info["Address"]["Port"] != actor_info["OwnerAddress"]["Port"]

    job_table = ray.jobs()

    assert len(job_table) == 1
    assert job_table[0]["JobID"] == job_id.hex()
    assert job_table[0]["DriverIPAddress"] == node_ip_address
def test_nested(ray_start_cluster, reconstruction_enabled):
    config = {
        "num_heartbeats_timeout": 10,
        "raylet_heartbeat_period_milliseconds": 100,
        "object_timeout_milliseconds": 200,
        "fetch_fail_timeout_milliseconds": 10_000,
    }
    # Workaround to reset the config to the default value.
    if not reconstruction_enabled:
        config["lineage_pinning_enabled"] = False

    cluster = ray_start_cluster
    # Head node with no resources.
    cluster.add_node(
        num_cpus=0,
        _system_config=config,
        enable_object_reconstruction=reconstruction_enabled)
    ray.init(address=cluster.address)

    done_signal = SignalActor.remote()
    exit_signal = SignalActor.remote()
    ray.get(done_signal.wait.remote(should_wait=False))
    ray.get(exit_signal.wait.remote(should_wait=False))

    # Node to place the initial object.
    node_to_kill = cluster.add_node(num_cpus=1, object_store_memory=10**8)
    cluster.wait_for_nodes()

    @ray.remote
    def dependent_task(x):
        return

    @ray.remote
    def large_object():
        return np.zeros(10**7, dtype=np.uint8)

    @ray.remote
    def nested(done_signal, exit_signal):
        ref = ray.put(np.zeros(10**7, dtype=np.uint8))
        # Flush object store.
        for _ in range(20):
            ray.put(np.zeros(10**7, dtype=np.uint8))
        dep = dependent_task.options(resources={"node": 1}).remote(ref)
        ray.get(done_signal.send.remote(clear=True))
        ray.get(dep)
        return ray.get(ref)

    ref = nested.remote(done_signal, exit_signal)
    # Wait for task to get scheduled on the node to kill.
    ray.get(done_signal.wait.remote())
    # Wait for ray.put object to get transferred to the other node.
    cluster.add_node(
        num_cpus=2, resources={"node": 10}, object_store_memory=10**8)
    ray.get(dependent_task.remote(ref))

    # Destroy the task's output.
    cluster.remove_node(node_to_kill, allow_graceful=False)
    wait_for_condition(
        lambda: not all(node["Alive"] for node in ray.nodes()), timeout=10)

    if reconstruction_enabled:
        # NOTE(swang): This is supposed to work because nested doesn't
        # actually return any ObjectRefs. However, currently the ray.put in
        # `nested` fails because the object already exists with a different
        # owner. See https://github.com/ray-project/ray/issues/20713.
        try:
            ray.get(ref, timeout=60)
        except ray.exceptions.RayTaskError as e:
            assert isinstance(e.cause, ray.exceptions.ObjectFetchTimedOutError)
    else:
        with pytest.raises(ray.exceptions.ObjectLostError):
            ray.get(ref, timeout=60)
args = parser.parse_args()

tf = try_import_tf()

'''
Custom environment
'''
register_env("myenv", lambda config: shape_optimization(config))

if __name__ == "__main__":
    zero_time = time()
    # print(args.ray_address)
    ray.init(redis_address=args.ray_address)

    with open('Resources.txt', 'w') as f:
        f.write('Nodes used: ' + str(len(ray.nodes())) + '\n')
        f.write('Available resources:' + '\n')
        f.write(str(ray.available_resources()) + '\n')
        f.flush()
        os.fsync(f)
        f.close()

    connect_time = time()
    register_time = time()

    # resources = ray.get_resource_ids()
    # cpus = [v[0] for v in resources['CPU']]

    # config = appo.DEFAULT_CONFIG.copy()
    config = ppo.DEFAULT_CONFIG.copy()
    # config = a3c.DEFAULT_CONFIG.copy()
def num_alive_nodes():
    n = 0
    for node in ray.nodes():
        if node["Alive"]:
            n += 1
    return n
def test_logs_stream_and_tail(ray_start_with_dashboard):
    assert (
        wait_until_server_available(ray_start_with_dashboard["webui_url"]) is True
    )
    webui_url = ray_start_with_dashboard["webui_url"]
    webui_url = format_web_url(webui_url)
    node_id = list_nodes()[0]["node_id"]

    def verify_basic():
        stream_response = requests.get(
            webui_url
            + f"/api/v0/logs/file?node_id={node_id}&filename=gcs_server.out&lines=5",
            stream=True,
        )
        if stream_response.status_code != 200:
            raise ValueError(stream_response.content.decode("utf-8"))
        lines = []
        for line in stream_response.iter_lines():
            lines.append(line.decode("utf-8"))
        return len(lines) == 5 or len(lines) == 6

    wait_for_condition(verify_basic)

    @ray.remote
    class Actor:
        def write_log(self, strings):
            for s in strings:
                print(s)

        def getpid(self):
            return os.getpid()

    test_log_text = "test_log_text_日志_{}"
    actor = Actor.remote()
    ray.get(actor.write_log.remote([test_log_text.format("XXXXXX")]))

    # Test stream and fetching by actor id
    stream_response = requests.get(
        webui_url
        + "/api/v0/logs/stream?&lines=2"
        + f"&actor_id={actor._ray_actor_id.hex()}",
        stream=True,
    )
    if stream_response.status_code != 200:
        raise ValueError(stream_response.content.decode("utf-8"))
    stream_iterator = stream_response.iter_content(chunk_size=None)
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert (
        next(stream_iterator).decode("utf-8")
        == "1:actor_name:Actor\n" + test_log_text.format("XXXXXX") + "\n"
    )

    streamed_string = ""
    for i in range(5):
        strings = []
        for j in range(100):
            strings.append(test_log_text.format(f"{100*i + j:06d}"))
        ray.get(actor.write_log.remote(strings))
        string = ""
        for s in strings:
            string += s + "\n"
        streamed_string += string
        # NOTE: Prefix 1 indicates the stream has succeeded.
        assert next(stream_iterator).decode("utf-8") == "1" + string
    del stream_response

    # Test tailing log by actor id
    LINES = 150
    file_response = requests.get(
        webui_url
        + f"/api/v0/logs/file?&lines={LINES}"
        + "&actor_id="
        + actor._ray_actor_id.hex(),
    ).content.decode("utf-8")
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert file_response == "1" + "\n".join(
        streamed_string.split("\n")[-(LINES + 1) :]
    )

    # Test query by pid & node_ip instead of actor id.
    node_ip = list(ray.nodes())[0]["NodeManagerAddress"]
    pid = ray.get(actor.getpid.remote())
    file_response = requests.get(
        webui_url
        + f"/api/v0/logs/file?node_ip={node_ip}&lines={LINES}"
        + f"&pid={pid}",
    ).content.decode("utf-8")
    # NOTE: Prefix 1 indicates the stream has succeeded.
    assert file_response == "1" + "\n".join(
        streamed_string.split("\n")[-(LINES + 1) :]
    )
def test_dynamic_res_concurrent_res_delete(ray_start_cluster):
    # This test makes sure resource gets deleted correctly when a task has
    # already acquired the resource
    cluster = ray_start_cluster

    res_name = "test_res"
    res_capacity = 5
    num_nodes = 5
    TIMEOUT_DURATION = 1

    # Create an object ID to have the task wait on
    WAIT_OBJECT_ID_STR = ("a" * 20).encode("ascii")

    # Create an object ID to signal that the task is running
    TASK_RUNNING_OBJECT_ID_STR = ("b" * 20).encode("ascii")

    for i in range(num_nodes):
        cluster.add_node()

    ray.init(redis_address=cluster.redis_address)

    clientids = [client["ClientID"] for client in ray.nodes()]
    target_clientid = clientids[1]

    @ray.remote
    def set_res(resource_name, resource_capacity, res_client_id):
        ray.experimental.set_resource(
            resource_name, resource_capacity, client_id=res_client_id)

    @ray.remote
    def delete_res(resource_name, res_client_id):
        ray.experimental.set_resource(
            resource_name, 0, client_id=res_client_id)

    # Create the resource on node 1
    ray.get(set_res.remote(res_name, res_capacity, target_clientid))
    assert ray.cluster_resources()[res_name] == res_capacity

    # Task to hold the resource till the driver signals to finish
    @ray.remote
    def wait_func(running_oid, wait_oid):
        # Signal that the task is running
        ray.worker.global_worker.put_object(ray.ObjectID(running_oid), 1)
        # Make the task wait till signalled by driver
        ray.get(ray.ObjectID(wait_oid))

    @ray.remote
    def test_func():
        return 1

    # Launch the task with resource requirement of 4, thus the new available
    # capacity becomes 1
    task = wait_func._remote(
        args=[TASK_RUNNING_OBJECT_ID_STR, WAIT_OBJECT_ID_STR],
        resources={res_name: 4})
    # Wait till wait_func is launched before updating resource
    ray.get(ray.ObjectID(TASK_RUNNING_OBJECT_ID_STR))

    # Delete the resource
    ray.get(delete_res.remote(res_name, target_clientid))

    # Signal task to complete
    ray.worker.global_worker.put_object(ray.ObjectID(WAIT_OBJECT_ID_STR), 1)
    ray.get(task)

    # Check if scheduler state is consistent by launching a task requiring
    # the deleted resource. This should not execute.
    task_2 = test_func._remote(args=[], resources={res_name: 1})
    # This should be infeasible
    successful, unsuccessful = ray.wait([task_2], timeout=TIMEOUT_DURATION)
    assert unsuccessful  # The task did not complete because it's infeasible
    assert res_name not in ray.available_resources()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

ray.init(address="localhost:6379")

# These numbers need to correspond with the autoscaler config file.
# The number of remote nodes in the autoscaler should upper bound
# these because sometimes nodes fail to update.
num_remote_nodes = 100
head_node_cpus = 2
num_remote_cpus = num_remote_nodes * head_node_cpus

# Wait until the expected number of nodes have joined the cluster.
while True:
    num_nodes = len(ray.nodes())
    logger.info("Waiting for nodes {}/{}".format(num_nodes,
                                                 num_remote_nodes + 1))
    if num_nodes >= num_remote_nodes + 1:
        break
    time.sleep(5)
logger.info("Nodes have all joined. There are %s resources.",
            ray.cluster_resources())


# Require 1 GPU to force the tasks to be on remote machines.
@ray.remote(num_gpus=1)
def f(size, *xs):
    return np.ones(size, dtype=np.uint8)