def test_dashboard_agent_restart(set_agent_failure_env_var,
                                 ray_start_cluster_head, error_pubsub,
                                 log_pubsub):
    """Check that repeated dashboard agent failures do not spam the driver.

    The test expects exactly one ``DASHBOARD_AGENT_DIED_ERROR`` message on
    the error channel, waits until ``search_agents`` returns ``None`` for the
    cluster, and then verifies that no log batch from a non-autoscaler pid
    shows up within 5 seconds.
    """
    # NOTE(review): the agent failure is presumably injected by the
    # ``set_agent_failure_env_var`` fixture -- confirm against its definition.
    died_errors = get_error_message(
        error_pubsub,
        1,
        ray_constants.DASHBOARD_AGENT_DIED_ERROR,
        timeout=10)
    assert len(died_errors) == 1
    assert all(("There are 3 possible problems "
                "if you see this error." in err.error_message)
               for err in died_errors)

    # The agent must not be restarted any more once it has given up.
    cluster = ray_start_cluster_head

    def agent_is_gone():
        return search_agents(cluster) is None

    wait_for_condition(agent_is_gone)

    # Any batch that is not from the autoscaler counts as spam; none may
    # arrive during the 5 second window.
    spam_batches = get_log_batch(
        log_pubsub,
        1,
        timeout=5,
        matcher=lambda log_batch: log_batch["pid"] != "autoscaler")
    assert len(spam_batches) == 0, (
        "There are spammy logs during Ray agent restart process. "
        f"Logs: {spam_batches}")
def test_gcs_server_failiure_report(ray_start_regular, log_pubsub):
    """Check that a GCS server killed by SIGBUS reports logs to the driver.

    Sends ``SIGBUS`` to the GCS server process and expects the first log
    batch received within 30 seconds to carry the pid ``"gcs_server"``.

    NOTE(review): a function with this exact name is defined again later in
    this source; if both live in the same module the later one shadows this
    definition -- confirm which one is intended to run.
    """
    # Look up the GCS server process so that it can be signalled.
    node_processes = ray.worker._global_node.all_processes
    gcs_server_pid = node_processes["gcs_server"][0].process.pid

    os.kill(gcs_server_pid, signal.SIGBUS)

    # Wait up to 30 seconds for the first batch of logs.
    received = get_log_batch(log_pubsub, 1, timeout=30)
    assert len(received) == 1
    first_batch = received[0]
    assert first_batch["pid"] == "gcs_server", received
def test_runtime_env_logging_to_driver(ray_start_regular_shared, log_pubsub):
    """Check that a ``"runtime_env"`` log batch reaches the driver.

    Runs a no-op remote task whose runtime environment pins a ``requests``
    version via pip, then expects at least one log batch whose pid is
    ``"runtime_env"`` within 5 seconds.
    """
    pip_requirement = f"requests=={REQUEST_VERSIONS[0]}"

    @ray.remote(runtime_env={"pip": [pip_requirement]})
    def func():
        pass

    ray.get(func.remote())

    # Only batches tagged with the "runtime_env" pid are of interest.
    def is_runtime_env_batch(log_batch):
        return log_batch["pid"] == "runtime_env"

    runtime_env_batches = get_log_batch(
        log_pubsub, 1, timeout=5, matcher=is_runtime_env_batch)
    assert len(runtime_env_batches) > 0
def test_raylet_node_manager_server_failure(ray_start_cluster_head,
                                            log_pubsub):
    """Check that a raylet grpc server start failure is logged to the driver.

    A node is added with its node manager port set to the port parsed from
    the cluster address, which is expected to make ``add_node`` raise. The
    test then waits up to 10 seconds for a ``"raylet"`` log batch containing
    the grpc start failure message.
    """
    cluster = ray_start_cluster_head
    address_parts = cluster.address.split(":")
    redis_port = int(address_parts[1])

    # Reusing the redis port makes the node manager grpc server fail to
    # start.
    with pytest.raises(Exception):
        cluster.add_node(wait=False, node_manager_port=redis_port)

    def is_grpc_failure_batch(log_batch):
        if log_batch["pid"] != "raylet":
            return False
        return any("Failed to start the grpc server." in line
                   for line in log_batch["lines"])

    # Wait for at most 10 seconds.
    failure_batches = get_log_batch(
        log_pubsub, 1, timeout=10, matcher=is_grpc_failure_batch)
    assert len(failure_batches) > 0
def test_metrics_override_shouldnt_warn(ray_start_regular, log_pubsub):
    """Check that re-creating a metric with the same name logs no warning.

    A remote task builds two ``Counter`` objects named ``"num_count"`` and
    increments both. No log batch containing "Attempt to register measure"
    may arrive within 5 seconds.

    See https://github.com/ray-project/ray/issues/12859
    """

    @ray.remote
    def override():
        first = Counter("num_count", description="")
        second = Counter("num_count", description="")
        first.inc(1)
        second.inc(1)

    ray.get(override.remote())

    # Scan every line of each batch for the duplicate-registration warning.
    def has_register_warning(log_batch):
        return any("Attempt to register measure" in line
                   for line in log_batch["lines"])

    warning_batches = get_log_batch(
        log_pubsub, 1, timeout=5, matcher=has_register_warning)
    assert len(warning_batches) == 0, warning_batches
def test_gcs_server_failiure_report(ray_start_regular, log_pubsub):
    """Check that a killed GCS server exits and reports logs to the driver.

    The GCS server process is killed (``SIGBUS``, or signal value 9 on
    Windows). After waiting up to 30 seconds for a log batch, the process
    must have exited; on non-Windows platforms exactly one batch with pid
    ``"gcs_server"`` must also have been received.
    """
    on_windows = sys.platform == "win32"

    # Look up the GCS server process so that it can be signalled.
    node_processes = ray.worker._global_node.all_processes
    gcs_server_process = node_processes["gcs_server"][0].process
    gcs_server_pid = gcs_server_process.pid

    # TODO(mwtian): make sure logs are delivered after GCS is restarted.
    # The conditional expression is lazy, so signal.SIGBUS is only looked up
    # when not running on Windows.
    kill_signal = 9 if on_windows else signal.SIGBUS
    os.kill(gcs_server_pid, kill_signal)

    # Wait up to 30 seconds for the first batch of logs.
    received = get_log_batch(log_pubsub, 1, timeout=30)
    assert gcs_server_process.poll() is not None

    if not on_windows:
        # Windows signal handler does not run when process is terminated,
        # so the log assertions only apply elsewhere.
        assert len(received) == 1
        assert received[0]["pid"] == "gcs_server", received
def submit_job():
    """Run one remote task as a driver and check its log batch has a pid.

    Connects a driver to ``cluster`` (a free variable resolved from the
    enclosing scope), subscribes to the log channel, runs the remote
    function ``f`` and asserts that the log batch for task ``"f"`` carries a
    non-``None`` pid. The driver is always disconnected on exit, even when
    an assertion fails, so a failure here does not leave this process
    connected to the cluster.

    Raises:
        AssertionError: if no log batch for task ``"f"`` is received or its
            pid is ``None``.
    """
    # Connect a driver to the Ray cluster.
    ray.init(address=cluster.address, ignore_reinit_error=True)
    try:
        log_subscriber = init_log_pubsub()
        # It always prints the monitor messages first; consume one and
        # discard it, since only the task's own batch is checked below.
        get_log_message(log_subscriber, 1)

        @ray.remote
        def f():
            print("remote func")

        ray.get(f.remote())

        def matcher(log_batch):
            return log_batch["task_name"] == "f"

        logs = get_log_batch(log_subscriber, 1, matcher=matcher)
        # Fail with a clear message instead of an IndexError below.
        assert logs, "No log batch received for remote task 'f'."
        # It should log with the pid of the hex job id instead of None.
        assert logs[0]["pid"] is not None
    finally:
        # Disconnect even if an assertion above failed.
        ray.shutdown()