def assert_no_system_failure(p, timeout):
    """Fail if any collected log entry hints at a crash or a check failure.

    Log entries are gathered through ``get_log_message(p, timeout=timeout)``
    and every entry is tested with ``in`` for the markers ``"SIG"`` and
    ``"Check failed"``.

    Args:
        p: Log subscriber handle forwarded as-is to ``get_log_message``.
        timeout: Forwarded to ``get_log_message`` as its ``timeout`` keyword
            (presumably seconds -- confirm against the helper).

    Raises:
        AssertionError: If either marker is found in an entry.
    """
    # NOTE(review): ``in`` is a substring test if entries are strings, but an
    # element-membership test if entries are lists of lines -- confirm which
    # shape get_log_message returns here.
    for entry in get_log_message(p, timeout=timeout):
        assert "SIG" not in entry, "There's the segfault or SIGBART reported."
        assert "Check failed" not in entry, "There's the check failure reported."
def test_log_monitor_backpressure(ray_start_cluster, monkeypatch):
    """Check that actor output is still published under log-monitor backpressure.

    Two environment variables are set through ``monkeypatch`` before the node
    is added. Then, twice in a row, a fresh actor prints a marker string and
    the test asserts that (a) the marker is the first line of the first
    matching batch and (b) at least ``update_interval`` whole seconds passed
    between creating the actor and receiving the line.

    NOTE(review): another function with this exact name is defined later in
    this file; if both live in the same module the later one shadows this
    one -- confirm.
    """
    update_interval = 3
    expected_str = "abcxyz"

    monkeypatch.setenv("LOG_NAME_UPDATE_INTERVAL_S", str(update_interval))
    # Intentionally set low to trigger the backpressure condition.
    monkeypatch.setenv("RAY_LOG_MONITOR_MANY_FILES_THRESHOLD", "1")

    def is_expected_line(line):
        return line == expected_str

    # Test log monitor still works with backpressure.
    cluster = ray_start_cluster
    cluster.add_node(num_cpus=4)
    # Connect a driver to the Ray cluster.
    ray.init(address=cluster.address)
    p = init_log_pubsub()

    @ray.remote
    class Actor:
        def print(self):
            print(expected_str)

    # Run the same scenario twice, each time with a brand-new actor.
    # Since the log file update is delayed, it should take more than
    # update_interval to publish a message for a new worker.
    for _ in range(2):
        started = datetime.now()
        a = Actor.remote()
        ray.get(a.print.remote())
        logs = get_log_message(p, 1, matcher=is_expected_line)
        assert logs[0][0] == expected_str
        elapsed = datetime.now() - started
        assert elapsed.seconds >= update_interval
def test_log_monitor_backpressure(ray_start_cluster):
    """Check that actor output is still published under log-monitor backpressure.

    Two environment variables are set before the node is added. Then, twice
    in a row, a fresh actor prints a marker string and the test asserts that
    the next received log entry equals the marker and that at least
    ``update_interval`` whole seconds passed since the actor was created.

    The environment variables are written to ``os.environ`` directly, so
    their previous values are restored in a ``finally`` block; otherwise the
    overrides would leak into every test that runs afterwards in the same
    process.

    NOTE(review): an earlier function with this exact name is defined in this
    file; if both live in the same module, this later definition is the one
    the name ends up bound to -- confirm that is intended.
    """
    update_interval = 3
    env_overrides = {
        "LOG_NAME_UPDATE_INTERVAL_S": str(update_interval),
        # Intentionally set low to trigger the backpressure condition.
        "RAY_LOG_MONITOR_MANY_FILES_THRESHOLD": "1",
    }
    # Remember the prior state (None means "was not set") for the restore.
    previous_env = {key: os.environ.get(key) for key in env_overrides}
    os.environ.update(env_overrides)
    try:
        expected_str = "abc"

        # Test log monitor still works with backpressure.
        cluster = ray_start_cluster
        cluster.add_node(num_cpus=4)
        # Connect a driver to the Ray cluster.
        ray.init(address=cluster.address)
        p = init_log_pubsub()
        # It always prints the monitor messages; consume one entry so it is
        # not mistaken for the actor output below.
        get_log_message(p, 1)

        @ray.remote
        class Actor:
            def print(self):
                print(expected_str)

        # Run the same scenario twice, each time with a brand-new actor.
        # Since the log file update is delayed, it should take more than
        # update_interval to publish a message for a new worker.
        for _ in range(2):
            now = datetime.now()
            a = Actor.remote()
            a.print.remote()
            logs = get_log_message(p, 1)
            assert logs[0] == expected_str
            assert (datetime.now() - now).seconds >= update_interval
    finally:
        # Put back exactly the two keys this test touched.
        for key, old_value in previous_env.items():
            if old_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = old_value
def test_ignore_windows_access_violation(ray_start_regular_shared):
    """Check that only the "done" line is received for the current job.

    One task prints a "Windows fatal exception: access violation" line and a
    second task, which takes the first task's result as its argument, prints
    "done". The test then asks for up to three log batches of the current
    job within the given timeout and asserts that exactly one came back,
    whose first line is "done".
    """

    @ray.remote
    def print_msg():
        print("Windows fatal exception: access violation\n")

    @ray.remote
    def print_after(_obj):
        print("done")

    # Subscribe before submitting anything so no output is missed.
    p = init_log_pubsub()
    print_after.remote(print_msg.remote())

    current_job_id = ray.get_runtime_context().job_id.hex()
    msgs = get_log_message(p, num=3, timeout=1, job_id=current_job_id)

    assert len(msgs) == 1, msgs
    first_batch = msgs[0]
    assert first_batch[0] == "done"
def submit_job():
    """Run a one-task job as a driver and check its log batch carries a pid.

    Connects to the address of ``cluster`` (a name taken from the enclosing
    scope), runs a remote function ``f`` that prints one line, waits for the
    log batch whose ``task_name`` is ``"f"`` and asserts that its ``pid``
    entry is not ``None``.

    The driver is always disconnected with ``ray.shutdown()``, including when
    the task or an assertion fails, so a failure here does not leave a live
    connection behind for whatever runs next.

    Raises:
        AssertionError: If the matched batch has ``pid`` set to ``None``.
    """
    # Connect a driver to the Ray cluster.
    ray.init(address=cluster.address, ignore_reinit_error=True)
    try:
        p = init_log_pubsub()
        # It always prints the monitor messages; consume one entry first.
        get_log_message(p, 1)

        @ray.remote
        def f():
            print("remote func")

        ray.get(f.remote())

        def matcher(log_batch):
            # The remote function's name is what identifies its batch.
            return log_batch["task_name"] == "f"

        logs = get_log_batch(p, 1, matcher=matcher)
        # The batch should be published with a real pid value, not None.
        assert logs[0]["pid"] is not None
    finally:
        ray.shutdown()
def test_threaded_actor_integration_test_stress(ray_start_cluster_head,
                                                log_pubsub, error_pubsub):
    """This is a sanity test that checks threaded actors are working with
    the nightly stress test.

    Flow:
      1. Add ``num_remote_nodes`` nodes with 2 CPUs each to the cluster.
      2. Create ``num_parents`` Parent actors; each one creates
         ``num_children`` Child actors. Both kinds are created with
         ``max_concurrency`` set.
      3. For 10 iterations, ping every parent (which pings its children),
         then randomly replace one parent. Iteration times are recorded and
         printed.
      4. Drop the parents, create a new set and ping them once more.
      5. Scan the log messages for "SIG" / "Check failed" and the error
         messages for the resource deadlock warning text.

    Args:
        ray_start_cluster_head: Cluster object; nodes are added to it.
        log_pubsub: Subscriber handle passed to
            ``test_utils.get_log_message``.
        error_pubsub: Subscriber handle passed to
            ``test_utils.get_error_message``.
    """
    cluster = ray_start_cluster_head
    p = log_pubsub
    e = error_pubsub

    # Prepare the config
    num_remote_nodes = 4
    num_parents = 6
    num_children = 6
    death_probability = 0.95
    max_concurrency = 10

    for _ in range(num_remote_nodes):
        cluster.add_node(num_cpus=2)

    @ray.remote
    class Child(object):
        def __init__(self, death_probability):
            self.death_probability = death_probability

        def ping(self):
            # Exit process with some probability.
            # The exit fires when the random draw is *greater* than
            # death_probability, i.e. with the 0.95 used here only for a
            # small share of the pings.
            exit_chance = np.random.rand()
            if exit_chance > self.death_probability:
                sys.exit(-1)

    @ray.remote
    class Parent(object):
        def __init__(self, num_children, death_probability=0.95):
            self.death_probability = death_probability
            # max_concurrency is captured from the enclosing test function.
            self.children = [
                Child.options(
                    max_concurrency=max_concurrency).remote(death_probability)
                for _ in range(num_children)
            ]

        def ping(self, num_pings):
            # Submit num_pings pings to every child, then wait for all of
            # them at once.
            children_outputs = []
            for _ in range(num_pings):
                children_outputs += [
                    child.ping.remote() for child in self.children
                ]
            try:
                ray.get(children_outputs)
            except Exception:
                # Replace the children if one of them died.
                # Re-running __init__ rebuilds the whole child list, not
                # just the child that failed.
                self.__init__(len(self.children), self.death_probability)

        def kill(self):
            # Clean up children.
            ray.get(
                [child.__ray_terminate__.remote() for child in self.children])

    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]

    start = time.time()
    loop_times = []
    for _ in range(10):
        loop_start = time.time()
        ray.get([parent.ping.remote(10) for parent in parents])

        # Kill a parent actor with some probability.
        # Same comparison as in Child.ping: a replacement happens only when
        # the draw exceeds death_probability.
        exit_chance = np.random.rand()
        if exit_chance > death_probability:
            parent_index = np.random.randint(len(parents))
            # kill() is submitted without waiting for its result; the slot
            # is immediately refilled with a new Parent.
            parents[parent_index].kill.remote()
            parents[parent_index] = Parent.options(
                max_concurrency=max_concurrency).remote(
                    num_children, death_probability)
        loop_times.append(time.time() - loop_start)

    # The result dict is only printed; nothing below asserts on it.
    result = {}
    print("Finished in: {}s".format(time.time() - start))
    print("Average iteration time: {}s".format(
        sum(loop_times) / len(loop_times)))
    print("Max iteration time: {}s".format(max(loop_times)))
    print("Min iteration time: {}s".format(min(loop_times)))
    result["total_time"] = time.time() - start
    result["avg_iteration_time"] = sum(loop_times) / len(loop_times)
    result["max_iteration_time"] = max(loop_times)
    result["min_iteration_time"] = min(loop_times)
    result["success"] = 1
    print(result)
    # NOTE(review): ensure_cpu_returned is defined outside this block;
    # presumably it waits until 10 CPUs are reported as available again --
    # confirm against the helper.
    ensure_cpu_returned(10)
    del parents

    # Make sure parents are still scheduleable.
    parents = [
        Parent.options(max_concurrency=max_concurrency).remote(
            num_children, death_probability) for _ in range(num_parents)
    ]
    ray.get([parent.ping.remote(10) for parent in parents])
    # The bare string literal below is a no-op expression used as a comment.
    """ Make sure there are not SIGSEGV, SIGBART, or other odd check failures. """
    # Get all logs for 20 seconds.
    logs = test_utils.get_log_message(p, timeout=20)
    for log in logs:
        # NOTE(review): these are substring checks only if each entry is a
        # string; for list entries ``in`` tests element membership instead --
        # confirm the shape returned by get_log_message.
        assert "SIG" not in log, "There's the segfault or SIGBART reported."
        assert "Check failed" not in log, (
            "There's the check failure reported.")

    # Get error messages for 10 seconds.
    errors = test_utils.get_error_message(e, timeout=10)
    for error in errors:
        print(error)
        assert "You can ignore this message if" not in error.error_message, (
            "Resource deadlock warning shouldn't be printed, but it did.")