def test_actor_scope_or_intentionally_killed_message(ray_start_regular):
    """Check that no error is recorded for a dropped or terminated actor.

    Two actors are created under the same local name, the surviving handle
    is asked to terminate, and after a short pause ``ray.errors()`` must
    still be empty.

    Args:
        ray_start_regular: Fixture argument; it is not referenced in the
            body (presumably it starts a Ray cluster -- confirm in conftest).
    """

    @ray.remote
    class Actor(object):
        pass

    # The name is bound twice on purpose: the second assignment discards
    # the only local reference to the first actor handle.
    handle = Actor.remote()
    handle = Actor.remote()
    handle.__ray_terminate__.remote()

    # Leave a moment for any (unexpected) error to be reported.
    time.sleep(1)

    assert len(ray.errors()) == 0, (
        "Should not have propogated an error - {}".format(ray.errors()))
def test_fail_importing_actor(ray_start_regular):
    """Check the errors pushed when an actor class cannot be imported.

    A throwaway module is written to a temporary file, imported locally and
    captured by an actor class. The test then expects one
    ``REGISTER_ACTOR_PUSH_ERROR`` and two ``TASK_PUSH_ERROR`` entries (one
    for ``__init__``, one for ``get_val``), and that fetching the result of
    ``get_val`` raises instead of hanging.

    The temporary file and the ``sys.path`` entry are released through
    ``with`` / ``finally`` so a failing assertion cannot leak them into
    later tests.

    Args:
        ray_start_regular: Fixture argument; it is not referenced in the
            body (presumably it starts a Ray cluster -- confirm in conftest).
    """
    # Create the contents of a temporary Python file.
    temporary_python_file = """
def temporary_helper_function():
    return 1
"""

    # The context manager closes the file even when an assertion fails.
    with tempfile.NamedTemporaryFile(suffix=".py") as f:
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        try:
            module = __import__(module_name)

            # Define an actor that closes over this temporary module. This
            # should fail when it is unpickled.
            @ray.remote
            class Foo(object):
                def __init__(self):
                    # NOTE(review): the module defines
                    # temporary_helper_function, not temporary_python_file;
                    # the test expects this method never to execute, so the
                    # name is left as written.
                    self.x = module.temporary_python_file()

                def get_val(self):
                    return 1

            # There should be no errors yet.
            assert len(ray.errors()) == 0

            # Create an actor.
            foo = Foo.remote()

            # Wait for the error to arrive.
            wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
            errors = relevant_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR)
            assert "No module named" in errors[0]["message"]

            # Wait for the error from when the __init__ tries to run.
            wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
            errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
            assert ("failed to be imported, and so cannot execute this method"
                    in errors[0]["message"])

            # Check that if we try to get the function it throws an
            # exception and does not hang.
            with pytest.raises(Exception):
                ray.get(foo.get_val.remote())

            # Wait for the error from the call to get_val.
            wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
            errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
            assert ("failed to be imported, and so cannot execute this method"
                    in errors[1]["message"])
        finally:
            # Clean up the junk we added to sys.path, on success or failure.
            sys.path.pop(-1)
def get_node_stats(self) -> Dict:
    """Assemble a dictionary describing the currently known nodes.

    While holding ``self._node_stats_lock`` the method purges outdated
    entries, orders the remaining per-node stats by their ``"boot_time"``
    value and gathers the other report sections.

    Returns:
        A dict with the keys ``"totals"``, ``"tasks"``, ``"clients"``,
        ``"logs"`` and ``"errors"``.
    """
    with self._node_stats_lock:
        self.purge_outdated_stats()
        clients = sorted(
            self._node_stats.values(), key=itemgetter("boot_time"))
        # Same evaluation order as the report keys: totals, then tasks.
        totals = self.calculate_totals()
        tasks = self.calculate_tasks()
        return {
            "totals": totals,
            "tasks": tasks,
            "clients": clients,
            "logs": self._logs,
            "errors": ray.errors(all_jobs=True),
        }
def wait_for_errors(error_check):
    """Poll ``ray.errors()`` until ``error_check`` accepts the result.

    The error list is fetched at most 100 times, sleeping one second after
    every rejected attempt.

    Args:
        error_check: Callable taking the current error list and returning
            a truthy value once enough errors have arrived.

    Returns:
        The error list from the last poll.

    Raises:
        AssertionError: If ``error_check`` still rejects the last list.
    """
    # Wait for errors from all the nondeterministic tasks.
    errors = []
    for _ in range(100):
        errors = ray.errors()
        if error_check(errors):
            break
        time.sleep(1)

    # Make sure that enough errors came through.
    assert error_check(errors)
    return errors
def flat_errors():
    """Return the errors of all jobs merged into a single flat list.

    ``ray.errors(all_jobs=True)`` is treated as a mapping whose values are
    per-job iterables of errors; the values are concatenated in the
    mapping's iteration order.
    """
    return [
        error
        for job_errors in ray.errors(all_jobs=True).values()
        for error in job_errors
    ]
def test_error_isolation(call_ray_start):
    """Check that each driver only sees the errors of its own tasks.

    This driver raises one error and waits until it is visible. A second
    driver, started from a script, must begin with no errors, raise its own
    error and see exactly that one. Afterwards this driver must still see
    only its original error.

    Args:
        call_ray_start: Value passed to ``ray.init`` as ``redis_address``.
    """
    redis_address = call_ray_start
    # Connect a driver to the Ray cluster.
    ray.init(redis_address=redis_address)

    # There shouldn't be any errors yet.
    assert len(ray.errors()) == 0

    local_error_text = "error_string1"
    remote_error_text = "error_string2"

    @ray.remote
    def f():
        raise Exception(local_error_text)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    while len(ray.errors()) != 1:
        time.sleep(0.1)
        print("Waiting for error to appear.")

    # Make sure we got the error.
    assert len(ray.errors()) == 1
    assert local_error_text in ray.errors()[0]["message"]

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    driver_script = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.errors()) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

while len(ray.errors()) != 1:
    print(len(ray.errors()))
    time.sleep(0.1)
assert len(ray.errors()) == 1

assert "{}" in ray.errors()[0]["message"]

print("success")
""".format(redis_address, remote_error_text, remote_error_text)

    driver_output = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in driver_output

    # Make sure that the other error message doesn't show up for this
    # driver.
    assert len(ray.errors()) == 1
    assert local_error_text in ray.errors()[0]["message"]
def error_messages(self, job_id=None):
    """Deprecated accessor that forwards to ``ray.errors``.

    Logs a deprecation warning on every call.

    Args:
        job_id: Forwarded unchanged as the ``job_id`` keyword of
            ``ray.errors``.

    Returns:
        Whatever ``ray.errors(job_id=job_id)`` returns.
    """
    deprecation_notice = (
        "ray.global_state.error_messages() is deprecated and will be removed "
        "in a subsequent release. Use ray.errors() instead.")
    logger.warning(deprecation_notice)
    return ray.errors(job_id=job_id)
def relevant_errors(error_type):
    """Return the entries of ``ray.errors()`` whose type matches.

    Args:
        error_type: Value compared for equality against each entry's
            ``"type"`` field.

    Returns:
        A list of the matching entries, in their original order.
    """
    matching = []
    for info in ray.errors():
        if info["type"] == error_type:
            matching.append(info)
    return matching
# NOTE(review): this fragment arrived with its indentation lost. It is laid
# out here as top-level script statements; it may originally have been the
# body of main() -- confirm against the full file before applying.

# Send log records of level DEBUG and above to app.log.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='app.log',
    level=logging.DEBUG)
print(os.getpid())
print(os.getppid())

# Only start Ray if it is not already running in this process.
if not ray.is_initialized():
    ray.init(include_webui=True)

files = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M']
db.setup()
print(os.getpid())
print(os.getppid())

with ray.profile('Event'):
    for i in range(10):
        # Random pause of 0-4 seconds before each task.
        time.sleep(randint(0, 4))
        # The original handler only re-raised the exception, and the
        # print(e.message) after the raise could never execute. Any error
        # still propagates; 'finally' is printed either way.
        try:
            ray.get(worker.remote(i))
        finally:
            print('finally')
#results = [worker.remote(file) for file in (files)]
#ray.get(results)

if __name__ == '__main__':
    main()
    ray.timeline(filename='timeline.dump')
    print(ray.errors())
def error_messages(self, all_jobs=False):
    """Deprecated accessor that forwards to ``ray.errors``.

    A deprecation warning is logged on every call.

    Args:
        all_jobs: Forwarded unchanged as the ``all_jobs`` keyword of
            ``ray.errors``.

    Returns:
        Whatever ``ray.errors(all_jobs=all_jobs)`` returns.
    """
    notice = " ".join((
        "ray.global_state.error_messages() is deprecated and will be",
        "removed in a subsequent release. Use ray.errors() instead.",
    ))
    logger.warning(notice)
    return ray.errors(all_jobs=all_jobs)