Example #1
0
    def testFailedActorInit(self):
        # Checks that an exception raised in an actor's constructor and an
        # exception raised in one of its methods each show up, in that order,
        # in ray.error_info().
        ray.init(num_workers=0, driver_mode=ray.SILENT_MODE)

        error_message1 = "actor constructor failed"
        error_message2 = "actor method failed"

        @ray.remote
        class FailedActor(object):
            def __init__(self):
                raise Exception(error_message1)

            # Not called anywhere in this test.
            def get_val(self):
                return 1

            def fail_method(self):
                raise Exception(error_message2)

        a = FailedActor.remote()

        # Make sure that we get errors from a failed constructor.
        # The first reported error must carry the constructor's message.
        wait_for_errors(b"task", 1)
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_message1,
                      ray.error_info()[0][b"message"].decode("ascii"))

        # Make sure that we get errors from a failed method.
        # The second reported error must carry the method's message.
        a.fail_method.remote()
        wait_for_errors(b"task", 2)
        self.assertEqual(len(ray.error_info()), 2)
        self.assertIn(error_message2,
                      ray.error_info()[1][b"message"].decode("ascii"))
Example #2
0
    def testFailImportingActor(self):
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        # Create the contents of a temporary Python file.
        temporary_python_file = """
def temporary_helper_function():
    return 1
"""

        f = tempfile.NamedTemporaryFile(suffix=".py")
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        module = __import__(module_name)

        # Define an actor that closes over this temporary module. This should
        # fail when it is unpickled.
        @ray.remote
        class Foo(object):
            def __init__(self):
                self.x = module.temporary_python_file()

            def get_val(self):
                return 1

        # There should be no errors yet.
        self.assertEqual(len(ray.error_info()), 0)

        # Create an actor.
        foo = Foo.remote()

        # Wait for the error to arrive.
        wait_for_errors(b"register_actor", 1)
        self.assertIn(b"No module named", ray.error_info()[0][b"message"])

        # Wait for the error from when the __init__ tries to run.
        wait_for_errors(b"task", 1)
        self.assertIn(
            b"failed to be imported, and so cannot execute this method",
            ray.error_info()[1][b"message"])

        # Check that if we try to get the function it throws an exception and
        # does not hang.
        with self.assertRaises(Exception):
            ray.get(foo.get_val.remote())

        # Wait for the error from when the call to get_val.
        wait_for_errors(b"task", 2)
        self.assertIn(
            b"failed to be imported, and so cannot execute this method",
            ray.error_info()[2][b"message"])

        f.close()

        # Clean up the junk we added to sys.path.
        sys.path.pop(-1)
Example #3
0
def test_actor_scope_or_intentionally_killed_message(ray_start_regular):
    """Check that dropping or terminating an actor does not report an error.

    Two actors are created. The first handle is dropped when ``a`` is rebound,
    and the second actor is asked to terminate through ``__ray_terminate__``.
    After a one second pause ``ray.error_info()`` must still be empty.

    Args:
        ray_start_regular: Fixture defined outside this snippet; it is not
            referenced in the body (presumably it starts Ray -- verify).
    """
    @ray.remote
    class Actor(object):
        pass

    # The second assignment overwrites the first handle (presumably on
    # purpose, given the "actor scope" part of the test name).
    a = Actor.remote()
    a = Actor.remote()
    a.__ray_terminate__.remote()
    # Fixed pause before checking that no error has been reported.
    time.sleep(1)
    assert len(ray.error_info()) == 0, (
        "Should not have propogated an error - {}".format(ray.error_info()))
Example #4
0
    def testFailedFunctionToRun(self):
        # Runs a function on all workers that raises inside worker processes
        # and checks that two matching errors are reported.
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        def f(worker):
            # Only processes running in worker mode raise.
            if ray.worker.global_worker.mode != ray.WORKER_MODE:
                return
            raise Exception("Function to run failed.")

        ray.worker.global_worker.run_function_on_all_workers(f)
        wait_for_errors(b"function_to_run", 2)

        # Check that the error message is in the task info.
        self.assertEqual(len(ray.error_info()), 2)
        expected_message = b"Function to run failed."
        for index in (0, 1):
            self.assertIn(expected_message,
                          ray.error_info()[index][b"message"])
Example #5
0
    def testWorkerDying(self):
        # Runs a task that exits its worker process and checks that exactly
        # one "worker_died" error with the expected text is reported.
        ray.init(num_workers=0, driver_mode=ray.SILENT_MODE)

        # Define a remote function that will kill the worker that runs it.
        @ray.remote
        def f():
            eval("exit()")

        f.remote()
        wait_for_errors(b"worker_died", 1)

        expected_message = (
            "A worker died or was killed while executing a task.")
        self.assertEqual(len(ray.error_info()), 1)
        reported_message = ray.error_info()[0][b"message"].decode("ascii")
        self.assertIn(expected_message, reported_message)
Example #6
0
def test_fail_importing_actor(ray_start_regular):
    """Check the errors pushed for an actor that closes over a temp module.

    The actor class references a module imported from a temporary file. The
    test expects one register-actor error mentioning "No module named", one
    task error for the constructor and one for ``get_val``, and expects
    ``ray.get`` on the method result to raise rather than hang.

    The temporary file and the ``sys.path`` entry are released even when an
    assertion fails.

    Args:
        ray_start_regular: Fixture defined outside this snippet; it is not
            referenced in the body.
    """
    # Create the contents of a temporary Python file.
    temporary_python_file = """
def temporary_helper_function():
    return 1
"""
    import_failure = (
        "failed to be imported, and so cannot execute this method")

    # Using the file as a context manager guarantees that it is closed even
    # when one of the assertions below fails.
    with tempfile.NamedTemporaryFile(suffix=".py") as f:
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        try:
            module = __import__(module_name)

            # Define an actor that closes over this temporary module. This
            # should fail when it is unpickled.
            @ray.remote
            class Foo(object):
                def __init__(self):
                    # Call the helper that the temporary module actually
                    # defines.
                    self.x = module.temporary_helper_function()

                def get_val(self):
                    return 1

            # There should be no errors yet.
            assert len(ray.error_info()) == 0

            # Create an actor.
            foo = Foo.remote()

            # Wait for the error to arrive.
            wait_for_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR, 1)
            errors = relevant_errors(ray_constants.REGISTER_ACTOR_PUSH_ERROR)
            assert "No module named" in errors[0]["message"]

            # Wait for the error from when the __init__ tries to run.
            wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
            errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
            assert import_failure in errors[0]["message"]

            # Check that if we try to get the function it throws an exception
            # and does not hang.
            with pytest.raises(Exception):
                ray.get(foo.get_val.remote())

            # Wait for the error from when the call to get_val.
            wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
            errors = relevant_errors(ray_constants.TASK_PUSH_ERROR)
            assert import_failure in errors[1]["message"]
        finally:
            # Clean up the junk we added to sys.path.
            sys.path.pop(-1)
Example #7
0
def wait_for_errors(error_check):
    """Poll ``ray.error_info()`` until ``error_check`` accepts the result.

    The errors are fetched at most 100 times, with a one second sleep after
    every rejected attempt.

    Args:
        error_check: Callable that receives the value returned by
            ``ray.error_info()`` and returns a truthy value once the expected
            errors are present.

    Returns:
        The value fetched by the last call to ``ray.error_info()``.

    Raises:
        AssertionError: If ``error_check`` still rejects the last fetched
            value.
    """
    # Wait for errors from all the nondeterministic tasks.
    errors = []
    for _ in range(100):
        errors = ray.error_info()
        if error_check(errors):
            break
        time.sleep(1)

    # Make sure that enough errors came through.
    assert error_check(errors)
    return errors
Example #8
0
def main(cmd_line_args):
    """Submit the Skiing experiment to Ray and print the elapsed time.

    Seeds the ``random`` module, collects the first 100000 values produced by
    ``gen_primes()``, submits one ``run_experiment`` remote call per
    (domain, lookahead budget) pair, waits for all of them, and prints the
    wall-clock duration.

    Args:
        cmd_line_args: Not used by this function.
    """
    start_time = time.perf_counter()

    # For reproducibility.
    random.seed(1237)

    # Collect the first ``num_seeds`` values yielded by gen_primes().
    num_seeds = 100000
    seeds = []
    for prime in gen_primes():
        seeds.append(prime)
        if len(seeds) == num_seeds:
            break

    # (domain name, horizon) pairs and lookahead budgets to sweep over.
    domains = [("Skiing-machado-sticky-v0", 3600)]
    lookahead_budgets = [100]
    number_of_runs = 100
    number_rollouts = 1

    pending_runs = []
    for domain, horizon in domains:
        for lookahead_budget in lookahead_budgets:
            # One random draw per submitted experiment.
            seed = random.randint(0, 100000)
            pending_runs.append(
                run_experiment.remote(domain, 99999999, lookahead_budget,
                                      horizon, number_rollouts,
                                      number_of_runs, seeds, seed, True,
                                      False))

    # Block until every submitted experiment has finished.
    ray.get(pending_runs)
    # The returned error information is not used.
    ray.error_info()
    elapsed = time.perf_counter() - start_time
    print("Time taken = {}".format(elapsed))
Example #9
0
    def testWorkerRaisingException(self):
        # Breaks a worker's task-fetching hook from inside a task and checks
        # that one "worker_crash" and one "worker_died" error are reported.
        ray.init(num_workers=1, driver_mode=ray.SILENT_MODE)

        @ray.remote
        def f():
            ray.worker.global_worker._get_next_task_from_local_scheduler = None

        # Running this task should cause the worker to raise an exception after
        # the task has successfully completed.
        f.remote()

        # Wait for each expected error type in turn.
        for error_type in (b"worker_crash", b"worker_died"):
            wait_for_errors(error_type, 1)
        self.assertEqual(len(ray.error_info()), 2)
Example #10
0
    def testWorkerRaisingException(self):
        # Breaks a worker's task-fetching hook from inside a task and checks
        # that one worker-crash and one worker-died error are reported.
        ray.init(num_workers=1, driver_mode=ray.SILENT_MODE)

        @ray.remote
        def f():
            ray.worker.global_worker._get_next_task_from_local_scheduler = None

        # Running this task should cause the worker to raise an exception after
        # the task has successfully completed.
        f.remote()

        # Wait for each expected error type in turn.
        expected_types = (ray_constants.WORKER_CRASH_PUSH_ERROR,
                          ray_constants.WORKER_DIED_PUSH_ERROR)
        for error_type in expected_types:
            wait_for_errors(error_type, 1)
        self.assertEqual(len(ray.error_info()), 2)
Example #11
0
    def testFailedFunctionToRun(self):
        # Runs a function on all workers that raises inside worker processes
        # and checks that two matching errors are reported.
        ray.init(num_workers=2)

        def f(worker):
            # Only processes running in worker mode raise.
            if ray.worker.global_worker.mode != ray.WORKER_MODE:
                return
            raise Exception("Function to run failed.")

        ray.worker.global_worker.run_function_on_all_workers(f)
        wait_for_errors(ray_constants.FUNCTION_TO_RUN_PUSH_ERROR, 2)

        # Check that the error message is in the task info.
        reported = ray.error_info()
        assert len(reported) == 2
        for index in (0, 1):
            assert "Function to run failed." in reported[index]["message"]
Example #12
0
    def wait_for_errors(self, error_check):
        """Poll ``ray.error_info()`` until ``error_check`` accepts the result.

        The errors are fetched at most 100 times, with a one second sleep
        after every rejected attempt.

        Args:
            error_check: Callable that receives the value returned by
                ``ray.error_info()`` and returns a truthy value once the
                expected errors are present.

        Returns:
            The value fetched by the last call to ``ray.error_info()``.
        """
        # Wait for errors from all the nondeterministic tasks.
        errors = []
        for _ in range(100):
            errors = ray.error_info()
            if error_check(errors):
                break
            time.sleep(1)

        # Make sure that enough errors came through.
        self.assertTrue(error_check(errors))
        return errors
Example #13
0
    def testFailImportingRemoteFunction(self):
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        # Create the contents of a temporary Python file.
        temporary_python_file = """
def temporary_helper_function():
    return 1
"""

        f = tempfile.NamedTemporaryFile(suffix=".py")
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        module = __import__(module_name)

        # Define a function that closes over this temporary module. This should
        # fail when it is unpickled.
        @ray.remote
        def g():
            return module.temporary_python_file()

        wait_for_errors(b"register_remote_function", 2)
        self.assertIn(b"No module named", ray.error_info()[0][b"message"])
        self.assertIn(b"No module named", ray.error_info()[1][b"message"])

        # Check that if we try to call the function it throws an exception and
        # does not hang.
        for _ in range(10):
            self.assertRaises(Exception, lambda: ray.get(g.remote()))

        f.close()

        # Clean up the junk we added to sys.path.
        sys.path.pop(-1)
Example #14
0
    def testFailImportingRemoteFunction(self):
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        # Create the contents of a temporary Python file.
        temporary_python_file = """
def temporary_helper_function():
    return 1
"""

        f = tempfile.NamedTemporaryFile(suffix=".py")
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        module = __import__(module_name)

        # Define a function that closes over this temporary module. This should
        # fail when it is unpickled.
        @ray.remote
        def g():
            return module.temporary_python_file()

        wait_for_errors(ray_constants.REGISTER_REMOTE_FUNCTION_PUSH_ERROR, 2)
        self.assertIn("No module named", ray.error_info()[0]["message"])
        self.assertIn("No module named", ray.error_info()[1]["message"])

        # Check that if we try to call the function it throws an exception and
        # does not hang.
        for _ in range(10):
            self.assertRaises(Exception, lambda: ray.get(g.remote()))

        f.close()

        # Clean up the junk we added to sys.path.
        sys.path.pop(-1)
Example #15
0
 def _collect_distributed_fits(self, n_min=0):
     """Add finished moment updates to ``self.moments.moments``.

     Keeps waiting on the ids in ``self.moment_updates`` and accumulating
     their results until at most ``n_min`` of them remain, printing progress
     (and any reported Ray errors) along the way.

     Args:
         n_min: Number of ids allowed to remain outstanding; negative
             values are treated as 0.

     Returns:
         How many ids were consumed by this call.
     """
     floor = max(0, n_min)
     pending = self.moment_updates
     initial_count = len(pending)
     while len(pending) > floor:
         reported = ray.error_info()
         if len(reported) > 0:
             print('errors:', reported)
         print('n min =', floor, 'remaining =', len(pending),
               'object ids =', pending)
         # Wait for one id to become ready; the rest stay pending.
         finished, pending = ray.wait(pending, num_returns=1)
         first_finished = finished[0]
         print('processing', first_finished)
         delta = ray.get(first_finished)
         self.moments.moments += delta
     # Only store the remaining ids once the loop has completed.
     self.moment_updates = pending
     return initial_count - len(pending)
Example #16
0
    def testWorkerDying(self):
        # Runs a task that exits its worker process and checks that exactly
        # one worker-died error with the expected text is reported.
        ray.init(num_workers=0)

        # Define a remote function that will kill the worker that runs it.
        @ray.remote
        def f():
            eval("exit()")

        f.remote()
        wait_for_errors(ray_constants.WORKER_DIED_PUSH_ERROR, 1)

        reported = ray.error_info()
        assert len(reported) == 1
        reported_message = reported[0]["message"]
        assert "died or was killed while executing" in reported_message
Example #17
0
def test_failed_actor_init(ray_start_regular):
    """Check the errors pushed for an actor whose constructor raises.

    Creates an actor whose ``__init__`` raises, waits for the first task
    error, then calls a method on the same handle and waits for a second
    task error.

    Args:
        ray_start_regular: Fixture defined outside this snippet; it is not
            referenced in the body.
    """
    error_message1 = "actor constructor failed"
    error_message2 = "actor method failed"

    @ray.remote
    class FailedActor(object):
        def __init__(self):
            raise Exception(error_message1)

        def fail_method(self):
            raise Exception(error_message2)

    a = FailedActor.remote()

    # Make sure that we get errors from a failed constructor.
    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 1)
    assert len(ray.error_info()) == 1
    assert error_message1 in ray.error_info()[0]["message"]

    # Make sure that we get errors from a failed method.
    a.fail_method.remote()
    wait_for_errors(ray_constants.TASK_PUSH_ERROR, 2)
    assert len(ray.error_info()) == 2
    # NOTE(review): this checks for error_message1 (the constructor's
    # message), not error_message2 -- presumably a method call on an actor
    # whose constructor failed reports the constructor's error; confirm that
    # this is intended.
    assert error_message1 in ray.error_info()[1]["message"]
Example #18
0
    def testFailImportingEnvironmentVariable(self):
        # Registers an environment variable whose initializer raises in worker
        # mode and checks that the failure is reported. ray.worker.cleanup()
        # runs even when an assertion fails.
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        # This will throw an exception when the environment variable is
        # imported on the workers.
        def initializer():
            if ray.worker.global_worker.mode == ray.WORKER_MODE:
                raise Exception("The initializer failed.")
            return 0

        # Run the cleanup call even when a step below raises.
        try:
            ray.env.foo = ray.EnvironmentVariable(initializer)
            wait_for_errors(b"register_environment_variable", 2)
            # Check that the error message is in the task info.
            self.assertIn(b"The initializer failed.",
                          ray.error_info()[0][b"message"])
        finally:
            ray.worker.cleanup()
Example #19
0
    def testFailImportingReusableVariable(self):
        # Registers a reusable variable whose initializer raises in worker
        # mode and checks that the failure is reported. ray.worker.cleanup()
        # runs even when an assertion fails.
        ray.init(start_ray_local=True,
                 num_workers=2,
                 driver_mode=ray.SILENT_MODE)

        # This will throw an exception when the reusable variable is imported
        # on the workers.
        def initializer():
            if ray.worker.global_worker.mode == ray.WORKER_MODE:
                raise Exception("The initializer failed.")
            return 0

        # Run the cleanup call even when a step below raises.
        try:
            ray.reusables.foo = ray.Reusable(initializer)
            wait_for_errors("ReusableVariableImportError", 1)
            # Check that the error message is in the task info. assertIn
            # reports both operands on failure, unlike assertTrue(x in y).
            errors = ray.error_info()["ReusableVariableImportError"]
            self.assertIn("The initializer failed.", errors[0]["message"])
        finally:
            ray.worker.cleanup()
Example #20
0
    def testFailReinitializingVariable(self):
        # Registers an environment variable whose reinitializer raises, runs a
        # task that touches it and checks that the failure is reported.
        # ray.worker.cleanup() runs even when an assertion fails.
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        def initializer():
            return 0

        def reinitializer(foo):
            raise Exception("The reinitializer failed.")

        # Run the cleanup call even when a step below raises.
        try:
            ray.env.foo = ray.EnvironmentVariable(initializer, reinitializer)

            @ray.remote
            def use_foo():
                ray.env.foo

            use_foo.remote()
            wait_for_errors(b"reinitialize_environment_variable", 1)
            # Check that the error message is in the task info.
            self.assertIn(b"The reinitializer failed.",
                          ray.error_info()[0][b"message"])
        finally:
            ray.worker.cleanup()
Example #21
0
    def testFailReinitializingVariable(self):
        # Registers a reusable variable whose reinitializer raises, runs a
        # task that touches it and checks that the failure is reported.
        # ray.worker.cleanup() runs even when an assertion fails.
        ray.init(start_ray_local=True,
                 num_workers=2,
                 driver_mode=ray.SILENT_MODE)

        def initializer():
            return 0

        def reinitializer(foo):
            raise Exception("The reinitializer failed.")

        # Run the cleanup call even when a step below raises.
        try:
            ray.reusables.foo = ray.Reusable(initializer, reinitializer)

            @ray.remote
            def use_foo():
                ray.reusables.foo

            use_foo.remote()
            wait_for_errors("ReusableVariableReinitializeError", 1)
            # Check that the error message is in the task info. assertIn
            # reports both operands on failure, unlike assertTrue(x in y).
            errors = ray.error_info()["ReusableVariableReinitializeError"]
            self.assertIn("The reinitializer failed.", errors[0]["message"])
        finally:
            ray.worker.cleanup()
Example #22
0
def main(cmd_line_args):
    """Submit the experiment grid to Ray and print the elapsed time.

    Seeds the ``random`` module, collects the first 100000 values produced by
    ``gen_primes()``, and for each initial state in ``range(10)`` submits
    ``run_experiment`` remote calls for the Antishape, Combolock, GridWorld
    and CTP domains over several sizes and lookahead budgets. One
    ``random.randint`` draw is made per submitted call, so the order of the
    loops determines which seed each experiment receives.

    Args:
        cmd_line_args: Not used by this function.
    """
    t0 = time.perf_counter()
    listOfRuns = []
    seeds = []
    num_seeds = 100000
    # For reproducibility.
    random.seed(1337)
    # Collect the first num_seeds values yielded by gen_primes().
    for p in gen_primes():
        seeds.append(p)
        if len(seeds) == num_seeds:
            break
    for intState in range(10):
        # Antishape and Combolock domains; the horizon H is 4 * num_states.
        for num_states in [10, 50]:
            for Domain, H in [
                ("Antishape-{}-initS{}-v2".format(num_states,
                                                  intState), 4 * num_states),
                ("Combolock-{}-initS{}-v2".format(num_states,
                                                  intState), 4 * num_states)
            ]:
                #for Domain, H, numberRollouts in [("GridWorld-4x4-initS{}-v{}".format(intState, gridVersion), 20,4), ("GridWorld-10x10-initS{}-v{}".format(intState, gridVersion), 50, 10), ("GridWorld-20x20-initS{}-v{}".format(intState, gridVersion), 100, 20), ("GridWorld-50x50-initS{}-v{}".format(intState, gridVersion), 250, 50), ("GridWorld-100x100-initS{}-v{}".format(intState, gridVersion), 500, 100)]:
                #for Domain, H, numberRollouts in [("CTP-4x4-initS{}-v{}".format(intState, gridVersion), 20,4), ("CTP-10x10-initS{}-v{}".format(intState, gridVersion), 50, 10), ("CTP-20x20-initS{}-v{}".format(intState, gridVersion), 100, 20), ("CTP-50x50-initS{}-v{}".format(intState, gridVersion), 250, 50)]:
                for lookaheadBudget in [100, 500, 1000]:
                    numberOfRuns = 20
                    seed = random.randint(0, 100000)
                    numberRollouts = 1
                    listOfRuns.append(
                        run_experiment.remote(Domain, 99999999,
                                              lookaheadBudget, H,
                                              numberRollouts, numberOfRuns,
                                              seeds, seed, False, False))

        # GridWorld domains; the horizon H is gridDim * 5.
        for gridVersion in [2, 3, 5]:
            for gridDim in [10, 20, 50]:
                for Domain, H, numberRollouts in [
                    ("GridWorld-{}x{}-initS{}-v{}".format(
                        gridDim, gridDim, intState,
                        gridVersion), gridDim * 5, 1)
                ]:
                    #for Domain, H, numberRollouts in [("CTP-4x4-initS{}-v{}".format(intState, gridVersion), 20,4), ("CTP-10x10-initS{}-v{}".format(intState, gridVersion), 50, 10), ("CTP-20x20-initS{}-v{}".format(intState, gridVersion), 100, 20), ("CTP-50x50-initS{}-v{}".format(intState, gridVersion), 250, 50)]:
                    for lookaheadBudget in [100, 1000, 10000]:
                        numberOfRuns = 20
                        seed = random.randint(0, 100000)
                        # Overwrites the value unpacked from the tuple above
                        # (which is also 1).
                        numberRollouts = 1
                        # Grid version 5 passes True as the second-to-last
                        # argument; the other versions pass False.
                        if gridVersion == 5:
                            listOfRuns.append(
                                run_experiment.remote(Domain, 99999999,
                                                      lookaheadBudget, H,
                                                      numberRollouts,
                                                      numberOfRuns, seeds,
                                                      seed, True, False))
                        else:
                            listOfRuns.append(
                                run_experiment.remote(Domain, 99999999,
                                                      lookaheadBudget, H,
                                                      numberRollouts,
                                                      numberOfRuns, seeds,
                                                      seed, False, False))
        # CTP domains; both trailing flags are True here.
        for gridDim in [10, 20]:
            for Domain, H, numberRollouts in [
                ("CTP-{}x{}-initS{}-v1".format(gridDim, gridDim,
                                               intState), gridDim * 5, 1)
            ]:
                for lookaheadBudget in [100, 1000, 10000]:
                    numberOfRuns = 20
                    seed = random.randint(0, 100000)
                    numberRollouts = 1
                    listOfRuns.append(
                        run_experiment.remote(Domain, 99999999,
                                              lookaheadBudget, H,
                                              numberRollouts, numberOfRuns,
                                              seeds, seed, True, True))
        # NOTE(review): this call sits inside the ``intState`` loop, so each
        # iteration blocks on every run submitted so far (listOfRuns is never
        # cleared) before the next initial state is submitted. Confirm whether
        # that batching is intended or whether the call should be dedented.
        ray.get(listOfRuns)
    # The returned error information is not used.
    ray.error_info()
    tf = time.perf_counter()
    print("Time taken = {}".format(tf - t0))
Example #23
0
def relevant_errors(error_type):
    """Return the ``ray.error_info()`` entries of the given type.

    Args:
        error_type: Value compared against each entry's ``"type"`` field.

    Returns:
        A list with the matching entries, in the order they were reported.
    """
    matching = []
    for info in ray.error_info():
        if info["type"] == error_type:
            matching.append(info)
    return matching
Example #24
0
def test_error_isolation(ray_start_head):
    """Check that each driver only sees the errors of its own tasks.

    This driver runs a failing task and waits until exactly one error is
    reported. A second driver, launched from a generated script against the
    same redis address, must start with no errors, produce its own error and
    print "success". Afterwards this driver must still see only its own
    error.

    Args:
        ray_start_head: Fixture whose value is used as the redis address to
            connect to.
    """
    redis_address = ray_start_head
    # Connect a driver to the Ray cluster.
    ray.init(redis_address=redis_address)

    # There shouldn't be any errors yet.
    assert len(ray.error_info()) == 0

    error_string1 = "error_string1"
    error_string2 = "error_string2"

    @ray.remote
    def f():
        raise Exception(error_string1)

    # Run a remote function that throws an error.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Wait for the error to appear in Redis.
    # NOTE(review): this loop has no timeout; it keeps spinning if the error
    # count never equals exactly 1.
    while len(ray.error_info()) != 1:
        time.sleep(0.1)
        print("Waiting for error to appear.")

    # Make sure we got the error.
    assert len(ray.error_info()) == 1
    assert error_string1 in ray.error_info()[0]["message"]

    # Start another driver and make sure that it does not receive this
    # error. Make the other driver throw an error, and make sure it
    # receives that error.
    # The three "{}" placeholders are filled with the redis address and, twice,
    # the second error string.
    driver_script = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.error_info()) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

while len(ray.error_info()) != 1:
    print(len(ray.error_info()))
    time.sleep(0.1)
assert len(ray.error_info()) == 1

assert "{}" in ray.error_info()[0]["message"]

print("success")
""".format(redis_address, error_string2, error_string2)

    out = run_string_as_driver(driver_script)
    # Make sure the other driver succeeded.
    assert "success" in out

    # Make sure that the other error message doesn't show up for this
    # driver.
    assert len(ray.error_info()) == 1
    assert error_string1 in ray.error_info()[0]["message"]
Example #25
0
    def testErrorIsolation(self):
        # Start the Ray processes on this machine.
        out = subprocess.check_output([start_ray_script,
                                       "--head"]).decode("ascii")
        # Get the redis address from the output.
        redis_substring_prefix = "redis_address=\""
        redis_address_location = out.find(redis_substring_prefix) + len(
            redis_substring_prefix)
        redis_address = out[redis_address_location:]
        redis_address = redis_address.split("\"")[0]
        # Connect a driver to the Ray cluster.
        ray.init(redis_address=redis_address, driver_mode=ray.SILENT_MODE)

        # There shouldn't be any errors yet.
        self.assertEqual(len(ray.error_info()), 0)

        error_string1 = "error_string1"
        error_string2 = "error_string2"

        @ray.remote
        def f():
            raise Exception(error_string1)

        # Run a remote function that throws an error.
        with self.assertRaises(Exception):
            ray.get(f.remote())

        # Wait for the error to appear in Redis.
        while len(ray.error_info()) != 1:
            time.sleep(0.1)
            print("Waiting for error to appear.")

        # Make sure we got the error.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))

        # Start another driver and make sure that it does not receive this error.
        # Make the other driver throw an error, and make sure it receives that
        # error.
        driver_script = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.error_info()) == 0

@ray.remote
def f():
  raise Exception("{}")

try:
  ray.get(f.remote())
except Exception as e:
  pass

while len(ray.error_info()) != 1:
  print(len(ray.error_info()))
  time.sleep(0.1)
assert len(ray.error_info()) == 1

assert "{}" in ray.error_info()[0][b"message"].decode("ascii")

print("success")
""".format(redis_address, error_string2, error_string2)

        # Save the driver script as a file so we can call it using subprocess.
        with tempfile.NamedTemporaryFile() as f:
            f.write(driver_script.encode("ascii"))
            f.flush()
            out = subprocess.check_output(["python", f.name]).decode("ascii")

        # Make sure the other driver succeeded.
        self.assertIn("success", out)

        # Make sure that the other error message doesn't show up for this driver.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))

        ray.worker.cleanup()
        subprocess.Popen([stop_ray_script]).wait()
Example #26
0
    def testErrorIsolation(self):
        # Checks that each driver only sees the errors of its own tasks: this
        # driver raises error_string1, a second driver started from a
        # generated script raises error_string2, and afterwards this driver
        # must still report only its own error.

        # Connect a driver to the Ray cluster.
        ray.init(redis_address=self.redis_address, driver_mode=ray.SILENT_MODE)

        # There shouldn't be any errors yet.
        self.assertEqual(len(ray.error_info()), 0)

        error_string1 = "error_string1"
        error_string2 = "error_string2"

        @ray.remote
        def f():
            raise Exception(error_string1)

        # Run a remote function that throws an error.
        with self.assertRaises(Exception):
            ray.get(f.remote())

        # Wait for the error to appear in Redis.
        # NOTE(review): this loop has no timeout; it keeps spinning if the
        # error count never equals exactly 1.
        while len(ray.error_info()) != 1:
            time.sleep(0.1)
            print("Waiting for error to appear.")

        # Make sure we got the error.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))

        # Start another driver and make sure that it does not receive this
        # error. Make the other driver throw an error, and make sure it
        # receives that error.
        # The three "{}" placeholders are filled with the redis address and,
        # twice, the second error string.
        driver_script = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.error_info()) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

while len(ray.error_info()) != 1:
    print(len(ray.error_info()))
    time.sleep(0.1)
assert len(ray.error_info()) == 1

assert "{}" in ray.error_info()[0][b"message"].decode("ascii")

print("success")
""".format(self.redis_address, error_string2, error_string2)

        out = run_string_as_driver(driver_script)
        # Make sure the other driver succeeded.
        self.assertIn("success", out)

        # Make sure that the other error message doesn't show up for this
        # driver.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))
Example #27
0
    def testErrorIsolation(self):
        # Checks that each driver only sees the errors of its own tasks: this
        # driver raises error_string1, a second driver started from a
        # generated script raises error_string2, and afterwards this driver
        # must still report only its own error.

        # Connect a driver to the Ray cluster.
        ray.init(redis_address=self.redis_address, driver_mode=ray.SILENT_MODE)

        # There shouldn't be any errors yet.
        self.assertEqual(len(ray.error_info()), 0)

        error_string1 = "error_string1"
        error_string2 = "error_string2"

        @ray.remote
        def f():
            raise Exception(error_string1)

        # Run a remote function that throws an error.
        with self.assertRaises(Exception):
            ray.get(f.remote())

        # Wait for the error to appear in Redis.
        # NOTE(review): this loop has no timeout; it keeps spinning if the
        # error count never equals exactly 1.
        while len(ray.error_info()) != 1:
            time.sleep(0.1)
            print("Waiting for error to appear.")

        # Make sure we got the error.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))

        # Start another driver and make sure that it does not receive this
        # error. Make the other driver throw an error, and make sure it
        # receives that error.
        # The three "{}" placeholders are filled with the redis address and,
        # twice, the second error string.
        driver_script = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.error_info()) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

while len(ray.error_info()) != 1:
    print(len(ray.error_info()))
    time.sleep(0.1)
assert len(ray.error_info()) == 1

assert "{}" in ray.error_info()[0][b"message"].decode("ascii")

print("success")
""".format(self.redis_address, error_string2, error_string2)

        out = run_string_as_driver(driver_script)
        # Make sure the other driver succeeded.
        self.assertIn("success", out)

        # Make sure that the other error message doesn't show up for this
        # driver.
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_string1,
                      ray.error_info()[0][b"message"].decode("ascii"))
Example #28
0
def relevant_errors(error_type):
    """Return the reported errors whose ``b"type"`` field equals error_type.

    The entries come from ``ray.error_info()`` and are returned as a list,
    in the order that call yields them.
    """
    matching = []
    for entry in ray.error_info():
        if entry[b"type"] == error_type:
            matching.append(entry)
    return matching
Example #29
0
    def testIncorrectMethodCalls(self):
        # Calling an actor constructor or an actor method with the wrong
        # number of arguments should show up as a "task" error, and calling
        # a method that does not exist should raise AttributeError.
        ray.init(num_workers=0, driver_mode=ray.SILENT_MODE)

        @ray.actor
        class Actor(object):
            def __init__(self, missing_variable_name):
                pass

            def get_val(self, x):
                pass

        running_python3 = sys.version_info >= (3, 0)

        def expect_task_error(count, python3_text):
            # Wait for `count` task errors, check that exactly that many are
            # reported, and check the text of the last one (index count - 1).
            # Python 2 words every arity mistake in this test the same way.
            wait_for_errors(b"task", count)
            self.assertEqual(len(ray.error_info()), count)
            if running_python3:
                expected_text = python3_text
            else:
                expected_text = "takes exactly 2 arguments"
            self.assertIn(
                expected_text,
                ray.error_info()[count - 1][b"message"].decode("ascii"))

        # Make sure that we get errors if we call the constructor incorrectly.
        # TODO(rkn): These errors should instead be thrown when the method is
        # called.

        # Constructor called with too few arguments.
        handle = Actor()
        expect_task_error(1, "missing 1 required")

        # Constructor called with too many arguments.
        handle = Actor(1, 2)
        expect_task_error(2, "but 3 were given")

        # Constructor called with the correct number of arguments.
        handle = Actor(1)

        # Method called with too few arguments.
        handle.get_val()
        expect_task_error(3, "missing 1 required")

        # Method called with too many arguments.
        handle.get_val(1, 2)
        expect_task_error(4, "but 3 were given")

        # Accessing a method the actor class never defined.
        with self.assertRaises(AttributeError):
            handle.nonexistent_method()

        ray.worker.cleanup()
Example #30
0
def test_error_isolation(ray_start_head):
    """Check that a driver only sees errors raised by its own tasks.

    This driver triggers one failing task and waits for the error to be
    reported. A second driver is then run from a script; it asserts that it
    starts with no errors, triggers its own failure and checks that it sees
    exactly that one. Finally this driver must still report only its
    original error.

    Args:
        ray_start_head: Fixture value used as the Redis address to connect to.
    """
    redis_address = ray_start_head
    own_error_text, other_error_text = "error_string1", "error_string2"

    # Connect to the cluster; no errors may be reported before any task ran.
    ray.init(redis_address=redis_address)
    assert len(ray.error_info()) == 0

    @ray.remote
    def f():
        raise Exception(own_error_text)

    # Fetching the result of the failing remote function must raise.
    with pytest.raises(Exception):
        ray.get(f.remote())

    # Poll until exactly one error has shown up for this driver.
    while not len(ray.error_info()) == 1:
        time.sleep(0.1)
        print("Waiting for error to appear.")

    # The single reported error has to be the one raised above.
    assert len(ray.error_info()) == 1
    assert own_error_text in ray.error_info()[0]["message"]

    # Script for the second driver. The placeholders are filled, in order,
    # with the Redis address and (twice) the text of the second error.
    driver_template = """
import ray
import time

ray.init(redis_address="{}")

time.sleep(1)
assert len(ray.error_info()) == 0

@ray.remote
def f():
    raise Exception("{}")

try:
    ray.get(f.remote())
except Exception as e:
    pass

while len(ray.error_info()) != 1:
    print(len(ray.error_info()))
    time.sleep(0.1)
assert len(ray.error_info()) == 1

assert "{}" in ray.error_info()[0]["message"]

print("success")
"""
    driver_script = driver_template.format(
        redis_address, other_error_text, other_error_text)

    # The second driver reports success through its output.
    driver_output = run_string_as_driver(driver_script)
    assert "success" in driver_output

    # The second driver's error must not have leaked into this driver:
    # still exactly one error, and still the original one.
    assert len(ray.error_info()) == 1
    assert own_error_text in ray.error_info()[0]["message"]
Example #31
0
                 train_loss=loss.item(),
                 best_test_acc=100. * test_correct /
                 test_num)  # report metrics


# Register the training function under the name the experiment spec uses.
tune.register_trainable("train", train)

# Grid-searched hyperparameters; the candidate values come from
# uniform.rvs (three for "lr", one for "momentum").
_search_space = {
    "lr": tune.grid_search(list(uniform.rvs(0, size=3))),
    "momentum": tune.grid_search(list(uniform.rvs(0, size=1))),
}

# Settings of the single experiment. Trials stop once the reported
# "best_test_acc" metric reaches 90.
# Options the author left disabled:
#   "trial_resources": {"cpu": 8, "gpu": 1}
#   "stop": {"epoch": 1}
_experiment_spec = {
    "run": "train",
    "repeat": 1,
    "stop": {"best_test_acc": 90},
    "config": _search_space,
    "local_dir": "ray_results",
    "max_failures": 1,
}

all_trials = tune.run_experiments({"awesome": _experiment_spec})

# Bare expression statements: the return values are discarded here
# (presumably meant for interactive inspection -- confirm).
ray.error_info()

ray.global_state.log_files()