def test_driver_lives_sequential(shutdown_only):
    ray.init(num_cpus=1)
    ray.worker._global_node.kill_raylet()
    ray.worker._global_node.kill_plasma_store()
    ray.worker._global_node.kill_log_monitor()
    ray.worker._global_node.kill_monitor()
    ray.worker._global_node.kill_raylet_monitor()
Example #2
  def testCachingReusables(self):
    # Test that we can define reusable variables before the driver is connected.
    def foo_initializer():
      return 1
    def bar_initializer():
      return []
    def bar_reinitializer(bar):
      return []
    ray.reusables.foo = ray.Reusable(foo_initializer)
    ray.reusables.bar = ray.Reusable(bar_initializer, bar_reinitializer)

    @ray.remote
    def use_foo():
      return ray.reusables.foo
    @ray.remote
    def use_bar():
      ray.reusables.bar.append(1)
      return ray.reusables.bar

    ray.init(start_ray_local=True, num_workers=2)

    self.assertEqual(ray.get(use_foo.remote()), 1)
    self.assertEqual(ray.get(use_foo.remote()), 1)
    self.assertEqual(ray.get(use_bar.remote()), [1])
    self.assertEqual(ray.get(use_bar.remote()), [1])

    ray.worker.cleanup()
Example #3
def test_connect_with_disconnected_node(shutdown_only):
    config = json.dumps({
        "num_heartbeats_timeout": 50,
        "heartbeat_timeout_milliseconds": 10,
    })
    cluster = Cluster()
    cluster.add_node(num_cpus=0, _internal_config=config)
    ray.init(redis_address=cluster.redis_address)
    info = relevant_errors(ray_constants.REMOVED_NODE_ERROR)
    assert len(info) == 0
    # This node is killed by SIGKILL; ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(dead_node, allow_graceful=False)
    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 1, timeout=2)
    # This node is killed by SIGKILL; ray_monitor will mark it as dead.
    dead_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(dead_node, allow_graceful=False)
    wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 2, timeout=2)
    # This node is killed by SIGTERM; ray_monitor will not mark it as dead.
    removing_node = cluster.add_node(num_cpus=0, _internal_config=config)
    cluster.remove_node(removing_node, allow_graceful=True)
    with pytest.raises(Exception, match=('Timing out of wait.')):
        wait_for_errors(ray_constants.REMOVED_NODE_ERROR, 3, timeout=2)
    # There is no connection error to a dead node.
    info = relevant_errors(ray_constants.RAYLET_CONNECTION_ERROR)
    assert len(info) == 0
Example #4
def ray_start_object_store_memory():
    # Start the Ray processes.
    store_size = 10**6
    ray.init(num_cpus=1, object_store_memory=store_size)
    yield None
    # The code after the yield will run as teardown code.
    ray.shutdown()
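A minimal usage sketch (not part of the original example) showing how a test might consume a yield-style fixture like the one above, assuming ray_start_object_store_memory is registered with @pytest.fixture; the test name and body are illustrative only.

import numpy as np
import ray


def test_put_within_store_limit(ray_start_object_store_memory):
    # The fixture has already called ray.init with a 10**6-byte object store,
    # so only put values that comfortably fit inside it.
    value = np.zeros(10**4, dtype=np.uint8)
    object_id = ray.put(value)
    assert ray.get(object_id).shape == value.shape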
Example #5
  def testPutGet(self):
    ray.init(start_ray_local=True, num_workers=0)

    for i in range(100):
      value_before = i * 10 ** 6
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    for i in range(100):
      value_before = i * 10 ** 6 * 1.0
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    for i in range(100):
      value_before = "h" * i
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    for i in range(100):
      value_before = [1] * i
      objectid = ray.put(value_before)
      value_after = ray.get(objectid)
      self.assertEqual(value_before, value_after)

    ray.worker.cleanup()
Example #6
  def testWait(self):
    ray.init(start_ray_local=True, num_workers=1)

    @ray.remote
    def f(delay):
      time.sleep(delay)
      return 1

    objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
    ready_ids, remaining_ids = ray.wait(objectids)
    self.assertTrue(len(ready_ids) == 1)
    self.assertTrue(len(remaining_ids) == 3)
    ready_ids, remaining_ids = ray.wait(objectids, num_returns=4)
    self.assertEqual(ready_ids, objectids)
    self.assertEqual(remaining_ids, [])

    objectids = [f.remote(0.5), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
    start_time = time.time()
    ready_ids, remaining_ids = ray.wait(objectids, timeout=1.75, num_returns=4)
    self.assertTrue(time.time() - start_time < 2)
    self.assertEqual(len(ready_ids), 3)
    self.assertEqual(len(remaining_ids), 1)
    ray.wait(objectids)
    objectids = [f.remote(1.0), f.remote(0.5), f.remote(0.5), f.remote(0.5)]
    start_time = time.time()
    ready_ids, remaining_ids = ray.wait(objectids, timeout=5)
    self.assertTrue(time.time() - start_time < 5)
    self.assertEqual(len(ready_ids), 1)
    self.assertEqual(len(remaining_ids), 3)

    ray.worker.cleanup()
def create_cluster(num_nodes):
    cluster = Cluster()
    for i in range(num_nodes):
        cluster.add_node(resources={str(i): 100}, object_store_memory=10**9)

    ray.init(redis_address=cluster.redis_address)
    return cluster
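A hedged usage sketch for the create_cluster helper above; the remote task and the custom-resource name "0" are illustrative, and the teardown follows the ray.shutdown()/cluster.shutdown() pattern used by other examples on this page.

import ray


cluster = create_cluster(num_nodes=2)
try:
    @ray.remote(resources={"0": 1})
    def ping():
        # Runs on the node that advertises the custom resource "0".
        return "ok"

    assert ray.get(ping.remote()) == "ok"
finally:
    ray.shutdown()
    cluster.shutdown()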
Example #8
  def testRecursiveObjects(self):
    ray.init(start_ray_local=True, num_workers=0)

    class ClassA(object):
      pass

    ray.register_class(ClassA)

    # Make a list that contains itself.
    l = []
    l.append(l)
    # Make an object that contains itself as a field.
    a1 = ClassA()
    a1.field = a1
    # Make two objects that contain each other as fields.
    a2 = ClassA()
    a3 = ClassA()
    a2.field = a3
    a3.field = a2
    # Make a dictionary that contains itself.
    d1 = {}
    d1["key"] = d1
    # Create a list of recursive objects.
    recursive_objects = [l, a1, a2, a3, d1]

    # Check that exceptions are thrown when we serialize the recursive objects.
    for obj in recursive_objects:
      self.assertRaises(Exception, lambda : ray.put(obj))

    ray.worker.cleanup()
Example #9
def ray_start_driver_put_errors():
    plasma_store_memory = 10**9
    # Start the Ray processes.
    ray.init(num_cpus=1, object_store_memory=plasma_store_memory)
    yield plasma_store_memory
    # The code after the yield will run as teardown code.
    ray.shutdown()
Example #10
  def testFailImportingRemoteFunction(self):
    ray.init(start_ray_local=True, num_workers=2, driver_mode=ray.SILENT_MODE)

    # This example is somewhat contrived. It should be successfully pickled, and
    # then it should throw an exception when it is unpickled. This may depend a
    # bit on the specifics of our pickler.
    def reducer(*args):
      raise Exception("There is a problem here.")
    class Foo(object):
      def __init__(self):
        self.__name__ = "Foo_object"
        self.func_doc = ""
        self.__globals__ = {}
      def __reduce__(self):
        return reducer, ()
      def __call__(self):
        return
    ray.remote(Foo())
    for _ in range(100): # Retry if we need to wait longer.
      if len(ray.task_info()["failed_remote_function_imports"]) >= 1:
        break
      time.sleep(0.1)
    self.assertTrue("There is a problem here." in ray.task_info()["failed_remote_function_imports"][0]["error_message"])

    ray.worker.cleanup()
Example #11
def init():
    ray.init(num_cpus=4)
    async_api.init()
    asyncio.get_event_loop().set_debug(False)
    yield
    async_api.shutdown()
    ray.shutdown()
    def launch(self):
        """Actual entry point into the class instance where everything happens.
        Lots of delegating to classes that are in subclasses or can be overridden.
        """
        self.register_env_creator()

        # All worker nodes will block at this step during training
        ray_cluster_config = self.ray_init_config()
        if not self.is_master_node:
            return

        # Start the driver on master node
        ray.init(**ray_cluster_config)
        experiment_config = self.get_experiment_config()
        experiment_config = self.customize_experiment_config(experiment_config)
        print("Running experiment with config %s" % json.dumps(experiment_config, indent=2))
        run_experiments(experiment_config)
        all_workers_host_names = self.get_all_host_names()[1:]
        # If distributed job, send TERMINATION_SIGNAL to all workers.
        if len(all_workers_host_names) > 0:
            self.sage_cluster_communicator.create_s3_signal(TERMINATION_SIGNAL)

        algo = experiment_config["training"]["run"]
        env_string = experiment_config["training"]["config"]["env"]
        config = experiment_config["training"]["config"]
        self.save_checkpoint_and_serving_model(algorithm=algo,
                                               env_string=env_string,
                                               config=config)
Example #13
    def testMethods(self):
        for module in [ra.core, ra.random, ra.linalg, da.core, da.random,
                       da.linalg]:
            reload(module)
        ray.init()

        # test eye
        object_id = ra.eye.remote(3)
        val = ray.get(object_id)
        assert_almost_equal(val, np.eye(3))

        # test zeros
        object_id = ra.zeros.remote([3, 4, 5])
        val = ray.get(object_id)
        assert_equal(val, np.zeros([3, 4, 5]))

        # test qr - pass by value
        a_val = np.random.normal(size=[10, 11])
        q_id, r_id = ra.linalg.qr.remote(a_val)
        q_val = ray.get(q_id)
        r_val = ray.get(r_id)
        assert_almost_equal(np.dot(q_val, r_val), a_val)

        # test qr - pass by objectid
        a = ra.random.normal.remote([10, 13])
        q_id, r_id = ra.linalg.qr.remote(a)
        a_val = ray.get(a)
        q_val = ray.get(q_id)
        r_val = ray.get(r_id)
        assert_almost_equal(np.dot(q_val, r_val), a_val)
def router():
    # We need at least 5 workers so resources won't be oversubscribed.
    ray.init(num_cpus=5)

    # The following two blobs are equivalent
    #
    # handle = DeadlineAwareRouter.remote("DefaultTestRouter")
    # ray.experimental.register_actor("DefaultTestRouter", handle)
    # handle.start.remote()
    #
    # handle = start_router(DeadlineAwareRouter, "DefaultRouter")
    handle = start_router(DeadlineAwareRouter, "DefaultRouter")

    handle.register_actor.remote(
        "VAdder", VectorizedAdder,
        init_kwargs={"scaler_increment": 1})  # init kwargs
    handle.register_actor.remote(
        "SAdder", ScalerAdder, init_kwargs={"scaler_increment": 2})
    handle.register_actor.remote(
        "SleepFirst", SleepOnFirst, init_kwargs={"sleep_time": 1})
    handle.register_actor.remote(
        "SleepCounter", SleepCounter, max_batch_size=1)

    yield handle

    ray.shutdown()
Example #15
    def testFailedTask(self):
        reload(test_functions)
        ray.init(num_workers=3, driver_mode=ray.SILENT_MODE)

        test_functions.throw_exception_fct1.remote()
        test_functions.throw_exception_fct1.remote()
        wait_for_errors(b"task", 2)
        self.assertEqual(len(relevant_errors(b"task")), 2)
        for task in relevant_errors(b"task"):
            self.assertIn(b"Test function 1 intentionally failed.",
                          task.get(b"message"))

        x = test_functions.throw_exception_fct2.remote()
        try:
            ray.get(x)
        except Exception as e:
            self.assertIn("Test function 2 intentionally failed.", str(e))
        else:
            # ray.get should throw an exception.
            self.assertTrue(False)

        x, y, z = test_functions.throw_exception_fct3.remote(1.0)
        for ref in [x, y, z]:
            try:
                ray.get(ref)
            except Exception as e:
                self.assertIn("Test function 3 intentionally failed.", str(e))
            else:
                # ray.get should throw an exception.
                self.assertTrue(False)
Example #16
    def testFailedActorInit(self):
        ray.init(num_workers=0, driver_mode=ray.SILENT_MODE)

        error_message1 = "actor constructor failed"
        error_message2 = "actor method failed"

        @ray.remote
        class FailedActor(object):
            def __init__(self):
                raise Exception(error_message1)

            def get_val(self):
                return 1

            def fail_method(self):
                raise Exception(error_message2)

        a = FailedActor.remote()

        # Make sure that we get errors from a failed constructor.
        wait_for_errors(b"task", 1)
        self.assertEqual(len(ray.error_info()), 1)
        self.assertIn(error_message1,
                      ray.error_info()[0][b"message"].decode("ascii"))

        # Make sure that we get errors from a failed method.
        a.fail_method.remote()
        wait_for_errors(b"task", 2)
        self.assertEqual(len(ray.error_info()), 2)
        self.assertIn(error_message2,
                      ray.error_info()[1][b"message"].decode("ascii"))
Example #17
def run(args, parser):
    config = {}
    # Load configuration from file
    config_dir = os.path.dirname(args.checkpoint)
    config_path = os.path.join(config_dir, "params.pkl")
    if not os.path.exists(config_path):
        config_path = os.path.join(config_dir, "../params.pkl")
    if not os.path.exists(config_path):
        if not args.config:
            raise ValueError(
                "Could not find params.pkl in either the checkpoint dir or "
                "its parent directory.")
    else:
        with open(config_path, 'rb') as f:
            config = pickle.load(f)
    if "num_workers" in config:
        config["num_workers"] = min(2, config["num_workers"])
    config = merge_dicts(config, args.config)
    if not args.env:
        if not config.get("env"):
            parser.error("the following arguments are required: --env")
        args.env = config.get("env")

    ray.init()

    cls = get_agent_class(args.run)
    agent = cls(env=args.env, config=config)
    agent.restore(args.checkpoint)
    num_steps = int(args.steps)
    rollout(agent, args.env, num_steps, args.out, args.no_render)
def test_dying_worker_wait(shutdown_only):
    ray.init(num_cpus=2)

    @ray.remote
    def sleep_forever():
        time.sleep(10**6)

    @ray.remote
    def get_pid():
        return os.getpid()

    x_id = sleep_forever.remote()
    # Get the PID of the worker that block_in_wait will run on (sleep a little
    # to make sure that sleep_forever has already started).
    time.sleep(0.1)
    worker_pid = ray.get(get_pid.remote())

    @ray.remote
    def block_in_wait(object_id_in_list):
        ray.wait(object_id_in_list)

    # Have the worker wait in a wait call.
    block_in_wait.remote([x_id])
    time.sleep(0.1)

    # Kill the worker.
    os.kill(worker_pid, signal.SIGKILL)
    time.sleep(0.1)

    # Create the object.
    ray.worker.global_worker.put_object(x_id, 1)
    time.sleep(0.1)

    # Make sure that nothing has died.
    assert ray.services.all_processes_alive()
Example #19
    def testNetworksIndependent(self):
        # Note we use only one worker to ensure that all of the remote
        # functions run on the same worker.
        ray.init(num_workers=1)
        net1 = NetActor()
        net2 = NetActor()

        # Make sure the two networks have different weights. TODO(rkn): Note
        # that equality comparisons of numpy arrays normally do not work.
        # This only works because at the moment they have size 1.
        weights1 = net1.get_weights()
        weights2 = net2.get_weights()
        self.assertNotEqual(weights1, weights2)

        # Set the weights and get the weights, and make sure they are
        # unchanged.
        new_weights1 = net1.set_and_get_weights(weights1)
        new_weights2 = net2.set_and_get_weights(weights2)
        self.assertEqual(weights1, new_weights1)
        self.assertEqual(weights2, new_weights2)

        # Swap the weights.
        new_weights1 = net2.set_and_get_weights(weights1)
        new_weights2 = net1.set_and_get_weights(weights2)
        self.assertEqual(weights1, new_weights1)
        self.assertEqual(weights2, new_weights2)
Example #20
def ray_start_reconstruction(request):
    num_nodes = request.param

    plasma_store_memory = int(0.5 * 10**9)

    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 1,
            "object_store_memory": plasma_store_memory // num_nodes,
            "redis_max_memory": 10**7,
            "_internal_config": json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            })
        })
    for i in range(num_nodes - 1):
        cluster.add_node(
            num_cpus=1,
            object_store_memory=plasma_store_memory // num_nodes,
            _internal_config=json.dumps({
                "initial_reconstruction_timeout_milliseconds": 200
            }))
    ray.init(redis_address=cluster.redis_address)

    yield plasma_store_memory, num_nodes, cluster

    # Clean up the Ray cluster.
    ray.shutdown()
    cluster.shutdown()
Example #21
def driver_0(redis_address, driver_index):
    """The script for driver 0.

    This driver should create five actors that each use one GPU and some actors
    that use no GPUs. After a while, it should exit.
    """
    ray.init(redis_address=redis_address)

    # Wait for all the nodes to join the cluster.
    _wait_for_nodes_to_join(total_num_nodes)

    # Start some long-running tasks. Driver 2 will make sure the workers
    # running these tasks have been killed.
    for i in range(num_long_running_tasks_per_driver):
        long_running_task.remote(driver_index, i, redis_address)

    # Create some actors that require one GPU.
    actors_one_gpu = [Actor1.remote(driver_index, i, redis_address)
                      for i in range(5)]
    # Create some actors that don't require any GPUs.
    actors_no_gpus = [Actor0.remote(driver_index, 5 + i, redis_address)
                      for i in range(5)]

    for _ in range(1000):
        ray.get([actor.check_ids.remote() for actor in actors_one_gpu])
        ray.get([actor.check_ids.remote() for actor in actors_no_gpus])

    # Start a long-running method on one actor and make sure this doesn't
    # affect anything.
    actors_no_gpus[0].long_running_method.remote()

    _broadcast_event("DRIVER_0_DONE", redis_address)
Example #22
    def testFailImportingActor(self):
        ray.init(num_workers=2, driver_mode=ray.SILENT_MODE)

        # Create the contents of a temporary Python file.
        temporary_python_file = """
def temporary_helper_function():
    return 1
"""

        f = tempfile.NamedTemporaryFile(suffix=".py")
        f.write(temporary_python_file.encode("ascii"))
        f.flush()
        directory = os.path.dirname(f.name)
        # Get the module name and strip ".py" from the end.
        module_name = os.path.basename(f.name)[:-3]
        sys.path.append(directory)
        module = __import__(module_name)

        # Define an actor that closes over this temporary module. This should
        # fail when it is unpickled.
        @ray.remote
        class Foo(object):
            def __init__(self):
                self.x = module.temporary_python_file()

            def get_val(self):
                return 1

        # There should be no errors yet.
        self.assertEqual(len(ray.error_info()), 0)

        # Create an actor.
        foo = Foo.remote()

        # Wait for the error to arrive.
        wait_for_errors(b"register_actor", 1)
        self.assertIn(b"No module named", ray.error_info()[0][b"message"])

        # Wait for the error from when the __init__ tries to run.
        wait_for_errors(b"task", 1)
        self.assertIn(
            b"failed to be imported, and so cannot execute this method",
            ray.error_info()[1][b"message"])

        # Check that if we try to get the function it throws an exception and
        # does not hang.
        with self.assertRaises(Exception):
            ray.get(foo.get_val.remote())

        # Wait for the error from the call to get_val.
        wait_for_errors(b"task", 2)
        self.assertIn(
            b"failed to be imported, and so cannot execute this method",
            ray.error_info()[2][b"message"])

        f.close()

        # Clean up the junk we added to sys.path.
        sys.path.pop(-1)
Example #23
 def testCustomModel(self):
     ray.init()
     ModelCatalog.register_custom_model("foo", CustomModel)
     p1 = ModelCatalog.get_model({
         "obs": tf.constant([1, 2, 3])
     }, Box(0, 1, shape=(3, ), dtype=np.float32), Discrete(5), 5,
                                 {"custom_model": "foo"})
     self.assertEqual(str(type(p1)), str(CustomModel))
Example #24
def test_using_hostnames(ray_start_head_local):
    ray.init(node_ip_address="localhost", redis_address="localhost:6379")

    @ray.remote
    def f():
        return 1

    assert ray.get(f.remote()) == 1
Example #25
    def testAdditionalVariablesWithLoss(self):
        ray.init(num_workers=1)

        net = LossActor()
        self.assertEqual(len(net.values[0].variables.items()), 3)
        self.assertEqual(len(net.values[0].placeholders.items()), 3)

        net.values[0].set_weights(net.values[0].get_weights())
    def initialize(self, env, policy, pool):
        super(RemoteSampler, self).initialize(env, policy, pool)

        ray.init()

        env_pkl = pickle.dumps(env)
        policy_pkl = pickle.dumps(policy)

        self._remote_environment = _RemoteEnv.remote(env_pkl, policy_pkl)
Example #27
    def testVariableNameCollision(self):
        ray.init(num_workers=2)

        net1 = NetActor()
        net2 = NetActor()

        # This is checking that the variable names of the two nets are the
        # same, i.e., that the names in the weight dictionaries are the same.
        net1.values[0].set_weights(net2.values[0].get_weights())
Example #28
def test_temp_plasma_store_socket():
    ray.init(plasma_store_socket_name="/tmp/i_am_a_temp_socket")
    assert os.path.exists(
        "/tmp/i_am_a_temp_socket"), "Specified socket path not found."
    ray.shutdown()
    try:
        os.remove("/tmp/i_am_a_temp_socket")
    except Exception:
        pass
Example #29
    def testVersionMismatch(self):
        ray_version = ray.__version__
        ray.__version__ = "fake ray version"

        ray.init(num_workers=1, driver_mode=ray.SILENT_MODE)

        wait_for_errors(b"version_mismatch", 1)

        ray.__version__ = ray_version
Example #30
def ray_start():
    # Start ray instance
    ray.init(num_cpus=1)

    # Run test using this fixture
    yield None

    # Shutdown ray instance
    ray.shutdown()
def test_capture_child_tasks(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_tasks = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_tasks, num_gpus=total_num_tasks)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(
            [{
                "CPU": 2,
                "GPU": 2,
            }, {
                "CPU": 2,
                "GPU": 2,
            }],
            strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current worker/driver
        # doesn't belong to any placement group, it should return None.
        assert get_current_placement_group() is None

        # Test if tasks capture child tasks.
        @ray.remote
        def task():
            return get_current_placement_group()

        @ray.remote
        def create_nested_task(child_cpu, child_gpu, set_none=False):
            assert get_current_placement_group() is not None
            kwargs = {
                "num_cpus": child_cpu,
                "num_gpus": child_gpu,
            }
            if set_none:
                kwargs["placement_group"] = None
            return ray.get([task.options(**kwargs).remote() for _ in range(3)])

        t = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote(1, 0)
        pgs = ray.get(t)
        # Every task should have the current placement group because child
        # tasks are implicitly captured by default.
        assert None not in pgs

        t1 = create_nested_task.options(
            num_cpus=1,
            num_gpus=0,
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote(1, 0, True)
        pgs = ray.get(t1)
        # Every task should have no placement group since it is explicitly
        # set to None.
        assert set(pgs) == {None}

        # Test if tasks don't capture child tasks when the option is off.
        t2 = create_nested_task.options(
            num_cpus=0, num_gpus=1, placement_group=pg).remote(0, 1)
        pgs = ray.get(t2)
        # All placement groups should be None since we don't capture child
        # tasks.
        assert not all(pgs)
def test_automatic_cleanup_job(ray_start_cluster):
    # Make sure the placement groups created by a
    # job, actor, and task are cleaned when the job is done.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 4
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(address=cluster.address)
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

@ray.remote(num_cpus=0)
def f():
    create_pg()

@ray.remote(num_cpus=0)
class A:
    def create_pg(self):
        create_pg()

ray.get(f.remote())
a = A.remote()
ray.get(a.create_pg.remote())
# Create 2 pgs to make sure multiple placement groups that belong
# to a single job will be properly cleaned.
create_pg()
create_pg()

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    available_cpus = ray.available_resources()["CPU"]
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
Example #33
def test_int_dataframe():
    ray.init()

    pandas_df = pd.DataFrame({'col1': [0, 1, 2, 3],
                              'col2': [4, 5, 6, 7],
                              'col3': [8, 9, 10, 11],
                              'col4': [12, 13, 14, 15],
                              'col5': [0, 0, 0, 0]})
    ray_df = rdf.from_pandas(pandas_df, 2)

    testfuncs = [lambda x: x + 1,
                 lambda x: str(x),
                 lambda x: x * x,
                 lambda x: x,
                 lambda x: False]

    keys = ['col1',
            'col2',
            'col3',
            'col4']

    test_roundtrip(ray_df, pandas_df)
    test_index(ray_df, pandas_df)
    test_size(ray_df, pandas_df)
    test_ndim(ray_df, pandas_df)
    test_ftypes(ray_df, pandas_df)
    test_values(ray_df, pandas_df)
    test_axes(ray_df, pandas_df)
    test_shape(ray_df, pandas_df)
    test_add_prefix(ray_df, pandas_df)
    test_add_suffix(ray_df, pandas_df)

    for testfunc in testfuncs:
        test_applymap(ray_df, pandas_df, testfunc)

    test_copy(ray_df)
    test_sum(ray_df, pandas_df)
    test_abs(ray_df, pandas_df)
    test_keys(ray_df, pandas_df)
    test_transpose(ray_df, pandas_df)
    test_round(ray_df, pandas_df)

    test_all(ray_df, pandas_df)
    test_any(ray_df, pandas_df)
    test___getitem__(ray_df, pandas_df)
    test___delitem__(ray_df, pandas_df)
    test___copy__(ray_df, pandas_df)
    test___deepcopy__(ray_df, pandas_df)
    test_bool(ray_df, pandas_df)
    test_count(ray_df, pandas_df)
    test_head(ray_df, pandas_df)
    test_tail(ray_df, pandas_df)
    test_idxmax(ray_df, pandas_df)
    test_idxmin(ray_df, pandas_df)
    test_pop(ray_df, pandas_df)

    for key in keys:
        test_get(ray_df, pandas_df, key)

    test_get_dtype_counts(ray_df, pandas_df)
    test_get_ftype_counts(ray_df, pandas_df)

    test_max(ray_df, pandas_df)
    test_min(ray_df, pandas_df)
    test_notna(ray_df, pandas_df)
    test_notnull(ray_df, pandas_df)
Example #34
def ray_start_single_node():
    # Start a single-node Ray instance with 8 CPUs.
    address_info = ray.init(num_cpus=8)
    yield address_info
    ray.shutdown()
def test_mini_integration(ray_start_cluster, connect_to_client):
    # Create as many bundles as there are GPUs in the cluster.
    # Do some random work and make sure all resources are properly recovered.

    cluster = ray_start_cluster

    num_nodes = 5
    per_bundle_gpus = 2
    gpu_per_node = 4
    total_gpus = num_nodes * per_bundle_gpus * gpu_per_node
    per_node_gpus = per_bundle_gpus * gpu_per_node

    bundles_per_pg = 2
    total_num_pg = total_gpus // (bundles_per_pg * per_bundle_gpus)

    [
        cluster.add_node(num_cpus=2, num_gpus=per_bundle_gpus * gpu_per_node)
        for _ in range(num_nodes)
    ]
    cluster.wait_for_nodes()
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):

        @ray.remote(num_cpus=0, num_gpus=1)
        def random_tasks():
            import time
            import random
            sleep_time = random.uniform(0.1, 0.2)
            time.sleep(sleep_time)
            return True

        pgs = []
        pg_tasks = []
        # total bundle gpu usage = bundles_per_pg*total_num_pg*per_bundle_gpus
        # Note this is half of total
        for index in range(total_num_pg):
            pgs.append(
                ray.util.placement_group(
                    name=f"name{index}",
                    strategy="PACK",
                    bundles=[{
                        "GPU": per_bundle_gpus
                    } for _ in range(bundles_per_pg)]))

        # Schedule tasks.
        for i in range(total_num_pg):
            pg = pgs[i]
            pg_tasks.append([
                random_tasks.options(
                    placement_group=pg,
                    placement_group_bundle_index=bundle_index).remote()
                for bundle_index in range(bundles_per_pg)
            ])

        # Make sure tasks are done and we remove placement groups.
        num_removed_pg = 0
        pg_indexes = [2, 3, 1, 7, 8, 9, 0, 6, 4, 5]
        while num_removed_pg < total_num_pg:
            index = pg_indexes[num_removed_pg]
            pg = pgs[index]
            assert all(ray.get(pg_tasks[index]))
            ray.util.remove_placement_group(pg)
            num_removed_pg += 1

        @ray.remote(num_cpus=2, num_gpus=per_node_gpus)
        class A:
            def ping(self):
                return True

        # Make sure all resources are properly returned by scheduling
        # actors that take up all existing resources.
        actors = [A.remote() for _ in range(num_nodes)]
        assert all(ray.get([a.ping.remote() for a in actors]))
Example #36
        self.register_variables(self.base_model.variables)

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        out, self._value_out = self.base_model(
            [input_dict["obs"], input_dict["is_training"]])
        return out, []

    @override(ModelV2)
    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    ModelCatalog.register_custom_model("bn_model", BatchNormModel)

    config = {
        "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
        "model": {
            "custom_model": "bn_model",
        },
        "num_workers": 0,
    }

    from ray.rllib.agents.ppo import PPOTrainer
    trainer = PPOTrainer(config=config)
    trainer.train()
Example #37
import unittest

import ray
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.tests.test_rollout_worker import MockPolicy


class TestPerf(unittest.TestCase):
    # Tested on Intel(R) Core(TM) i7-4600U CPU @ 2.10GHz
    # 11/23/18: Samples per second 8501.125113727468
    # 03/01/19: Samples per second 8610.164353268685
    def testBaselinePerformance(self):
        for _ in range(20):
            ev = RolloutWorker(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy=MockPolicy,
                batch_steps=100)
            start = time.time()
            count = 0
            while time.time() - start < 1:
                count += ev.sample().count
            print()
            print("Samples per second {}".format(
                count / (time.time() - start)))
            print()


if __name__ == "__main__":
    ray.init(num_cpus=5)
    unittest.main(verbosity=2)
Example #38
    def __init__(self, game_name, config=None, split_resources_in=1):
        # Load the game and the config from the module with the game name
        try:
            game_module = importlib.import_module("games." + game_name)
            self.Game = game_module.Game
            self.config = game_module.MuZeroConfig()
        except ModuleNotFoundError as err:
            print(
                f'{game_name} is not a supported game name, try "cartpole" or refer to the documentation for adding a new game.'
            )
            raise err

        # Overwrite the config
        if config:
            if type(config) is dict:
                for param, value in config.items():
                    setattr(self.config, param, value)
            else:
                self.config = config

        # Fix random generator seed
        numpy.random.seed(self.config.seed)
        torch.manual_seed(self.config.seed)

        # Manage GPUs
        if self.config.max_num_gpus == 0 and (self.config.selfplay_on_gpu
                                              or self.config.train_on_gpu
                                              or self.config.reanalyse_on_gpu):
            raise ValueError(
                "Inconsistent MuZeroConfig: max_num_gpus = 0 but GPU requested by selfplay_on_gpu or train_on_gpu or reanalyse_on_gpu."
            )
        if (self.config.selfplay_on_gpu or self.config.train_on_gpu
                or self.config.reanalyse_on_gpu):
            total_gpus = (self.config.max_num_gpus if self.config.max_num_gpus
                          is not None else torch.cuda.device_count())
        else:
            total_gpus = 0
        self.num_gpus = total_gpus / split_resources_in
        if 1 < self.num_gpus:
            self.num_gpus = math.floor(self.num_gpus)

        ray.init(num_gpus=total_gpus, ignore_reinit_error=True)

        # Checkpoint and replay buffer used to initialize workers
        self.checkpoint = {
            "weights": None,
            "optimizer_state": None,
            "total_reward": 0,
            "muzero_reward": 0,
            "opponent_reward": 0,
            "episode_length": 0,
            "mean_value": 0,
            "training_step": 0,
            "lr": 0,
            "total_loss": 0,
            "value_loss": 0,
            "reward_loss": 0,
            "policy_loss": 0,
            "num_played_games": 0,
            "num_played_steps": 0,
            "num_reanalysed_games": 0,
            "terminate": False,
        }
        self.replay_buffer = {}

        cpu_actor = CPUActor.remote()
        cpu_weights = cpu_actor.get_initial_weights.remote(self.config)
        self.checkpoint["weights"], self.summary = copy.deepcopy(
            ray.get(cpu_weights))

        # Workers
        self.self_play_workers = None
        self.test_worker = None
        self.training_worker = None
        self.reanalyse_worker = None
        self.replay_buffer_worker = None
        self.shared_storage_worker = None
Example #39
def ray_start_single_node_2_gpus():
    # Please start this fixture in a cluster with 2 GPUs.
    address_info = ray.init(num_gpus=2)
    yield address_info
    ray.shutdown()
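A hedged sketch (not part of the original example) of a test built on the fixture above, assuming it is registered with @pytest.fixture and that two GPUs are actually available; the remote function and assertion are illustrative.

import ray


@ray.remote(num_gpus=1)
def gpu_ids():
    # Each invocation is assigned one of the two GPUs by the raylet.
    return ray.get_gpu_ids()


def test_two_gpu_tasks(ray_start_single_node_2_gpus):
    results = ray.get([gpu_ids.remote() for _ in range(2)])
    assert len(results) == 2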
Example #40
 def setUpClass(cls) -> None:
     ray.init()
Example #41
def main(num_actors,
         env_name="CartPole-v0",
         batch_size=16,
         update_iter=16,
         gamma=0.97,
         eta=0.9,
         alpha=0.9,
         burnin_length=4,
         unroll_length=4):

    s = time.time()

    ray.init(local_mode=False)

    logdir = Path(__file__).parent / "log"
    if logdir.exists():
        shutil.rmtree(logdir)
    summary_writer = tf.summary.create_file_writer(str(logdir))

    history = []

    epsilons = np.linspace(0.1, 0.8, num_actors) if num_actors > 1 else [0.3]
    actors = [
        Actor.remote(pid=i,
                     env_name=env_name,
                     epsilon=epsilons[i],
                     gamma=gamma,
                     eta=eta,
                     alpha=alpha,
                     burnin_length=burnin_length,
                     unroll_length=unroll_length) for i in range(num_actors)
    ]

    replay = SegmentReplayBuffer(buffer_size=2**12)

    learner = Learner.remote(env_name=env_name,
                             gamma=gamma,
                             eta=eta,
                             alpha=alpha,
                             burnin_length=burnin_length,
                             unroll_length=unroll_length)

    current_weights = ray.get(learner.define_network.remote())
    current_weights = ray.put(current_weights)

    tester = Tester.remote(env_name=env_name)

    wip_actors = [
        actor.sync_weights_and_rollout.remote(current_weights)
        for actor in actors
    ]

    for _ in range(10):
        finished, wip_actors = ray.wait(wip_actors, num_returns=1)
        priorities, segments, pid = ray.get(finished[0])
        replay.add(priorities, segments)
        wip_actors.extend(
            [actors[pid].sync_weights_and_rollout.remote(current_weights)])

    # minibatchs: (indices, weights, segments)
    minibatchs = [
        replay.sample_minibatch(batch_size=batch_size)
        for _ in range(update_iter)
    ]
    wip_learner = learner.update_network.remote(minibatchs)

    wip_tester = tester.test_play.remote(current_weights, epsilon=0.05)

    minibatchs = [
        replay.sample_minibatch(batch_size=batch_size)
        for _ in range(update_iter)
    ]

    learner_cycles = 1
    actor_cycles = 0
    n_segment_added = 0

    print("Start learning!!")
    while learner_cycles <= 50:
        actor_cycles += 1
        finished, wip_actors = ray.wait(wip_actors, num_returns=1)
        priorities, segments, pid = ray.get(finished[0])
        replay.add(priorities, segments)
        wip_actors.extend(
            [actors[pid].sync_weights_and_rollout.remote(current_weights)])
        n_segment_added += len(segments)

        finished_learner, _ = ray.wait([wip_learner], timeout=0)
        if finished_learner:
            current_weights, indices, priorities, loss = ray.get(
                finished_learner[0])
            wip_learner = learner.update_network.remote(minibatchs)
            current_weights = ray.put(current_weights)

            #: Assumes that updating priorities and building minibatches is much faster than the learner's update.
            replay.update_priority(indices, priorities)
            minibatchs = [
                replay.sample_minibatch(batch_size=batch_size)
                for _ in range(update_iter)
            ]

            print("Actor cycle:", actor_cycles, "Added:", n_segment_added)

            learner_cycles += 1
            actor_cycles = 0
            n_segment_added = 0

            with summary_writer.as_default():
                tf.summary.scalar("learner_loss", loss, step=learner_cycles)

            if learner_cycles % 5 == 0:

                test_rewards = ray.get(wip_tester)
                history.append((learner_cycles - 5, test_rewards))
                wip_tester = tester.test_play.remote(current_weights,
                                                     epsilon=0.05)
                print("Cycle:", learner_cycles, "Score:", test_rewards)

                with summary_writer.as_default():
                    tf.summary.scalar("test_rewards",
                                      test_rewards,
                                      step=learner_cycles)
                    tf.summary.scalar("buffer_size",
                                      len(replay),
                                      step=learner_cycles)

    wallclocktime = round(time.time() - s, 2)
    cycles, scores = zip(*history)
    plt.plot(cycles, scores)
    plt.title(f"total time: {wallclocktime} sec")
    plt.ylabel("test_score(epsilon=0.1)")
    plt.savefig("log/history.png")
Example #42
    arr = np.ones(OBJECT_SIZE, dtype=np.uint8)
    ref = ray.put(arr)

    for actor in tqdm(actors, desc="Ensure all actors have started."):
        ray.get(actor.foo.remote())

    result_refs = []
    for actor in tqdm(actors, desc="Broadcasting objects"):
        result_refs.append(actor.sum.remote(ref))

    results = ray.get(result_refs)
    for result in results:
        assert result == OBJECT_SIZE


ray.init(address="auto")
start = perf_counter()
test_object_broadcast()
end = perf_counter()
print(f"Broadcast time: {end - start} ({OBJECT_SIZE} B x {NUM_NODES} nodes)")

if "TEST_OUTPUT_JSON" in os.environ:
    out_file = open(os.environ["TEST_OUTPUT_JSON"], "w")
    results = {
        "broadcast_time": end - start,
        "object_size": OBJECT_SIZE,
        "num_nodes": NUM_NODES,
        "success": "1"
    }
    json.dump(results, out_file)
def test_capture_child_actors(ray_start_cluster, connect_to_client):
    cluster = ray_start_cluster
    total_num_actors = 4
    for _ in range(2):
        cluster.add_node(num_cpus=total_num_actors)
    ray.init(address=cluster.address)

    with connect_to_client_or_not(connect_to_client):
        pg = ray.util.placement_group(
            [{
                "CPU": 2
            }, {
                "CPU": 2
            }], strategy="STRICT_PACK")
        ray.get(pg.ready())

        # If get_current_placement_group is used when the current worker/driver
        # doesn't belong to any placement group, it should return None.
        assert get_current_placement_group() is None

        # Test actors first.
        @ray.remote(num_cpus=1)
        class NestedActor:
            def ready(self):
                return True

        @ray.remote(num_cpus=1)
        class Actor:
            def __init__(self):
                self.actors = []

            def ready(self):
                return True

            def schedule_nested_actor(self):
                # Make sure we can capture the current placement group.
                assert get_current_placement_group() is not None
                # Actors should be implicitly captured.
                actor = NestedActor.remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

            def schedule_nested_actor_outside_pg(self):
                # Don't use placement group.
                actor = NestedActor.options(placement_group=None).remote()
                ray.get(actor.ready.remote())
                self.actors.append(actor)

        a = Actor.options(
            placement_group=pg,
            placement_group_capture_child_tasks=True).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure all the actors are scheduled on the same node
        # (the placement group uses the STRICT_PACK strategy).
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        # Since all node ids should be identical, the set size should be 1.
        assert len(node_id_set) == 1

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Now create an actor, but do not capture the current tasks
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor.remote())
        # Make sure the actors are not all scheduled on the same node,
        # because the nested actors are not captured by the same
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2

        # Kill an actor and wait until it is killed.
        kill_actor_and_wait_for_failure(a)
        with pytest.raises(ray.exceptions.RayActorError):
            ray.get(a.ready.remote())

        # Lastly, make sure when None is specified, actors are not scheduled
        # on the same placement group.
        a = Actor.options(placement_group=pg).remote()
        ray.get(a.ready.remote())
        # 1 top level actor + 3 children.
        for _ in range(total_num_actors - 1):
            ray.get(a.schedule_nested_actor_outside_pg.remote())
        # Make sure the actors are not all scheduled on the same node,
        # because the nested actors are scheduled outside the
        # placement group.
        node_id_set = set()
        for actor_info in ray.state.actors().values():
            if actor_info["State"] == convert_actor_state(
                    gcs_utils.ActorTableData.ALIVE):
                node_id = actor_info["Address"]["NodeID"]
                node_id_set.add(node_id)

        assert len(node_id_set) == 2
Example #44
assert (num_nodes * object_store_memory + num_redis_shards * redis_max_memory <
        ray.utils.get_system_memory() / 2)

# Simulate a cluster on one machine.

cluster = Cluster()
for i in range(num_nodes):
    cluster.add_node(
        redis_port=6379 if i == 0 else None,
        num_redis_shards=num_redis_shards if i == 0 else None,
        num_cpus=4,
        num_gpus=0,
        resources={str(i): 5},
        object_store_memory=object_store_memory,
        redis_max_memory=redis_max_memory)
ray.init(address=cluster.address)

# Run the workload.

# Define a driver script that runs a few tasks and actors on each node in the
# cluster.
driver_script = """
import ray

ray.init(address="{}")

num_nodes = {}


@ray.remote
def f():
Example #45
parse.add_argument("--batch_size", default=256, type=int, help="train batch size")
parse.add_argument("--lr", default=0.625e-4, type=float, help="learning rate")
parse.add_argument("--suffix", default="default", help="suffix for saving folders")
parse.add_argument("--speed", default=8, type=int, help="training data consum speed. default 8x data generate speed")
args = parse.parse_args()

n_worker = args.num_agents
n_loader = args.num_loaders
env_name = "{}NoFrameskip-v4".format(args.env_name)
buffer = "multi_workers_buffer"
batch_size = args.batch_size
lr = args.lr
suffix = args.suffix
speed = args.speed
n_iter = 40*32//batch_size
ray.init(num_cpus=1+2*n_worker+n_loader, object_store_memory=1*1024**3, memory=6*1024**3)


buffer = lmdb_op.init(buffer)
workers = [ray.remote(DQN_Worker).options(num_gpus=0.1).remote(env_name=env_name, db=buffer, db_write=lmdb_op.write) for _ in range(n_worker)]
test_worker = ray.remote(DQN_Worker).options(num_gpus=0.1).remote(env_name=env_name, phase="test", suffix=suffix)
worker_id = {worker: "worker_{}".format(i) for i, worker in enumerate(workers)}
dataloader = Dataloader(buffer, lmdb_op, worker_num=n_loader, batch_size=batch_size, batch_num=n_iter)
opt = ray.remote(Optimizer).options(num_gpus=0.3).remote(dataloader, env_name, suffix=suffix, iter_steps=n_iter, update_period=10000, lr=lr)
sche = Sched()
eps = 1
opt_start = False
glog = SummaryWriter("./logdir/{}/{}/{}.lr{}.batch{}".format(env_name, suffix, Optimizer.__name__, lr, batch_size))
model_save_period = 100000
train_step = 0
model_idx = 0
    register_env(gym_name, create_env)
    return alg_run, gym_name, config


if __name__ == "__main__":

    parser = ArgumentParser()
    parser.add_argument("-s",
                        "--seeds_file",
                        dest="seeds_file",
                        help="pickle file containing seeds",
                        default=None)
    args = parser.parse_args()

    alg_run, gym_name, config = setup_exps(args.seeds_file)
    ray.init(num_cpus=N_CPUS + 1)
    trials = run_experiments(
        {
            flow_params["exp_tag"]: {
                "run": alg_run,
                "env": gym_name,
                "config": {
                    **config
                },
                "restore":
                '/home/flow/ray_results/central_i696_IDMJunction_horizon2000_Ignore/PPO_MergePOEnv_Ignore-v0_0_2020-04-25_13-49-44m7fk71uh/checkpoint_230',
                #"resume":True,
                "checkpoint_freq": 10,  #20,
                "checkpoint_at_end": True,
                "max_failures": 999,
                "stop": {
def test_atomic_creation(ray_start_cluster, connect_to_client):
    # Setup cluster.
    cluster = ray_start_cluster
    bundle_cpu_size = 2
    bundle_per_node = 2
    num_nodes = 2

    [
        cluster.add_node(num_cpus=bundle_cpu_size * bundle_per_node)
        for _ in range(num_nodes)
    ]
    ray.init(address=cluster.address)

    @ray.remote(num_cpus=1)
    class NormalActor:
        def ping(self):
            pass

    @ray.remote(num_cpus=3)
    def bothering_task():
        time.sleep(6)
        return True

    with connect_to_client_or_not(connect_to_client):
        # Schedule tasks to fail initial placement group creation.
        tasks = [bothering_task.remote() for _ in range(2)]

        # Make sure the two bothering tasks have been scheduled.
        def tasks_scheduled():
            return ray.available_resources()["CPU"] == 2.0

        wait_for_condition(tasks_scheduled)

        # Create an actor that will fail bundle scheduling.
        # It is important to use pack strategy to make test less flaky.
        pg = ray.util.placement_group(
            name="name",
            strategy="SPREAD",
            bundles=[{
                "CPU": bundle_cpu_size
            } for _ in range(num_nodes * bundle_per_node)])

        # Create a placement group actor.
        # This shouldn't be scheduled because atomic
        # placement group creation should've failed.
        pg_actor = NormalActor.options(
            placement_group=pg,
            placement_group_bundle_index=num_nodes * bundle_per_node -
            1).remote()

        # Wait on the placement group now. It should be unready
        # because the normal actor takes resources that are required
        # to create one of the bundles.
        ready, unready = ray.wait([pg.ready()], timeout=0.5)
        assert len(ready) == 0
        assert len(unready) == 1
        # Wait until all tasks are done.
        assert all(ray.get(tasks))

        # Wait on the placement group creation. Since resources are now
        # available, it should be ready soon.
        ready, unready = ray.wait([pg.ready()])
        assert len(ready) == 1
        assert len(unready) == 0

        # Confirm that the placement group actor is created. It will
        # raise an exception if the actor was scheduled before the
        # placement group was created, thus checking atomicity.
        ray.get(pg_actor.ping.remote(), timeout=3.0)
        ray.kill(pg_actor)

        # Make sure atomic creation failure didn't impact resources.
        @ray.remote(num_cpus=bundle_cpu_size)
        def resource_check():
            return True

        # These should hang because all resources
        # are claimed by the placement group.
        check_without_pg = [
            resource_check.remote() for _ in range(bundle_per_node * num_nodes)
        ]

        # These should all be scheduled, one on each bundle.
        check_with_pg = [
            resource_check.options(
                placement_group=pg, placement_group_bundle_index=i).remote()
            for i in range(bundle_per_node * num_nodes)
        ]

        # Make sure these are hanging.
        ready, unready = ray.wait(check_without_pg, timeout=0)
        assert len(ready) == 0
        assert len(unready) == bundle_per_node * num_nodes

        # Make sure these are all scheduled.
        assert all(ray.get(check_with_pg))

        ray.util.remove_placement_group(pg)

        def pg_removed():
            return ray.util.placement_group_table(pg)["state"] == "REMOVED"

        wait_for_condition(pg_removed)

        # Make sure the checks without a placement group are all
        # scheduled properly now that resources are cleaned up.
        assert all(ray.get(check_without_pg))
Example #48

def test_client_serialize_addon(call_ray_stop_only):
    import ray
    import pydantic

    ray.init(num_cpus=0)

    class User(pydantic.BaseModel):
        name: str

    with ray_start_client_server() as ray:
        assert ray.get(ray.put(User(name="ray"))).name == "ray"


object_ref_cleanup_script = """
import ray

ray.init("ray://localhost:50051")

@ray.remote
def f():
    return 42

@ray.remote
class SomeClass:
    pass


obj_ref = f.remote()
actor_ref = SomeClass.remote()
Example #49
        max_episode_steps=200,
        kwargs={})


def create_env(env_config):
    pass_params_to_gym(env_name)
    env = gym.envs.make(env_name)
    return env


if __name__ == '__main__':
    register_env(env_name, lambda env_config: create_env(env_config))
    config = ppo.DEFAULT_CONFIG.copy()
    horizon = 10
    num_cpus = 4
    ray.init(num_cpus=num_cpus, redirect_output=True)
    config["num_workers"] = num_cpus
    config["train_batch_size"] = 1000
    config["num_sgd_iter"] = 10
    config["gamma"] = 0.999
    config["horizon"] = horizon
    config["use_gae"] = False
    config["model"].update({"fcnet_hiddens": [256, 256]})
    options = {
        "multiagent_obs_shapes": [2, 2],
        "multiagent_act_shapes": [1, 1],
        "multiagent_shared_model": False,
        "multiagent_fcnet_hiddens": [[32, 32]] * 2
    }
    config["model"].update({"custom_options": options})
    alg = ppo.PPOAgent(env=env_name, config=config)
def test_automatic_cleanup_detached_actors(ray_start_cluster):
    # Make sure the placement groups created by
    # detached actors are cleaned up properly.
    cluster = ray_start_cluster
    num_nodes = 3
    num_cpu_per_node = 2
    # Create a 3-node cluster.
    for _ in range(num_nodes):
        cluster.add_node(num_cpus=num_cpu_per_node)
    cluster.wait_for_nodes()

    info = ray.init(
        address=cluster.address, namespace="default_test_namespace")
    available_cpus = ray.available_resources()["CPU"]
    assert available_cpus == num_nodes * num_cpu_per_node

    driver_code = f"""
import ray

ray.init(address="{info["redis_address"]}", namespace="default_test_namespace")

def create_pg():
    pg = ray.util.placement_group(
            [{{"CPU": 1}} for _ in range(3)],
            strategy="STRICT_SPREAD")
    ray.get(pg.ready())
    return pg

# TODO(sang): Placement groups created by tasks launched by a detached actor
# are not cleaned up with the current protocol.
# @ray.remote(num_cpus=0)
# def f():
#     create_pg()

@ray.remote(num_cpus=0, max_restarts=1)
class A:
    def create_pg(self):
        create_pg()
    def create_child_pg(self):
        self.a = A.options(name="B").remote()
        ray.get(self.a.create_pg.remote())
    def kill_child_actor(self):
        ray.kill(self.a)
        try:
            ray.get(self.a.create_pg.remote())
        except Exception:
            pass

a = A.options(lifetime="detached", name="A").remote()
ray.get(a.create_pg.remote())
# TODO(sang): Currently, child tasks are cleaned when a detached actor
# is dead. We cannot test this scenario until it is fixed.
# ray.get(a.create_child_pg.remote())

ray.shutdown()
    """

    run_string_as_driver(driver_code)

    # Wait until the driver is reported as dead by GCS.
    def is_job_done():
        jobs = ray.state.jobs()
        for job in jobs:
            if job["IsDead"]:
                return True
        return False

    def assert_num_cpus(expected_num_cpus):
        if expected_num_cpus == 0:
            return "CPU" not in ray.available_resources()
        return ray.available_resources()["CPU"] == expected_num_cpus

    wait_for_condition(is_job_done)
    wait_for_condition(lambda: assert_num_cpus(num_nodes))

    # Make sure when a child actor spawned by a detached actor
    # is killed, the placement group is removed.
    a = ray.get_actor("A")
    # TODO(sang): Children of detached actors
    # seem to be killed when jobs are done. We should fix this before
    # testing this scenario.
    # ray.get(a.kill_child_actor.remote())
    # assert assert_num_cpus(num_nodes)

    # Make sure placement groups are cleaned when detached actors are killed.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
    # The detached actor a should've been restarted.
    # Recreate a placement group.
    ray.get(a.create_pg.remote())
    wait_for_condition(lambda: assert_num_cpus(num_nodes))
    # Kill it again and make sure the placement group
    # it created is deleted again.
    ray.kill(a, no_restart=False)
    wait_for_condition(lambda: assert_num_cpus(num_nodes * num_cpu_per_node))
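
# Hedged, minimal sketch of the pattern exercised above: a detached actor
# reserves a placement group, and killing the actor is expected to release the
# reserved CPU. The actor name, namespace, and bundle shape are assumptions.
import ray
from ray.util import placement_group

ray.init(num_cpus=2, namespace="pg_cleanup_demo")

@ray.remote(num_cpus=0)
class Holder:
    def reserve(self):
        self.pg = placement_group([{"CPU": 1}])
        ray.get(self.pg.ready())

holder = Holder.options(lifetime="detached", name="holder").remote()
ray.get(holder.reserve.remote())
print(ray.available_resources().get("CPU"))  # Expected to drop to 1.0.

# Killing the detached actor should eventually free the bundle's CPU again.
ray.kill(holder)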
Exemple #51
0
import time
import ray


@ray.remote
def f():
    time.sleep(0.01)
    return ray.services.get_node_ip_address()


ray.init(redis_address='master:6379')

# Get a list of the IP addresses of the nodes that have joined the cluster.
ids = set(ray.get([f.remote() for _ in range(1000)]))
print(ids)
Exemple #52
0
    N_ROWS = args.num_rows  # number of row of bidirectional lanes
    N_COLUMNS = args.num_cols  # number of columns of bidirectional lanes

    flow_params = make_flow_params(N_ROWS, N_COLUMNS, EDGE_INFLOW)

    upload_dir = args.upload_dir
    RUN_MODE = args.run_mode
    ALGO = args.algo

    if ALGO == 'PPO':
        alg_run, env_name, config = setup_exps_PPO(flow_params)
    else:
        raise NotImplementedError

    if RUN_MODE == 'local':
        ray.init(num_cpus=N_CPUS + 1)
        N_ITER = 1
    elif RUN_MODE == 'cluster':
        ray.init(redis_address="localhost:6379")
        N_ITER = 2000

    exp_tag = {
        'run': alg_run,
        'env': env_name,
        'checkpoint_freq': 25,
        "max_failures": 10,
        'stop': {
            'training_iteration': N_ITER
        },
        'config': config,
        "num_samples": 1,
Exemple #53
0
def ray_start_2_cpus():
    address_info = ray.init(num_cpus=2)
    yield address_info
    # The code after the yield will run as teardown code.
    ray.shutdown()
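
# Hedged sketch of how a fixture like the one above is usually declared and
# consumed; the @pytest.fixture decorator and the test body are assumptions
# added here for illustration.
import pytest
import ray

@pytest.fixture
def ray_start_2_cpus():
    address_info = ray.init(num_cpus=2)
    yield address_info
    ray.shutdown()

def test_parallel_tasks(ray_start_2_cpus):
    @ray.remote(num_cpus=1)
    def one():
        return 1

    assert sum(ray.get([one.remote() for _ in range(2)])) == 2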
Exemple #54
0
import io
import base64
from IPython.display import HTML

from dqn_model import DQNModel
from dqn_model import _DQNModel
from memory_remote import ReplayBuffer_remote

import matplotlib.pyplot as plt
from custom_cartpole import CartPoleEnv


# matplotlib inline

ray.shutdown()
ray.init(
    include_webui=False,
    ignore_reinit_error=True,
    redis_max_memory=500000000,
    object_store_memory=5000000000,
    temp_dir="../../tmp/",
)

FloatTensor = torch.FloatTensor



# # Set the Env name and action space for CartPole
# ENV_NAME = 'CartPole-v0'
# # Move left, Move right
# ACTION_DICT = {"LEFT": 0, "RIGHT": 1}
# # Register the environment
# env_CartPole = gym.make(ENV_NAME)
#
# # Set result saving folder
# result_floder = ENV_NAME
# result_file = ENV_NAME + "/results.txt"
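
# Hedged sketch of what a remote replay buffer actor (in the spirit of the
# ReplayBuffer_remote import above) typically looks like; this is an
# illustrative assumption, not the actual class from memory_remote.
import random
import ray

@ray.remote
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = []

    def add(self, transition):
        # Drop the oldest transition once the buffer is full.
        if len(self.storage) >= self.capacity:
            self.storage.pop(0)
        self.storage.append(transition)

    def sample(self, batch_size):
        return random.sample(self.storage, min(batch_size, len(self.storage)))

# Usage (hypothetical): buffer = ReplayBuffer.remote(capacity=50000), then
# buffer.add.remote(transition) from workers and ray.get(buffer.sample.remote(32))
# from the learner.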
Exemple #55
0
def main(args):
    # ====================================
    # init env config
    # ====================================
    if args.no_debug:
        ray.init(webui_host="127.0.0.1")
    else:
        ray.init(local_mode=True, webui_host="127.0.0.1")
    # use ray cluster for training
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    #
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    agent_specs = {"AGENT-007": agent_spec}

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": agent_specs,
    }

    # ====================================
    # init tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        def __init__(self, env_config):
            env_config["scenarios"] = [
                scenario_paths[(env_config.worker_index - 1) %
                               len(scenario_paths)]
            ]
            super(MultiEnv, self).__init__(config=env_config)

    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (
                    None,
                    OBSERVATION_SPACE,
                    ACTION_SPACE,
                    {},
                )
            },
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "learning_starts": 10000,
        "buffer_size": 50000,
        "rollout_fragment_length": 4,
        "train_batch_size": 32,

        # "exploration_config": {
        #     "epsilon_timesteps": 200000,
        #     "final_epsilon": .01
        # },
        "timesteps_per_iteration": 10000,
        "dueling": False,
        "prioritized_replay": False,

        # "model": {
        #     "fcnet_hiddens": [256, 256],
        #     "fcnet_activation": "relu",
        # },
        "exploration_config": {
            # Exploration sub-class by name or full path to module+class
            # (e.g. “ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy”)
            # "type": "zhr_train_rllib.multiobj_episilongreedy.EpsilonGreedy",
            # Parameters for the Exploration class' constructor:
            "initial_epsilon": 1.0,
            "final_epsilon": 0.02,
            "epsilon_timesteps":
            200000,  # Timesteps over which to anneal epsilon.
        },

        # "observation_filter": "MeanStdFilter",

        # "model":{
        #     "use_lstm": True,
        # },
    }

    tune_config.update({
        "gamma": 0.995,
        "decompose_num": 2,
        # "seed_global": tune.grid_search([10, 20, 30, 40])
    })

    num_samples = 4
    if not args.no_debug:
        tune_config.update({
            "learning_starts": 1000,
            "buffer_size": 5000,
            "timesteps_per_iteration": 1000,
        })
        num_samples = 1

    # ====================================
    # init log and checkpoint dir_info
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario=args.exper,
        algorithm="DQN",
        n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")
    else:
        restore_path = None

    # run experiments
    analysis = tune.run(
        DQN,
        # "DQN",
        name=experiment_name,
        stop={"timesteps_total": 10000000 / 2},
        checkpoint_freq=10,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
        num_samples=num_samples,
    )

    print(analysis.dataframe().head())
Exemple #56
0
def tune_random(name, exp, num_samples=2, seed_value=None, **digit_kwargs):
    """Tune hyperparameters of any bandit experiment."""

    # ------------------------------------------------------------------------
    # Init seed
    prng = np.random.RandomState(seed_value)

    # Init ray
    if not ray.is_initialized():
        if "debug" in digit_kwargs:
            ray.init(local_mode=digit_kwargs["debug"])
        else:
            ray.init()

    # ------------------------------------------------------------------------
    # Create the train function. We define it in this scope so we can control
    # whether it gets remoted to GPUs or not.
    try:
        if digit_kwargs["use_gpu"]:

            @ray.remote(num_gpus=0.25)
            def train(name=None, exp_func=None, config=None):
                trial = exp_func(**config)
                trial.update({"config": config, "name": name})
                return trial
        else:

            @ray.remote
            def train(name=None, exp_func=None, config=None):
                trial = exp_func(**config)
                trial.update({"config": config, "name": name})
                return trial
    except KeyError:

        @ray.remote
        def train(name=None, exp_func=None, config=None):
            trial = exp_func(**config)
            trial.update({"config": config, "name": name})
            return trial

    # ------------------------------------------------------------------------
    # Init:
    # Separate name from path
    path, name = os.path.split(name)

    # Look up the bandit run function we're using in this tuning.
    exp_func = getattr(glia_digits, exp)

    # ------------------------------------------------------------------------
    # Run!
    # Set up the parallel workers
    runs = []
    for i in range(num_samples):
        # Make a new HP sample
        params = {}
        for k, v in digit_kwargs.items():
            if isinstance(v, str):
                params[k] = v
            elif isinstance(v, bool):
                params[k] = v
            elif isinstance(v, int):
                params[k] = v
            elif isinstance(v, float):
                params[k] = v
            else:
                low, high = v
                params[k] = prng.uniform(low=low, high=high)
        runs.append(train.remote(i, exp_func, params))

    trials = [ray.get(r) for r in runs]

    # ------------------------------------------------------------------------
    # Save the configs and 'correct' scores (the full model data is dropped):
    best = get_best_trial(trials, 'correct')

    # Best trial config
    best_config = best["config"]
    best_config.update(get_best_result(trials, 'correct'))
    save_checkpoint(best_config,
                    filename=os.path.join(path, name + "_best.pkl"))

    # Sort and save the configs of all trials
    sorted_configs = {}
    for i, trial in enumerate(get_sorted_trials(trials, 'correct')):
        sorted_configs[i] = trial["config"]
        sorted_configs[i].update({"correct": trial["correct"]})
    save_checkpoint(sorted_configs,
                    filename=os.path.join(path, name + "_sorted.pkl"))

    # kill ray
    ray.shutdown()

    # -
    return best, trials
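
# Hedged usage sketch for tune_random: scalar keyword arguments are passed
# through unchanged, while (low, high) pairs are sampled uniformly per trial.
# The experiment name and keyword names below are illustrative assumptions.
#
#   best, trials = tune_random(
#       "runs/tune_digits", exp="train_digits",
#       num_samples=8, seed_value=42,
#       lr=(1e-4, 1e-2),   # sampled uniformly in [1e-4, 1e-2]
#       num_epochs=10,     # passed through as-is
#       use_gpu=False,
#   )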
Exemple #57
0
def ray_init_with_task_retry_delay():
    address = ray.init(_system_config={"task_retry_delay_ms": 100})
    yield address
    ray.shutdown()
Exemple #58
0
from tarfile import TarFile  # assumed source of the TarFile used in get_openface() below

import torch
from scipy.signal import savgol_filter
from tqdm import tqdm

import ffmpeg
import ray
from data_segments.get_data_segments import (get_flame_params_for_file,
                                             get_segments_v2)
from misc.read_n_write import flame2glow
from misc.shared import DATA_DIR
from misc.utils import frames2ms, get_participant

WIN_LEN = 9
ONLY_ODD = True

ray.init(num_cpus=10)

segments = get_segments_v2()

sessions_dir = DATA_DIR / "Sessions_50fps_pytorch_data/sessions"


def to_iterator(obj_ids):
    while obj_ids:
        done, obj_ids = ray.wait(obj_ids)
        yield ray.get(done[0])
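
# Hedged usage sketch for to_iterator: it yields results in completion order,
# so progress can be reported as tasks finish instead of blocking on the whole
# batch. The process_file task and the files list below are assumptions.
#
#   refs = [process_file.remote(f) for f in files]
#   for result in tqdm(to_iterator(refs), total=len(refs)):
#       handle(result)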


def get_openface(file_):

    tar_file = TarFile(file_)
Exemple #59
0
            filename="smoketest.parquet",
            num_rows=args.num_workers * 500,
            num_features=4,
            num_classes=2,
            num_partitions=args.num_workers * 10)
        use_gpu = False
    else:
        path = args.file
        if not os.path.exists(path):
            raise ValueError(
                f"Benchmarking data not found: {path}."
                f"\nFIX THIS by running `python create_test_data.py` first.")

    init_start = time.time()
    if args.smoke_test:
        ray.init(num_cpus=num_workers)
    else:
        ray.init(address="auto")
    init_taken = time.time() - init_start

    full_start = time.time()
    bst, train_taken = train_ray(path=path,
                                 num_workers=num_workers,
                                 num_boost_rounds=num_boost_rounds,
                                 num_files=num_files,
                                 regression=args.regression,
                                 use_gpu=use_gpu,
                                 smoke_test=args.smoke_test)
    full_taken = time.time() - full_start
    print(f"TOTAL TIME TAKEN: {full_taken:.2f} seconds "
          f"({init_taken:.2f} for init)")
Exemple #60
0
    @classmethod
    def setUpClass(cls):
        ray.init()