def test_allreduce_different_dtype(ray_start_single_node, dtype, backend):
    """Allreduce of all-ones buffers sums to world_size for every dtype."""
    num_workers = 2
    actors, _ = create_collective_workers(num_workers, backend=backend)
    ray.wait([actor.set_buffer.remote(np.ones(10, dtype=dtype)) for actor in actors])
    reduced = ray.get([actor.do_allreduce.remote() for actor in actors])
    expected = np.ones((10,), dtype=dtype) * num_workers
    for rank in range(num_workers):
        assert (reduced[rank] == expected).all()
def test_reduce_different_name(
    ray_start_single_node, group_name, array_size, dst_rank, backend
):
    """Point-to-point send/recv under a custom group name.

    NOTE(review): despite its name this test exercises do_send/do_recv,
    not do_reduce — consider renaming it (kept as-is so pytest selection
    by name is unaffected).
    """
    num_workers = 2
    actors, _ = create_collective_workers(
        num_workers=num_workers, group_name=group_name, backend=backend
    )
    # Rank i starts with a buffer full of (i + 1)s.
    ray.wait([
        actor.set_buffer.remote(np.ones(array_size, dtype=np.float32) * (rank + 1))
        for rank, actor in enumerate(actors)
    ])
    src_rank = 1 - dst_rank
    refs = [
        actor.do_recv.remote(group_name, src_rank)
        if rank == dst_rank
        else actor.do_send.remote(group_name, dst_rank)
        for rank, actor in enumerate(actors)
    ]
    # After the transfer both ranks hold the sender's values.
    expected = np.ones(array_size, dtype=np.float32) * (src_rank + 1)
    for result in ray.get(refs):
        assert (result == expected).all()
def test_sendrecv_torch_numpy(ray_start_single_node, dst_rank, backend):
    """Send/recv between a numpy buffer (actor 0) and a torch buffer (actor 1)."""
    import torch

    num_workers = 2
    actors, _ = create_collective_workers(num_workers, backend=backend)
    # Actor 1 holds torch.ones * 2; actor 0 keeps its default numpy buffer.
    ray.wait([actors[1].set_buffer.remote(torch.ones(10,) * 2)])
    src_rank = 1 - dst_rank
    refs = []
    for rank, actor in enumerate(actors):
        if rank == dst_rank:
            refs.append(actor.do_recv.remote(src_rank=src_rank))
        else:
            refs.append(actor.do_send.remote(dst_rank=dst_rank))
    results = ray.get(refs)
    # If rank 0 is the receiver, both sides end up with 2s; otherwise both
    # sides end up with 1s (rank 1 received actor 0's default ones).
    scale = 2 if dst_rank == 0 else 1
    assert (results[0] == np.ones((10,)) * scale).all()
    assert (results[1] == torch.ones((10,)) * scale).all()
def test_broadcast_invalid_rank(ray_start_distributed_2_nodes, backend, src_rank=9):
    """Broadcasting from an out-of-range source rank must raise ValueError."""
    num_workers = 8
    actors, _ = create_collective_workers(num_workers, backend=backend)
    with pytest.raises(ValueError):
        ray.get([actor.do_broadcast.remote(src_rank=src_rank) for actor in actors])
def test_destroy_group(ray_start_single_node, backend):
    """Group destruction is local to each actor and the group can be re-created.

    Bug fix: the final check previously queried ``actors[0]`` a second time
    even though the variable was named ``actor1_is_init`` — it now verifies
    that ``actors[1]`` was also re-initialized.
    """
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    # Destroy the default group at actor0 only.
    ray.wait([actors[0].destroy_group.remote()])
    assert not ray.get(actors[0].report_is_group_initialized.remote())
    # Destroying a non-existent group ("random") is a no-op.
    ray.wait([actors[0].destroy_group.remote("random")])
    assert ray.get(actors[1].report_is_group_initialized.remote())
    ray.wait([actors[1].destroy_group.remote("random")])
    assert ray.get(actors[1].report_is_group_initialized.remote())
    # Destroying "default" at actor1 de-initializes it there as well.
    ray.wait([actors[1].destroy_group.remote("default")])
    assert not ray.get(actors[1].report_is_group_initialized.remote())
    # Now reconstruct the group using the same name.
    init_results = ray.get(
        [actor.init_group.remote(world_size, i) for i, actor in enumerate(actors)]
    )
    for i in range(world_size):
        assert init_results[i]
    assert ray.get(actors[0].report_is_group_initialized.remote())
    # Fixed: check actors[1], not actors[0] again.
    assert ray.get(actors[1].report_is_group_initialized.remote())
def test_init_two_actors(ray_start_distributed_2_nodes, world_size, group_name, backend):
    """Every worker should report successful group initialization."""
    actors, results = create_collective_workers(world_size, group_name, backend=backend)
    for rank in range(world_size):
        assert results[rank]
def test_init_two_actors(ray_start_single_node, group_name, backend):
    """Both workers should report successful group initialization."""
    num_workers = 2
    actors, results = create_collective_workers(num_workers, group_name, backend=backend)
    for rank in range(num_workers):
        assert results[rank]
def test_allreduce_different_op(ray_start_distributed_2_nodes, backend):
    """Allreduce with PRODUCT / MIN / MAX reduce ops.

    Improvement: the original only asserted on ranks 0 and 1 even though
    world_size is 8 — every rank's result is now verified, and the manual
    product loop is replaced with ``math.prod``.
    """
    import math

    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)

    def _fill_buffers():
        # Rank i holds a buffer of (i + 2)s, i.e. values 2..9 across ranks.
        ray.wait([
            a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
            for i, a in enumerate(actors)
        ])

    # check product: 2 * 3 * ... * 9
    _fill_buffers()
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.PRODUCT) for a in actors])
    product = math.prod(range(2, world_size + 2))
    for r in results:
        assert (r == np.ones((10,), dtype=np.float32) * product).all()

    # check min: smallest buffer value is 2
    _fill_buffers()
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MIN) for a in actors])
    for r in results:
        assert (r == np.ones((10,), dtype=np.float32) * 2).all()

    # check max: largest buffer value is 9
    _fill_buffers()
    results = ray.get([a.do_allreduce.remote(op=ReduceOp.MAX) for a in actors])
    for r in results:
        assert (r == np.ones((10,), dtype=np.float32) * (world_size + 1)).all()
def test_allreduce_torch_numpy(ray_start_single_node, backend):
    """Allreduce across mixed torch / numpy buffers in both arrangements."""
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    # Actor 1 holds a torch tensor, actor 0 its default numpy buffer.
    ray.wait([actors[1].set_buffer.remote(torch.ones(10,))])
    results = ray.get([actor.do_allreduce.remote() for actor in actors])
    assert (results[0] == np.ones((10,)) * world_size).all()
    # Swap the arrangement: actor 0 torch, actor 1 numpy; only verify the
    # mixed-backend allreduce completes without error.
    ray.wait([actors[0].set_buffer.remote(torch.ones(10,))])
    ray.wait([actors[1].set_buffer.remote(np.ones(10, dtype=np.float32))])
    ray.get([actor.do_allreduce.remote() for actor in actors])
def test_allreduce_destroy(ray_start_distributed_2_nodes, backend, group_name="default"):
    """Allreduce fails after the group is destroyed and works after re-init."""
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    results = ray.get([a.do_allreduce.remote() for a in actors])
    expected = np.ones((10,), dtype=np.float32) * world_size
    assert (results[0] == expected).all()
    assert (results[1] == expected).all()
    # Once the group is destroyed, collective calls must raise.
    ray.get([a.destroy_group.remote() for a in actors])
    with pytest.raises(RuntimeError):
        ray.get([a.do_allreduce.remote() for a in actors])
    # Re-init the same group name; buffers already hold world_size, so the
    # next allreduce yields world_size**2.
    ray.get([
        actor.init_group.remote(world_size, rank, backend, group_name)
        for rank, actor in enumerate(actors)
    ])
    results = ray.get([a.do_allreduce.remote() for a in actors])
    expected = np.ones((10,), dtype=np.float32) * world_size * world_size
    assert (results[0] == expected).all()
    assert (results[1] == expected).all()
def test_allreduce_different_name(ray_start_single_node, group_name, backend):
    """Allreduce works under non-default group names."""
    num_workers = 2
    actors, _ = create_collective_workers(
        num_workers=num_workers, group_name=group_name, backend=backend
    )
    results = ray.get([a.do_allreduce.remote(group_name) for a in actors])
    expected = np.ones((10,), dtype=np.float32) * num_workers
    for rank in range(num_workers):
        assert (results[rank] == expected).all()
def test_reducescatter_different_dtype(ray_start_single_node, dtype, backend):
    """Reducescatter sums the all-ones buffers for every supported dtype.

    Fix: removed a leftover inner loop over an unused index ``j`` (a
    copy-paste artifact from the allgather tests) that merely re-ran each
    assertion world_size times.
    """
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    expected = np.ones(10, dtype=dtype) * world_size
    for i in range(world_size):
        assert (results[i] == expected).all()
def test_allgather_different_dtype(ray_start_distributed_2_nodes, dtype, backend):
    """Allgather preserves each sender's values for every supported dtype."""
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(actors, dtype=dtype)
    gathered = ray.get([a.do_allgather.remote() for a in actors])
    # Every receiver's slot j must hold sender j's buffer of (j + 1)s.
    for receiver in range(world_size):
        for sender in range(world_size):
            expected = np.ones(10, dtype=dtype) * (sender + 1)
            assert (gathered[receiver][sender] == expected).all()
def test_unmatched_tensor_list_length(ray_start_distributed_2_nodes, length, backend):
    """Allgather requires the output list length to equal world_size."""
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    buffers = [np.ones(10, dtype=np.float32) for _ in range(length)]
    ray.wait([a.set_list_buffer.remote(buffers, copy=True) for a in actors])
    if length == world_size:
        # Matching length: the collective must succeed.
        ray.get([a.do_allgather.remote() for a in actors])
    else:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
def test_allgather_torch_numpy(ray_start_single_node, backend):
    """Allgather with mixed torch / numpy buffers and output lists.

    Runs three phases on a 2-worker group: (1) torch input tensor with a
    numpy output list, (2) numpy input tensor with a torch output list,
    (3) an output list that alternates torch and numpy entries. In every
    phase, rank i contributes a buffer of (i + 1)s and each receiver's
    slot j must hold sender j's values, preserving each slot's backend.
    """
    world_size = 2
    shape = [10, 10]
    actors, _ = create_collective_workers(world_size, backend=backend)

    # tensor is pytorch, list is numpy
    for i, a in enumerate(actors):
        t = torch.ones(shape, dtype=torch.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            np.ones(shape, dtype=np.float32) for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer, copy=True)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            # Slot j on every receiver holds sender j's (j + 1)s.
            assert (results[i][j] == np.ones(shape, dtype=np.float32) * (j + 1)).all()

    # tensor is numpy, list is pytorch
    for i, a in enumerate(actors):
        t = np.ones(shape, dtype=np.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = [
            torch.ones(shape, dtype=torch.float32) for _ in range(world_size)
        ]
        ray.wait([a.set_list_buffer.remote(list_buffer, copy=True)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            assert (results[i][j] == torch.ones(shape, dtype=torch.float32) * (j + 1)).all()

    # some tensors in the list are pytorch, some are numpy
    for i, a in enumerate(actors):
        t = np.ones(shape, dtype=np.float32) * (i + 1)
        ray.wait([a.set_buffer.remote(t)])
        list_buffer = []
        for j in range(world_size):
            if j % 2 == 0:
                list_buffer.append(torch.ones(shape, dtype=torch.float32))
            else:
                list_buffer.append(np.ones(shape, dtype=np.float32))
        ray.wait([a.set_list_buffer.remote(list_buffer, copy=True)])
    results = ray.get([a.do_allgather.remote() for a in actors])
    for i in range(world_size):
        for j in range(world_size):
            # Each slot keeps the backend it was pre-allocated with.
            if j % 2 == 0:
                assert (results[i][j] == torch.ones(shape, dtype=torch.float32) * (j + 1)).all()
            else:
                assert (results[i][j] == np.ones(shape, dtype=np.float32) * (j + 1)).all()
def test_unmatched_tensor_shape(ray_start_single_node, shape, backend):
    """Allgather output tensors must match the input buffer's shape (10)."""
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(actors, array_size=10)
    buffers = [np.ones(shape, dtype=np.float32) for _ in range(world_size)]
    ray.get([a.set_list_buffer.remote(buffers, copy=True) for a in actors])
    if shape == 10:
        # Matching shape: the collective must succeed.
        ray.get([a.do_allgather.remote() for a in actors])
    else:
        with pytest.raises(RuntimeError):
            ray.get([a.do_allgather.remote() for a in actors])
def test_allreduce_different_array_size(
    ray_start_distributed_2_nodes, array_size, backend
):
    """Allreduce of all-ones buffers over varying array sizes.

    Improvement: the original asserted only on ranks 0 and 1 of eight
    workers — the result is now verified on every rank.
    """
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait(
        [a.set_buffer.remote(np.ones(array_size, dtype=np.float32)) for a in actors]
    )
    results = ray.get([a.do_allreduce.remote() for a in actors])
    expected = np.ones((array_size,), dtype=np.float32) * world_size
    for rank in range(world_size):
        assert (results[rank] == expected).all()
def test_reduce_torch_numpy(ray_start_single_node, dst_rank, backend):
    """Reduce with a numpy buffer on rank 0 and a torch buffer on rank 1."""
    import torch

    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([actors[1].set_buffer.remote(torch.ones(10,))])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    # Only the destination rank holds the reduced (summed) value; the other
    # rank's buffer is untouched.
    numpy_scale = world_size if dst_rank == 0 else 1
    torch_scale = world_size if dst_rank == 1 else 1
    assert (results[0] == np.ones((10,)) * numpy_scale).all()
    assert (results[1] == torch.ones((10,)) * torch_scale).all()
def test_broadcast_different_array_size(ray_start_distributed_2_nodes, array_size,
                                        src_rank, backend):
    """Broadcast propagates the source rank's buffer to every rank."""
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    # Rank i starts with a buffer of (i + 2)s.
    ray.wait([
        actor.set_buffer.remote(np.ones(array_size, dtype=np.float32) * (rank + 2))
        for rank, actor in enumerate(actors)
    ])
    results = ray.get([a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    expected = np.ones((array_size,), dtype=np.float32) * (src_rank + 2)
    for rank in range(world_size):
        assert (results[rank] == expected).all()
def test_broadcast_torch_numpy(ray_start_distributed_2_nodes, src_rank, backend):
    """Broadcast between a numpy buffer (rank 0) and a torch buffer (rank 1)."""
    import torch

    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    # Rank 1 holds torch.ones * world_size; rank 0 keeps its default ones.
    ray.wait([actors[1].set_buffer.remote(torch.ones(10,) * world_size)])
    results = ray.get([a.do_broadcast.remote(src_rank=src_rank) for a in actors])
    # Broadcasting from rank 0 spreads ones; from any other rank, the
    # checked source value is world_size.
    scale = 1 if src_rank == 0 else world_size
    assert (results[0] == np.ones((10,)) * scale).all()
    assert (results[1] == torch.ones((10,)) * scale).all()
def test_is_group_initialized(ray_start_distributed_2_nodes, backend):
    """report_is_group_initialized is True for "default", False otherwise.

    Bug fix: the ``actor1_*`` checks previously queried ``actors[0]`` a
    second time; they now exercise ``actors[1]`` as the variable names
    intended.
    """
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    # check group is_init on the first actor
    assert ray.get(actors[0].report_is_group_initialized.remote())
    assert not ray.get(actors[0].report_is_group_initialized.remote("random"))
    assert not ray.get(actors[0].report_is_group_initialized.remote("123"))
    # Fixed: check the second actor, not actors[0] again.
    assert ray.get(actors[1].report_is_group_initialized.remote())
    assert not ray.get(actors[1].report_is_group_initialized.remote("456"))
def test_reducescatter_different_array_size(ray_start_single_node, array_size,
                                            tensor_backend, backend):
    """Reducescatter over varying sizes for both tensor backends."""
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    init_tensors_for_gather_scatter(
        actors, array_size=array_size, tensor_backend=tensor_backend)
    results = ray.get([a.do_reducescatter.remote() for a in actors])
    if tensor_backend == "numpy":
        expected = np.ones(array_size, dtype=np.float32) * world_size
    else:
        expected = torch.ones(array_size, dtype=torch.float32) * world_size
    for rank in range(world_size):
        assert (results[rank] == expected).all()
def test_reduce_different_name(ray_start_single_node, group_name, dst_rank, backend):
    """Reduce under non-default group names: only dst_rank holds the sum."""
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name, backend=backend)
    results = ray.get([a.do_reduce.remote(group_name, dst_rank) for a in actors])
    for rank in range(world_size):
        # Non-destination ranks keep their original all-ones buffer.
        scale = world_size if rank == dst_rank else 1
        assert (results[rank] == np.ones((10,), dtype=np.float32) * scale).all()
def test_reduce_different_op(ray_start_single_node, dst_rank, backend):
    """Reduce with PRODUCT / MIN / MAX; non-destination buffers are untouched."""
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)

    def _reset_buffers():
        # Rank i holds (i + 2)s, i.e. 2s on rank 0 and 3s on rank 1.
        ray.wait([
            a.set_buffer.remote(np.ones(10, dtype=np.float32) * (i + 2))
            for i, a in enumerate(actors)
        ])

    def _check(results, reduced_value):
        # dst_rank holds the reduced value; every other rank keeps (rank + 2).
        for rank in range(world_size):
            value = reduced_value if rank == dst_rank else rank + 2
            assert (results[rank] == np.ones((10,), dtype=np.float32) * value).all()

    # check product: 2 * 3 == 6
    _reset_buffers()
    _check(ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.PRODUCT) for a in actors
    ]), 6)

    # check min: min(2, 3) == 2
    _reset_buffers()
    _check(ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MIN) for a in actors
    ]), 2)

    # check max: max(2, 3) == 3
    _reset_buffers()
    _check(ray.get([
        a.do_reduce.remote(dst_rank=dst_rank, op=ReduceOp.MAX) for a in actors
    ]), 3)
def test_allreduce_multiple_group(ray_start_distributed_2_nodes, backend, num_groups=5):
    """Sequential allreduces across several groups compound multiplicatively."""
    world_size = 8
    actors, _ = create_collective_workers(world_size, backend=backend)
    # Create extra groups named "1".."4" alongside the default group.
    for gid in range(1, num_groups):
        ray.get([
            actor.init_group.remote(world_size, rank, backend, str(gid))
            for rank, actor in enumerate(actors)
        ])
    for i in range(num_groups):
        name = "default" if i == 0 else str(i)
        results = ray.get([a.do_allreduce.remote(name) for a in actors])
        # Each round multiplies the shared buffer by world_size again.
        expected = np.ones((10,), dtype=np.float32) * (world_size ** (i + 1))
        assert (results[0] == expected).all()
def test_broadcast_different_name(ray_start_single_node, group_name, src_rank, backend):
    """Broadcast works under non-default group names."""
    world_size = 2
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name, backend=backend)
    # Rank i starts with a buffer of (i + 2)s.
    ray.get([
        actor.set_buffer.remote(np.ones((10,), dtype=np.float32) * (rank + 2))
        for rank, actor in enumerate(actors)
    ])
    results = ray.get([
        a.do_broadcast.remote(group_name=group_name, src_rank=src_rank)
        for a in actors
    ])
    expected = np.ones((10,), dtype=np.float32) * (src_rank + 2)
    for rank in range(world_size):
        assert (results[rank] == expected).all()
def test_reduce_different_array_size(ray_start_single_node, array_size, dst_rank,
                                     backend):
    """Reduce over varying array sizes; only dst_rank holds the sum."""
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    ray.wait([
        a.set_buffer.remote(np.ones(array_size, dtype=np.float32)) for a in actors
    ])
    results = ray.get([a.do_reduce.remote(dst_rank=dst_rank) for a in actors])
    for rank in range(world_size):
        # Non-destination ranks keep their original all-ones buffer.
        scale = world_size if rank == dst_rank else 1
        assert (results[rank] == np.ones((array_size,), dtype=np.float32) * scale).all()
def test_get_rank(ray_start_single_node, backend):
    """Ranks are per-group: a second group can assign them in reverse order."""
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    assert ray.get(actors[0].report_rank.remote()) == 0
    assert ray.get(actors[1].report_rank.remote()) == 1
    # Create a second group with a different name and reversed rank order.
    new_group_name = "default2"
    ray.get([
        actor.init_group.remote(world_size, world_size - 1 - rank,
                                group_name=new_group_name, backend=backend)
        for rank, actor in enumerate(actors)
    ])
    assert ray.get(actors[0].report_rank.remote(new_group_name)) == 1
    assert ray.get(actors[1].report_rank.remote(new_group_name)) == 0
def test_sendrecv(ray_start_distributed_2_nodes, group_name, array_size, src_rank,
                  dst_rank, backend):
    """Send/recv between an arbitrary (src, dst) pair on an 8-worker group."""
    if src_rank == dst_rank:
        # Self-send is not meaningful; skip this parameter combination.
        return
    world_size = 8
    actors, _ = create_collective_workers(
        num_workers=world_size, group_name=group_name, backend=backend)
    # Rank i starts with a buffer of (i + 1)s.
    ray.get([
        actor.set_buffer.remote(np.ones(array_size, dtype=np.float32) * (rank + 1))
        for rank, actor in enumerate(actors)
    ])
    # Uninvolved ranks just report their buffers; src/dst run the transfer.
    refs = [actor.get_buffer.remote() for actor in actors]
    refs[src_rank] = actors[src_rank].do_send.remote(group_name, dst_rank)
    refs[dst_rank] = actors[dst_rank].do_recv.remote(group_name, src_rank)
    results = ray.get(refs)
    expected = np.ones(array_size, dtype=np.float32) * (src_rank + 1)
    assert (results[src_rank] == expected).all()
    assert (results[dst_rank] == expected).all()
    ray.get([a.destroy_group.remote(group_name) for a in actors])
def test_reduce_multiple_group(ray_start_single_node, dst_rank, backend, num_groups=5):
    """Successive reduces across groups accumulate only on dst_rank."""
    world_size = 2
    actors, _ = create_collective_workers(world_size, backend=backend)
    # Create extra groups named "1".."4" alongside the default group.
    for gid in range(1, num_groups):
        ray.get([
            actor.init_group.remote(world_size, rank, backend, str(gid))
            for rank, actor in enumerate(actors)
        ])
    for round_idx in range(num_groups):
        name = "default" if round_idx == 0 else str(round_idx)
        results = ray.get([
            a.do_reduce.remote(dst_rank=dst_rank, group_name=name) for a in actors
        ])
        for rank in range(world_size):
            # dst_rank has accumulated one more world_size-sum per round;
            # the other rank still holds its original ones.
            value = (round_idx + 2) if rank == dst_rank else 1
            assert (results[rank] == np.ones((10,), dtype=np.float32) * value).all()