def test_send_recv_any_source(self):
    """Check ``dist.recv`` when no source rank is given.

    Ranks take turns being the receiver.  While it is another rank's turn,
    this rank sends it the tensor built from its own rank; on its own turn it
    receives one message per peer and checks each against the sender rank
    returned by ``dist.recv``.
    """
    my_rank = dist.get_rank()
    world_size = dist.get_world_size()
    payload = _build_tensor(10, value=my_rank)
    seen_senders = set()

    for receiver in range(world_size):
        if receiver != my_rank:
            # Send mode: it is `receiver`'s turn to collect messages.
            dist.send(payload, receiver)
            continue

        # Recv mode: one message is expected from every other rank, in
        # whatever order they arrive.
        for _ in range(world_size - 1):
            buffer = _build_tensor(10, value=-1)
            sender = dist.recv(buffer)

            # `sender` is the rank reported by recv; every value in the
            # received tensor must be equal to it.
            self.assertTrue(buffer.eq(sender).all())
            seen_senders.add(sender)

    # Each peer must have been seen exactly once as a distinct sender.
    self.assertEqual(len(seen_senders), world_size - 1)
    self._barrier()
def test_get_rank_size_group(self):
    """Check group-relative rank and size for a two-member group.

    The group is ranks ``[1, 2]`` when the world has more than two ranks and
    ``[0, 1]`` otherwise.  Members must see a group size of 2 and a group
    rank of 0 or 1; non-members must see -1 for both queries.
    """
    members = [1, 2] if dist.get_world_size() > 2 else [0, 1]
    group_id = dist.new_group(members)

    if dist.get_rank() not in members:
        # Ranks outside the group get the -1 sentinel for both queries.
        self.assertEqual(dist.get_world_size(group_id), -1)
        self.assertEqual(dist.get_rank(group_id), -1)
        return

    self.assertEqual(dist.get_world_size(group_id), 2)
    self.assertTrue(dist.get_rank(group_id) in (0, 1))
def test_destroy_group(self):
    """Create a two-rank group and destroy it again.

    Uses ranks ``[1, 2]`` when the world has more than two ranks, otherwise
    ``[0, 1]``.  The test makes no assertions; it passes if neither call
    raises.
    """
    members = [1, 2] if dist.get_world_size() > 2 else [0, 1]
    dist.destroy_process_group(dist.new_group(members))
def test_send_recv(self):
    """Exchange tensors between every ordered pair of ranks with send/recv.

    Ranks take turns being the sender: in round ``src`` the rank equal to
    ``src`` sends its tensor to every other rank, while all other ranks post
    a matching ``recv`` from ``src`` and compare the result with the tensor
    that rank is expected to have built.

    The sends are deliberately *not* all issued up front.  ``dist.send`` is
    a blocking call, so if every rank first sends to all of its peers and
    only then starts receiving, no rank may ever reach a ``recv`` and the
    test can deadlock.  With the turn-based schedule every send has a
    matching receive already posted (or about to be posted) on the peer.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    tensor = _build_tensor(rank + 1)

    for src in range(world_size):
        if src == rank:
            # Send mode: our turn to deliver to every other rank.
            for dst in range(world_size):
                if dst != rank:
                    dist.send(tensor, dst)
        else:
            # Recv mode: the buffer starts out filled with -1 so a recv
            # that wrote nothing cannot pass the comparison by accident.
            expected_tensor = _build_tensor(src + 1)
            output_tensor = _build_tensor(src + 1, value=-1)
            dist.recv(output_tensor, src)
            self.assertEqual(output_tensor, expected_tensor)

    self._barrier()
def test_send_recv_any_source(self):
    """Check ``dist.recv`` when no source rank is given.

    Ranks take turns being the receiver: in round ``dst`` every other rank
    sends its rank-valued tensor to ``dst``, and ``dst`` receives one
    message per peer from any source, checking each against the sender rank
    returned by ``dist.recv``.

    The sends are deliberately *not* all issued before the receives.
    ``dist.send`` is a blocking call, so if every rank first sends to all of
    its peers and only then starts receiving, no rank may ever reach a
    ``recv`` and the test can deadlock.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    tensor = _build_tensor(10, value=rank)
    recv_ranks = set()

    for dst in range(world_size):
        if dst != rank:
            # Send mode: it is `dst`'s turn to collect messages.
            dist.send(tensor, dst)
            continue

        # Recv mode: one message is expected from every other rank, in
        # whatever order they arrive.
        for _ in range(world_size - 1):
            output_tensor = _build_tensor(10, value=-1)
            sender = dist.recv(output_tensor)

            # `sender` is the rank reported by recv; every value in the
            # received tensor must be equal to it.
            self.assertTrue(output_tensor.eq(sender).all())
            recv_ranks.add(sender)

    # Each peer must have been seen exactly once as a distinct sender.
    self.assertEqual(len(recv_ranks), world_size - 1)
    self._barrier()
def test_send_recv(self):
    """Exchange tensors between every ordered pair of ranks with send/recv.

    Ranks take turns being the sender.  On its own turn a rank sends its
    tensor to every other rank; on any other turn it receives from the
    current sender and compares the result with the tensor that sender is
    expected to have built.
    """
    my_rank = dist.get_rank()
    world_size = dist.get_world_size()
    outgoing = _build_tensor(my_rank + 1)

    for sender in range(world_size):
        if sender != my_rank:
            # Recv mode: the buffer is pre-filled with -1 so an untouched
            # buffer cannot compare equal to the expected tensor.
            expected = _build_tensor(sender + 1)
            received = _build_tensor(sender + 1, value=-1)
            dist.recv(received, sender)
            self.assertEqual(received, expected)
            continue

        # Send mode: deliver our tensor to each peer in rank order.
        for peer in range(world_size):
            if peer != my_rank:
                dist.send(outgoing, peer)

    self._barrier()
def allreduce_params():
    """All-reduce the module's gradients across processes, once per flag.

    Does nothing unless ``self.needs_reduction`` is set; the flag is cleared
    before any communication happens.  Gradients are grouped by the type of
    the parameter's data tensor, flattened into one buffer per group,
    all-reduced, divided by the world size, and copied back in place.
    """
    if not self.needs_reduction:
        return
    self.needs_reduction = False

    # Only tensors of the same type are flattened together.
    params_by_type = defaultdict(list)
    for p in self.module.parameters():
        if not p.requires_grad or p.grad is None:
            continue
        params_by_type[type(p.data)].append(p)

    for same_type_params in params_by_type.values():
        grad_tensors = [p.grad.data for p in same_type_params]
        flat = _flatten_dense_tensors(grad_tensors)
        dist.all_reduce(flat)
        flat /= dist.get_world_size()
        # Write the reduced values back into the original gradient storage.
        restored = _unflatten_dense_tensors(flat, grad_tensors)
        for grad, reduced in zip(grad_tensors, restored):
            grad.copy_(reduced)
def test_isend(self):
    """Check non-blocking sends from rank 0 to every other rank.

    Rank 0 starts one ``dist.isend`` per peer, then waits on each request
    and asserts it reports completion.  Every other rank does a blocking
    ``recv`` from rank 0 and compares the result with the tensor rank 0 is
    expected to have sent.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank != 0:
        # The buffer starts at -1 so an untouched buffer fails the check.
        received = _build_tensor(rank, -1)
        dist.recv(received, 0)
        self.assertEqual(received, _build_tensor(rank, 10))
    else:
        # Start every send before waiting on any of them.
        pending = []
        for dest in range(1, world_size):
            pending.append(dist.isend(_build_tensor(dest, 10), dest))

        for req in pending:
            req.wait()
            self.assertTrue(req.is_completed())

    self._barrier()
def _init_multigpu_helper(self):
    """Build the mapping from process rank to the GPU indices it may use.

    Multi-GPU tests simulate several nodes with several GPUs each.  The NCCL
    backend needs the same number of GPUs in every process, so on a single
    node the visible GPUs are split into equally sized, contiguous subsets
    and each process is given one subset.

    Returns a dict ``{rank: [gpu_index, ...]}`` with one entry per rank.
    """
    gpu_count = torch.cuda.device_count()
    num_procs = dist.get_world_size()
    device_ids = range(gpu_count)

    if BACKEND == "nccl":
        apply_hack_for_nccl()

    # Integer division: any remainder GPUs are left unassigned.
    per_proc = gpu_count // num_procs
    rank_to_GPU = {}
    for proc_rank in range(num_procs):
        first = proc_rank * per_proc
        rank_to_GPU[proc_rank] = list(device_ids[first:first + per_proc])
    return rank_to_GPU
def test_irecv(self):
    """Check non-blocking receives on rank 0 from every other rank.

    Rank 0 posts one ``dist.irecv`` per peer, then waits on each request,
    asserts it reports completion, and compares the buffer with the tensor
    that peer is expected to have sent.  Every other rank does a blocking
    ``send`` to rank 0.
    """
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    if rank != 0:
        dist.send(_build_tensor(rank, 10), 0)
    else:
        sources = range(1, world_size)
        # Buffers start at -1 so an untouched buffer fails the comparison.
        buffers = [_build_tensor(src, -1) for src in sources]
        # Post every receive before waiting on any of them.
        pending = [dist.irecv(buf, src) for buf, src in zip(buffers, sources)]

        for src, buf, req in zip(sources, buffers, pending):
            req.wait()
            self.assertTrue(req.is_completed())
            self.assertEqual(buf, _build_tensor(src, 10))

    self._barrier()
def test_get_rank(self):
    """Check that every process reports a distinct rank.

    Each process writes its rank to a file named after its PID in
    ``TEMP_DIR/test_dir``.  After a barrier, every process reads all files
    and asserts the number of distinct ranks equals the world size.  Rank 0
    then removes the files; barriers separate the write, read and cleanup
    phases.
    """
    rank_dir = os.path.join(TEMP_DIR, "test_dir")
    own_file = os.path.join(rank_dir, str(os.getpid()))
    expected_count = dist.get_world_size()

    with open(own_file, "w") as out:
        out.write(str(dist.get_rank()))

    # Wait until every process has written its file.
    self._barrier()

    seen_ranks = set()
    for entry in os.listdir(rank_dir):
        with open(os.path.join(rank_dir, entry), "r") as src:
            seen_ranks.add(int(src.read()))
    self.assertEqual(len(seen_ranks), expected_count)

    # Nobody may still be reading when the cleanup starts.
    self._barrier()

    if dist.get_rank() == 0:
        for entry in os.listdir(rank_dir):
            os.unlink(os.path.join(rank_dir, entry))

    self._barrier()
def sync(cls, timeout=5):
    """Block until every process has reached the current barrier.

    Increments ``cls.barrier_id`` and writes it to a file named after this
    process's PID in ``TEMP_DIR/barrier``.  The directory is then polled
    every 0.1 seconds until the number of files holding a value greater than
    or equal to ``cls.barrier_id`` matches the world size.  All file access
    happens while holding ``_lock()``.

    Args:
        timeout: Seconds to keep polling before giving up.

    Raises:
        RuntimeError: If not all processes arrived within ``timeout``.
    """
    cls.barrier_id += 1
    barrier_dir = os.path.join(TEMP_DIR, "barrier")
    own_marker = os.path.join(barrier_dir, str(os.getpid()))

    # Announce our arrival at this barrier.
    with _lock():
        with open(own_marker, "w") as marker:
            marker.write(str(cls.barrier_id))

    started = time.time()
    while True:
        checked_in = 0
        with _lock():
            for entry in os.listdir(barrier_dir):
                with open(os.path.join(barrier_dir, entry), "r") as marker:
                    # A larger value means that process is already at a
                    # later barrier, so it has passed this one too.
                    if int(marker.read()) >= cls.barrier_id:
                        checked_in += 1

        if checked_in == dist.get_world_size():
            return

        if time.time() - started > timeout:
            raise RuntimeError("barrier timeout")
        time.sleep(0.1)
def _init_global_test(self):
    """Return ``(ranks, group_id, rank)`` for a test over the world group.

    ``ranks`` lists every rank in the world, ``group_id`` is
    ``dist.group.WORLD`` and ``rank`` is this process's rank.
    """
    world_ranks = list(range(dist.get_world_size()))
    return (world_ranks, dist.group.WORLD, dist.get_rank())
def test_get_rank_size_full_group(self):
    """Check that the full group reports the same size and rank as the world.

    The group comes from ``_init_full_group_test``; its size and this
    process's rank within it must equal the group-less queries.
    """
    _members, group_id, _rank = self._init_full_group_test()

    size_in_group = dist.get_world_size(group_id)
    self.assertEqual(size_in_group, dist.get_world_size())

    rank_in_group = dist.get_rank(group_id)
    self.assertEqual(rank_in_group, dist.get_rank())
def _init_full_group_test(self):
    """Return ``(ranks, group_id, rank)`` for a test over a new full group.

    ``ranks`` lists every rank in the world, ``group_id`` is the result of
    calling ``dist.new_group()`` with no arguments, and ``rank`` is this
    process's rank.
    """
    all_ranks = list(range(dist.get_world_size()))
    return (all_ranks, dist.new_group(), dist.get_rank())