Example #1
    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, value=rank)
        recv_ranks = set()

        for dst in range(0, dist.get_world_size()):
            if dst == rank:
                # Recv mode
                for src in range(0, dist.get_world_size()):
                    if src == rank:
                        continue
                    output_tensor = _build_tensor(10, value=-1)
                    sender = dist.recv(output_tensor)

                    # dist.recv with no src argument returns the rank of the
                    # sender; the sender filled its tensor with its own rank,
                    # so every element of the received tensor should equal it.
                    self.assertTrue(output_tensor.eq(sender).all())
                    recv_ranks.add(sender)
            else:
                # Send mode
                dist.send(tensor, dst)

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()
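
Every example here leans on a _build_tensor helper that the excerpts do not show. A minimal sketch consistent with how it is called above (a size, plus an optional fill value that defaults to the size itself) might look like this; the cubic shape is an assumption:

    def _build_tensor(size, value=None):
        # Default fill is the size itself, which is what test_send_recv
        # relies on when it compares against an expected tensor.
        if value is None:
            value = size
        return torch.full((size, size, size), float(value))
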
Example #2
    def test_get_rank_size_group(self):
        if dist.get_world_size() > 2:
            group = [1, 2]
        else:
            group = [0, 1]
        group_id = dist.new_group(group)
        if dist.get_rank() in group:
            self.assertEqual(dist.get_world_size(group_id), 2)
            self.assertIn(dist.get_rank(group_id), range(2))
        else:
            self.assertEqual(dist.get_world_size(group_id), -1)
            self.assertEqual(dist.get_rank(group_id), -1)
Example #3
    def test_destroy_group(self):
        if dist.get_world_size() > 2:
            group = [1, 2]
        else:
            group = [0, 1]
        group_id = dist.new_group(group)
        dist.destroy_process_group(group_id)
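
All of these tests assume the default process group has already been initialized. A minimal sketch of the setup they presuppose, assuming the gloo backend and an env:// rendezvous (MASTER_ADDR, MASTER_PORT, RANK, and WORLD_SIZE supplied by the launcher), could be:

    import torch.distributed as dist

    # Backend and rendezvous method are assumptions here; any backend that
    # supports the point-to-point and collective ops used in these tests works.
    dist.init_process_group(backend="gloo", init_method="env://")
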
Example #4
    def test_send_recv(self):
        rank = dist.get_rank()
        tensor = _build_tensor(rank + 1)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(src + 1, value=-1)
            expected_tensor = _build_tensor(src + 1)
            dist.recv(tensor, src)
            self.assertEqual(tensor, expected_tensor)

        self._barrier()
Example #5
    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, rank)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        recv_ranks = set()
        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(10, value=-1)
            sender = dist.recv(tensor)
            self.assertTrue(tensor.eq(sender).all())
            recv_ranks.add(sender)

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()
Example #6
    def test_send_recv(self):
        rank = dist.get_rank()
        tensor = _build_tensor(rank + 1)

        for src in range(0, dist.get_world_size()):
            if src == rank:
                # Send mode
                for dst in range(0, dist.get_world_size()):
                    if dst == rank:
                        continue
                    dist.send(tensor, dst)
            else:
                # Recv mode
                expected_tensor = _build_tensor(src + 1)
                output_tensor = _build_tensor(src + 1, value=-1)
                dist.recv(output_tensor, src)
                self.assertEqual(output_tensor, expected_tensor)

        self._barrier()
Example #7
        def allreduce_params():
            if self.needs_reduction:
                self.needs_reduction = False
                # Bucket parameters by tensor type so each bucket can be
                # coalesced into a single contiguous buffer.
                buckets = defaultdict(list)
                for param in self.module.parameters():
                    if param.requires_grad and param.grad is not None:
                        tp = type(param.data)
                        buckets[tp].append(param)

                # One all_reduce per bucket: flatten the grads, reduce across
                # processes, average, then copy the results back in place.
                for bucket in buckets.values():
                    grads = [param.grad.data for param in bucket]
                    coalesced = _flatten_dense_tensors(grads)
                    dist.all_reduce(coalesced)
                    coalesced /= dist.get_world_size()
                    for buf, synced in zip(
                            grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)
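
The coalescing helpers used above come from torch._utils (a private module, so this is version-dependent). A small self-contained sketch of what flatten and unflatten do, with a scalar multiply standing in for the all_reduce-and-average step:

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    grads = [torch.ones(2, 2), torch.ones(3)]
    flat = _flatten_dense_tensors(grads)   # one contiguous 1-D tensor, 7 elements
    flat *= 0.5                            # stands in for all_reduce + averaging
    for buf, synced in zip(grads, _unflatten_dense_tensors(flat, grads)):
        buf.copy_(synced)                  # write the averaged values back in place
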
Example #8
    def test_isend(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            requests = [
                dist.isend(_build_tensor(dest, 10), dest)
                for dest in range(1, world_size)
            ]
            for request in requests:
                request.wait()
                self.assertTrue(request.is_completed())
        else:
            tensor = _build_tensor(rank, -1)
            dist.recv(tensor, 0)
            self.assertEqual(tensor, _build_tensor(rank, 10))

        self._barrier()
Example #9
    def _init_multigpu_helper(self):
        """Multigpu tests are designed to simulate the multi nodes with multi
        GPUs on each node. Nccl backend requires equal #GPUs in each process.
        On a single node, all visible GPUs are evenly
        divided to subsets, each process only uses a subset.
        """
        nGPUs = torch.cuda.device_count()
        world_size = dist.get_world_size()
        visible_devices = range(nGPUs)

        if BACKEND == "nccl":
            apply_hack_for_nccl()

        nGPUs_per_process = nGPUs // world_size
        rank_to_GPU = {
            i: list(visible_devices[i * nGPUs_per_process:(i + 1) *
                                    nGPUs_per_process])
            for i in range(world_size)
        }
        return rank_to_GPU
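
Worked by hand from the slicing above, assuming 8 visible GPUs and a world size of 2: nGPUs_per_process is 8 // 2 = 4, so the helper returns

    # rank_to_GPU with nGPUs=8, world_size=2:
    # {0: [0, 1, 2, 3], 1: [4, 5, 6, 7]}
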
Example #10
    def test_irecv(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
            requests = [
                dist.irecv(expected_tensors[src - 1], src)
                for src in range(1, world_size)
            ]

            for src in range(1, world_size):
                requests[src - 1].wait()
                self.assertTrue(requests[src - 1].is_completed())
                self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
        else:
            tensor = _build_tensor(rank, 10)
            dist.send(tensor, 0)

        self._barrier()
Example #11
    def test_get_rank(self):
        test_dir = os.path.join(TEMP_DIR, "test_dir")
        pid = str(os.getpid())
        num_processes = dist.get_world_size()
        with open(os.path.join(test_dir, pid), "w") as f:
            f.write(str(dist.get_rank()))

        self._barrier()

        all_ranks = set()
        for f_name in os.listdir(test_dir):
            with open(os.path.join(test_dir, f_name), "r") as f:
                all_ranks.add(int(f.read()))
        self.assertEqual(len(all_ranks), num_processes)

        self._barrier()

        if dist.get_rank() == 0:
            for f_name in os.listdir(test_dir):
                os.unlink(os.path.join(test_dir, f_name))

        self._barrier()
Example #12
    def sync(cls, timeout=5):
        cls.barrier_id += 1
        barrier_dir = os.path.join(TEMP_DIR, "barrier")
        pid = str(os.getpid())
        barrier_file = os.path.join(barrier_dir, pid)
        with _lock():
            with open(barrier_file, "w") as f:
                f.write(str(cls.barrier_id))

        start_time = time.time()
        while True:
            arrived = 0
            with _lock():
                for f_name in os.listdir(barrier_dir):
                    with open(os.path.join(barrier_dir, f_name), "r") as f:
                        data = f.read()
                        if int(data) >= cls.barrier_id:
                            arrived += 1
            if arrived == dist.get_world_size():
                break

            if time.time() - start_time > timeout:
                raise RuntimeError("barrier timeout")
            time.sleep(0.1)
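
The _lock() context manager the barrier depends on is not shown. A plausible sketch using an fcntl file lock (the lock-file path is an assumption) that serializes the barrier-file reads and writes across processes on one machine:

    import fcntl
    import os
    from contextlib import contextmanager

    @contextmanager
    def _lock():
        lock_path = os.path.join(TEMP_DIR, "lockfile")  # hypothetical location
        with open(lock_path, "w") as lf:
            try:
                fcntl.lockf(lf, fcntl.LOCK_EX)
                yield
            finally:
                fcntl.lockf(lf, fcntl.LOCK_UN)
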
Example #13
    def _init_global_test(self):
        group = list(range(dist.get_world_size()))
        group_id = dist.group.WORLD
        rank = dist.get_rank()
        return (group, group_id, rank)
Example #14
    def test_get_rank_size_full_group(self):
        _, group_id, _ = self._init_full_group_test()
        self.assertEqual(dist.get_world_size(group_id), dist.get_world_size())
        self.assertEqual(dist.get_rank(group_id), dist.get_rank())
Example #15
    def _init_full_group_test(self):
        group = list(range(dist.get_world_size()))
        # new_group() with no ranks argument creates a group of all ranks.
        group_id = dist.new_group()
        rank = dist.get_rank()
        return (group, group_id, rank)