Example #1
    def _test_broadcast_helper(
        self, group, group_id, rank, cuda=False, rank_to_GPU=None
    ):
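        # Broadcast a distinctive value from each rank in turn and verify that
        # every other rank receives a tensor with matching size and contents.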
        for ttype, value, requires_cuda in [
            ("torch.FloatTensor", -1e-10, False),
            ("torch.DoubleTensor", -1e-100, False),
            ("torch.HalfTensor", -0.1, True),
            ("torch.CharTensor", -2, False),
            ("torch.ByteTensor", 129, False),
            ("torch.IntTensor", -1e5, False),
            ("torch.LongTensor", -1e15, False),
        ]:
            if requires_cuda and not cuda:
                continue
            for src in group:
                expected_tensor = _build_tensor(src + 1, value).type(ttype)
                if cuda:
                    expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0])
                if rank == src:
                    dist.broadcast(expected_tensor, src, group_id)
                else:
                    tensor = _build_tensor(src + 1, -1).type(ttype)
                    if cuda:
                        tensor = tensor.cuda(rank_to_GPU[rank][0])
                    dist.broadcast(tensor, src, group_id)
                    self.assertEqual(tensor.size(), expected_tensor.size())
                    self.assertEqual(tensor.ne(expected_tensor).max(), 0)

        self._barrier()
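The `_build_tensor` helper used above is defined elsewhere in the test module; a minimal sketch consistent with how it is called here is shown below. The cube shape is an assumption based on how `size` and `value` are used:

import torch

def _build_tensor(size, value=None):
    # Assumed helper: a size x size x size tensor filled with `value`
    # (or with `size` itself when no value is given).
    if value is None:
        value = size
    return torch.FloatTensor(size, size, size).fill_(value)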
Example #2
def init_param(model, src, group):
    # Broadcast every parameter tensor from rank `src` so that all ranks in
    # `group` start from the same weights.
    for param in model.parameters():
        dist.broadcast(param.data, src=src, group=group)
Example #3
    def _dist_broadcast_coalesced(self, tensors, buffer_size):
        """
        Broadcast a sequence of tensors to the default group from rank 0.
        Small tensors are first coalesced into a buffer to reduce the number
        of broadcasts.

        Arguments:
            tensors (sequence): tensors to broadcast. All tensors need to be
                on the same GPU.
            buffer_size (int): maximum size of the buffer used for coalescing.
        """
        for chunk in _take_tensors(tensors, buffer_size):
            # Flatten each chunk into a single contiguous buffer, broadcast it
            # in one call, then copy the synced values back into the originals.
            flat_tensors = _flatten_dense_tensors(chunk)
            dist.broadcast(flat_tensors, 0)
            for tensor, synced in zip(chunk,
                                      _unflatten_dense_tensors(flat_tensors, chunk)):
                tensor.copy_(synced)
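For context, a hedged sketch of how a DistributedDataParallel-style wrapper might call this helper right after construction; the method name, the 10 MB bucket size, and the use of `module.buffers()` are illustrative assumptions (same imports as the example above):

    def _sync_module_states(self, buffer_size=10 * 1024 * 1024):
        # Push every parameter and buffer through the coalesced broadcast so
        # that all ranks start from rank 0's copy of the module state.
        states = [p.data for p in self.module.parameters()]
        states += [b.data for b in self.module.buffers()]
        if states:
            self._dist_broadcast_coalesced(states, buffer_size)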
Example #4
    def _test_barrier_helper(self, group, group_id, rank):
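        # Rank `dest` broadcasts a timestamp slightly in the future, sleeps past
        # it, then enters the barrier; the other ranks check that the barrier
        # released them only after that timestamp, i.e. that it actually waited.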
        WAIT_TIME = 0.3  # seconds

        for dest in group:
            expected_time = torch.DoubleTensor(1).fill_(0.0)
            if dest == rank:
                expected_time.fill_(time.time() + WAIT_TIME)
                dist.broadcast(expected_time, dest, group_id)
                time.sleep(WAIT_TIME + 0.1)  # sleep a little bit longer
                dist.barrier(group_id)
            else:
                dist.broadcast(expected_time, dest, group_id)
                dist.barrier(group_id)
                self.assertGreaterEqual(time.time(), expected_time[0])

        self._barrier()
Example #5
def run():
    # Rank-0 / parameter-server style loop: broadcast the current weights to
    # all workers, then average the updated weights they send back via reduce.
    modell = model.CNN()
    # modell = model.AlexNet()

    size = dist.get_world_size()
    rank = dist.get_rank()

    # Group containing every rank.
    group = dist.new_group(list(range(size)))

    while True:

        # Send the current parameters from rank 0 to every worker.
        for param in modell.parameters():
            dist.broadcast(param.data, src=0, group=group)

        # Sum the workers' updated parameters on rank 0 and average them;
        # rank 0 contributes zeros, hence the division by (size - 1).
        for param in modell.parameters():
            tensor_temp = torch.zeros_like(param.data)
            dist.reduce(tensor_temp, dst=0, op=dist.ReduceOp.SUM, group=group)
            param.data = tensor_temp / (size - 1)
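The example above covers only the aggregating rank; a hedged sketch of a matching worker-side loop is shown below. The `train_step` callable and the function name are illustrative assumptions, not part of the original code:

import torch.distributed as dist

def run_worker(modell, group, train_step):
    # Counterpart to the rank-0 loop above: receive the broadcast weights,
    # run a local update, then send the new weights back to rank 0 to be
    # averaged.
    while True:
        for param in modell.parameters():
            dist.broadcast(param.data, src=0, group=group)

        train_step(modell)  # hypothetical local training step

        for param in modell.parameters():
            # Rank 0 contributes a zero tensor to this reduce, so the sum on
            # rank 0 contains only the workers' parameters.
            dist.reduce(param.data, dst=0, op=dist.ReduceOp.SUM, group=group)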
Example #6
def init_param(model, src, group):
    for param in model.parameters():
        dist.broadcast(param.data, src=src, group=group)
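A hedged end-to-end sketch of how such a helper is typically driven; the backend, the environment-variable based rendezvous, and the stand-in model below are illustrative assumptions rather than part of the example:

import torch.distributed as dist
import torch.nn as nn

def main():
    # Assumes MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE are set in the
    # environment, e.g. by a launcher such as torchrun.
    dist.init_process_group(backend="gloo")
    group = dist.new_group(list(range(dist.get_world_size())))

    model = nn.Linear(10, 10)  # stand-in for the real model
    init_param(model, src=0, group=group)  # every rank now has rank 0's weights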
Example #7
def get_new_model(model, group):
    # Overwrite the local parameters with rank 0's copy and return the model.
    for param in model.parameters():
        dist.broadcast(param.data, src=0, group=group)
    return model
Example #8
    def sync_parameters(self):
        # Make every replica's parameters identical to rank 0's.
        for param in self.module.parameters():
            dist.broadcast(param.data, 0)