Code example #1
    def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU):
        for src in group:
            expected_tensor = _build_tensor(src + 1)
            tensors = [
                _build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank]
            ]
            if rank == src:
                tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0])

            dist.broadcast_multigpu(tensors, src, group_id)
            for tensor in tensors:
                self.assertEqual(tensor, expected_tensor)
        self._barrier()
Code example #2
File: test_distributed.py Project: xiongyw/pytorch
    def _test_broadcast_multigpu_helper(self, group, group_id,
                                        rank, rank_to_GPU):
        for src in group:
            expected_tensor = _build_tensor(src + 1)
            tensors = [_build_tensor(src + 1, -1).cuda(device=i)
                       for i in rank_to_GPU[rank]]
            if rank == src:
                tensors[0] = expected_tensor.cuda(
                    device=rank_to_GPU[rank][0])

            dist.broadcast_multigpu(tensors, src, group_id)
            for tensor in tensors:
                self.assertEqual(tensor, expected_tensor)
        self._barrier()
Code example #3
    def broadcast_multigpu(self,
                           tensor_list,
                           src,
                           async_op=False,
                           src_tensor=0):
        return dist.broadcast_multigpu(tensor_list, src, self.group, async_op,
                                       src_tensor)
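
The thin wrapper above forwards straight to torch.distributed.broadcast_multigpu(tensor_list, src, group, async_op, src_tensor). For readers who want to try the call outside a test harness, the following is a minimal sketch, not taken from any of the projects on this page. It assumes one process per node with several local GPUs, an NCCL backend, env:// initialization, and a launcher that sets RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT; the function name run_worker and the tensor shape are illustrative only. Note that the *_multigpu collectives were deprecated in later PyTorch releases, so this targets the same older API as the examples.

import torch
import torch.distributed as dist


def run_worker():
    # Hypothetical setup: RANK / WORLD_SIZE / MASTER_ADDR / MASTER_PORT are
    # expected to come from the launcher; not part of the collected examples.
    dist.init_process_group(backend='nccl', init_method='env://')
    rank = dist.get_rank()
    n_gpus = torch.cuda.device_count()

    # One tensor per local GPU. On the source rank, tensor_list[src_tensor]
    # (index 0 by default) holds the data to send; every other tensor in
    # every process is overwritten by the broadcast.
    tensor_list = [torch.full((4,), float(rank)).cuda(i) for i in range(n_gpus)]

    dist.broadcast_multigpu(tensor_list, src=0)

    # After the call, every tensor on every GPU of every rank should equal
    # rank 0's tensor_list[0].
    print(rank, [t.mean().item() for t in tensor_list])


if __name__ == '__main__':
    run_worker()
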
Code example #4
File: distributed_adamw.py Project: yumion/machina
    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        grads = []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grads.append(p.grad)
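
        # Average gradients across workers: flatten all local gradients into
        # a single vector, sum it with all_reduce_multigpu, divide by the
        # world size, then write the averaged values back into each .grad.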
        flat_grads = torch.nn.utils.parameters_to_vector(grads)
        dist.all_reduce_multigpu([flat_grads])
        flat_grads /= self.world_size
        torch.nn.utils.vector_to_parameters(flat_grads, grads)

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = grad.new().resize_as_(grad).zero_()
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = grad.new().resize_as_(grad).zero_()

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                state['step'] += 1

                # Decay the first and second moment running average coefficient
                exp_avg.mul_(beta1).add_(1 - beta1, grad)
                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

                denom = exp_avg_sq.sqrt().add_(group['eps'])

                bias_correction1 = 1 - beta1**state['step']
                bias_correction2 = 1 - beta2**state['step']
                step_size = group['lr'] * \
                    math.sqrt(bias_correction2) / bias_correction1

                if group['weight_decay'] != 0:
                    p.data.add_(-group['weight_decay'], p.data)

                p.data.addcdiv_(-step_size, exp_avg, denom)
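
        # Keep workers in sync: flatten the parameters, broadcast rank 0's
        # copy to every process with broadcast_multigpu, and write the result
        # back into the parameters.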
        params = []
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                params.append(p)
        params_vec = torch.nn.utils.parameters_to_vector(params)
        dist.broadcast_multigpu([params_vec], 0)
        torch.nn.utils.vector_to_parameters(params_vec, params)

        return loss
Code example #5
dist.reduce_multigpu(
    tensor_list,
    dst=1,  # destination process rank
    op=dist.reduce_op.PRODUCT)
print('{} AFTER reduce_multigpu {}'.format(local_rank, tensor_list))
if local_rank == 0:
    assert_mean(tensor_list[0], 1.)
    assert_mean(tensor_list[1], 2.)
else:
    assert_mean(tensor_list[0], 24.)
    assert_mean(tensor_list[1], 4.)

# ---------------- BROADCAST -----------------
tensor_list = get_tensor_list()

dist.broadcast_multigpu(
    tensor_list,
    src=1,  # rank 1's tensor_list[0] is broadcast to every GPU on every rank
)
print('{} AFTER broadcast_multigpu {}'.format(local_rank, tensor_list))
if local_rank == 0:
    assert_mean(tensor_list[0], 3.)
    assert_mean(tensor_list[1], 3.)
else:
    assert_mean(tensor_list[0], 3.)
    assert_mean(tensor_list[1], 3.)

# ---------------- ALL_GATHER -----------------
# all_gather semantics is quite complicated:
# https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_multigpu
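# Roughly (see the linked docs for the authoritative description): every
# process passes an input_tensor_list with one tensor per local GPU plus an
# output_tensor_lists of pre-allocated buffers, and after the call
# output_tensor_lists[i] (living on the GPU of input_tensor_list[i]) holds
# the gathered tensors from every GPU of every rank.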

tensor_list = get_tensor_list()
"""