def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU):
    for src in group:
        expected_tensor = _build_tensor(src + 1)
        # One tensor per local GPU, filled with -1 ...
        tensors = [
            _build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank]
        ]
        # ... except on the source rank, where the first GPU holds the expected value.
        if rank == src:
            tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0])

        dist.broadcast_multigpu(tensors, src, group_id)
        for tensor in tensors:
            self.assertEqual(tensor, expected_tensor)
    self._barrier()
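The test above relies on a `_build_tensor` helper and a `self._barrier()` synchronization primitive from the surrounding test harness, neither of which is shown. A minimal sketch of `_build_tensor` consistent with how it is called here; its exact signature is an assumption:

import torch

def _build_tensor(size, value=None):
    # Assumed helper: a size x size x size float tensor filled with `value`,
    # defaulting to `size` itself, so that _build_tensor(src + 1) and the
    # broadcast results can be compared with assertEqual.
    if value is None:
        value = size
    return torch.empty(size, size, size).fill_(value)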
def broadcast_multigpu(self, tensor_list, src, async_op=False, src_tensor=0):
    # Thin wrapper that forwards to torch.distributed.broadcast_multigpu,
    # always using this object's process group.
    return dist.broadcast_multigpu(tensor_list, src, self.group, async_op, src_tensor)
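This method presumably lives on an object that stores a torch.distributed process group in `self.group`. A hedged usage sketch, where the `comm` object and the `local_gpus` list are hypothetical names:

# Hypothetical usage: one tensor per GPU owned by this process; on the source
# rank, tensor_list[src_tensor] is the tensor that gets broadcast.
tensors = [torch.zeros(8).cuda(device=gpu) for gpu in local_gpus]
comm.broadcast_multigpu(tensors, src=0)  # blocking call, since async_op=False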
def step(self, closure=None):
    """Performs a single optimization step.

    Arguments:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
    """
    loss = None
    if closure is not None:
        loss = closure()

    # Flatten all gradients into a single vector, all-reduce it across
    # workers, and average by the world size.
    grads = []
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grads.append(p.grad)
    flat_grads = torch.nn.utils.parameters_to_vector(grads)
    dist.all_reduce_multigpu([flat_grads])
    flat_grads /= self.world_size
    torch.nn.utils.vector_to_parameters(flat_grads, grads)

    # Apply the Adam-style update locally.
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            grad = p.grad.data
            state = self.state[p]

            # State initialization
            if len(state) == 0:
                state['step'] = 0
                # Exponential moving average of gradient values
                state['exp_avg'] = grad.new().resize_as_(grad).zero_()
                # Exponential moving average of squared gradient values
                state['exp_avg_sq'] = grad.new().resize_as_(grad).zero_()

            exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
            beta1, beta2 = group['betas']

            state['step'] += 1

            # Decay the first and second moment running average coefficient
            exp_avg.mul_(beta1).add_(1 - beta1, grad)
            exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)

            denom = exp_avg_sq.sqrt().add_(group['eps'])

            bias_correction1 = 1 - beta1 ** state['step']
            bias_correction2 = 1 - beta2 ** state['step']
            step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1

            if group['weight_decay'] != 0:
                p.data.add_(-group['weight_decay'], p.data)

            p.data.addcdiv_(-step_size, exp_avg, denom)

    # Broadcast the updated parameters from rank 0 so all workers stay in sync.
    params = []
    for group in self.param_groups:
        for p in group['params']:
            if p.grad is None:
                continue
            params.append(p)
    params_vec = torch.nn.utils.parameters_to_vector(params)
    dist.broadcast_multigpu([params_vec], 0)
    torch.nn.utils.vector_to_parameters(params_vec, params)

    return loss
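The step() above implies an Adam-style optimizer subclass that also stores `self.world_size`; the surrounding class is not shown. A hedged sketch of how such an optimizer might be driven, where the `DistributedAdam` name and the process-group setup are assumptions:

import torch
import torch.distributed as dist

# Assumed setup; the real class and launch script are not shown here.
dist.init_process_group(backend='nccl', init_method='env://')

model = torch.nn.Linear(10, 1).cuda()
optimizer = DistributedAdam(model.parameters(), lr=1e-3, betas=(0.9, 0.999),
                            eps=1e-8, weight_decay=0)  # hypothetical class name

loss = model(torch.randn(4, 10).cuda()).sum()
loss.backward()
optimizer.step()       # all-reduce grads, apply the update, broadcast params from rank 0
optimizer.zero_grad()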
dist.reduce_multigpu(
    tensor_list,
    dst=1,  # destination process rank
    op=dist.reduce_op.PRODUCT)
print('{} AFTER reduce_multigpu {}'.format(local_rank, tensor_list))
if local_rank == 0:
    assert_mean(tensor_list[0], 1.)
    assert_mean(tensor_list[1], 2.)
else:
    assert_mean(tensor_list[0], 24.)
    assert_mean(tensor_list[1], 4.)

# ---------------- BROADCAST -----------------
tensor_list = get_tensor_list()
dist.broadcast_multigpu(
    tensor_list,
    src=1,  # rank 1 tensor_list[0] broadcast to all
)
print('{} AFTER broadcast_multigpu {}'.format(local_rank, tensor_list))
if local_rank == 0:
    assert_mean(tensor_list[0], 3.)
    assert_mean(tensor_list[1], 3.)
else:
    assert_mean(tensor_list[0], 3.)
    assert_mean(tensor_list[1], 3.)

# ---------------- ALL_GATHER -----------------
# all_gather semantics is quite complicated:
# https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_gather_multigpu
tensor_list = get_tensor_list()
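The snippet breaks off before the ALL_GATHER call itself. A hedged sketch of what that step could look like, following the all_gather_multigpu semantics described at the linked documentation rather than the original script: each rank contributes len(tensor_list) tensors, and every GPU receives world_size * len(tensor_list) gathered results.

world_size = dist.get_world_size()
# One output list per local input tensor, allocated on that tensor's GPU.
output_lists = [
    [torch.empty_like(t) for _ in range(world_size * len(tensor_list))]
    for t in tensor_list
]
dist.all_gather_multigpu(output_lists, tensor_list)
print('{} AFTER all_gather_multigpu {}'.format(local_rank, output_lists))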