def test_reduce_add_coalesced(self):
    numel = 5
    num_bytes = numel * 8
    tensors = [
        torch.randn(numel).long().cuda(),
        torch.randn(numel).cuda(),
        torch.randn(numel).long().cuda(),
        torch.randn(numel).long().cuda(),
        torch.randn(numel * 2).int().cuda(),  # int is 2x shorter
        torch.randn(numel).cuda(),
    ]
    dup_tensors = [tensors, list(map(lambda t: t.cuda(1), tensors))]

    r_tensors = list(map(comm.reduce_add, zip(*dup_tensors)))
    for r, t in zip(r_tensors, tensors):
        self.assertEqual(r.get_device(), t.get_device())
        self.assertEqual(r, t * 2)
        self.assertIsInstance(r, type(t))

    rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=num_bytes * 5 // 2)
    self.assertEqual(r_tensors, rc_tensors)
    for r, rc in zip(r_tensors, rc_tensors):
        self.assertEqual(rc.get_device(), r.get_device())
        self.assertIsInstance(rc, type(r))
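For reference, a minimal sketch (not taken from any of the examples here) of the behaviour the test above checks: torch.cuda.comm.reduce_add_coalesced takes one list of tensors per device and returns their element-wise sums on the destination device, matching a per-tensor comm.reduce_add but with fewer transfers. It assumes at least two visible CUDA devices.

import torch
from torch.cuda import comm

# Illustrative only; requires >= 2 CUDA devices.
a = [torch.randn(5, device='cuda:0'), torch.randn(3, device='cuda:0')]
b = [t.to('cuda:1') for t in a]  # same values on the second device
summed = comm.reduce_add_coalesced([a, b], destination=0)
# summed[k] lives on cuda:0 and equals a[k] + b[k], i.e. 2 * a[k] here.
assert all(torch.allclose(s, 2 * t) for s, t in zip(summed, a))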
def backward(ctx, *inputs):
    inputs = [i.data for i in inputs]
    inputs = [inputs[i:i + ctx.num_inputs]
              for i in range(0, len(inputs), ctx.num_inputs)]
    results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
    outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
    return (None,) + tuple([Variable(t) for tensors in outputs for t in tensors])
def collect_gradients(self):
    if self.ngradev > 1:
        # Some parameters might not be used during the forward pass on some GPUs.
        # If so, use `p.data.new_zeros(p.data.size()) if p.grad is None else p.grad`
        # instead of `p.grad`. In most cases, keeping the plain `p.grad` is preferable,
        # since it surfaces an error that warns you when a parameter was missed in the
        # forward computation.
        grads = comm.reduce_add_coalesced(
            [[p.grad for p in filter_para_grad(net.parameters())]
             for net in self.nets[:self.ngradev]],
            self.output_device
        )  # if self.ngradev > 1 else [p.grad for p in filter_para_grad(self.nets[0].parameters())]
        for mp, grad in zip(filter_para_grad(self.module.parameters()), grads):
            mp.grad = grad
def hook(*unused):
    # reduce gradients across devices on a single machine
    if len(self.device_ids) > 1:

        # collect gradients from all copies
        all_grads = [[] for _ in range(len(self._module_copies))]
        for dev_idx, module in enumerate(self._module_copies):
            for p in module.parameters():
                if not p.requires_grad or p.grad is None:
                    continue
                all_grads[dev_idx].append(p.grad.data)

        # reduce grads
        reduced_grads = reduce_add_coalesced(
            all_grads, self.output_device,
            self.nccl_reduce_bucket_size)

        # update grads with reduced grads
        for grad, reduced in zip(all_grads[0], reduced_grads):
            grad.copy_(reduced)

        # clear the gradients and parameters across all replicas
        for module in self._module_copies[1:]:
            for param in module.parameters():
                if param.requires_grad:
                    param.grad = None
                    param.data.set_()
def collect_gradients(self):
    grads = comm.reduce_add_coalesced(
        [[p.data.new_zeros(p.data.size()) if p.grad is None else p.grad
          for p in net.parameters()]
         for net in self.nets],
        self.output_device)
    for mp, grad in zip(self.module.parameters(), grads):
        mp.grad = grad
def forward(ctx, destination, num_inputs, *grads):
    ctx.target_gpus = [
        grads[i].get_device() for i in range(0, len(grads), num_inputs)
    ]
    grads = [
        grads[i:i + num_inputs] for i in range(0, len(grads), num_inputs)
    ]
    return comm.reduce_add_coalesced(grads, destination)
def forward(ctx, num_inputs, *inputs):
    ctx.num_inputs = num_inputs
    ctx.target_gpus = [inputs[i].get_device()
                       for i in range(0, len(inputs), num_inputs)]
    inputs = [inputs[i:i + num_inputs]
              for i in range(0, len(inputs), num_inputs)]
    # sort before reduce sum
    inputs = sorted(inputs, key=lambda i: i[0].get_device())
    results = comm.reduce_add_coalesced(inputs, ctx.target_gpus[0])
    outputs = comm.broadcast_coalesced(results, ctx.target_gpus)
    return tuple([t for tensors in outputs for t in tensors])
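Several of the snippets here follow the same reduce-then-broadcast pattern: sum the per-device tensor groups onto one device, then broadcast the sums back to every device. A standalone sketch of that pattern using only the public comm helpers; the function name and argument layout are illustrative, not taken from any of the projects above.

import torch
from torch.cuda import comm

def all_reduce_sum_coalesced(per_device_tensors):
    # per_device_tensors: one list of tensors per GPU, aligned index-by-index
    # across devices (tensor k has the same shape/dtype on every device).
    devices = [group[0].get_device() for group in per_device_tensors]
    # Sum tensor k across all devices onto the first device...
    reduced = comm.reduce_add_coalesced(per_device_tensors, devices[0])
    # ...then copy the sums back out so every device holds the same result.
    return comm.broadcast_coalesced(reduced, devices)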
def collect_gradients_func(self, func):
    if self.ngradev > 1:
        grads = comm.reduce_add_coalesced(
            [[p.grad for p in filter_para_grad(func(net).parameters())]
             for net in self.nets[:self.ngradev]],
            self.output_device)
        for mp, grad in zip(
                filter_para_grad(func(self.module).parameters()), grads):
            mp.grad = grad
def _test_reduce_add_coalesced(self, tensors, buffer_size):
    dup_tensors = [tensors, list(map(lambda t: t.cuda(1), tensors))]

    r_tensors = list(map(comm.reduce_add, zip(*dup_tensors)))
    for r, t in zip(r_tensors, tensors):
        self.assertEqual(r.get_device(), t.get_device())
        self.assertEqual(r, t * 2)
        self.assertIsInstance(r, type(t))

    rc_tensors = comm.reduce_add_coalesced(dup_tensors, buffer_size=buffer_size)
    self.assertEqual(r_tensors, rc_tensors)
    for r, rc in zip(r_tensors, rc_tensors):
        self.assertEqual(rc.get_device(), r.get_device())
        self.assertIsInstance(rc, type(r))
def hook(*unused):
    # reduce gradients across devices on a single machine
    if len(self.device_ids) > 1:

        # collect gradients from all copies
        all_grads = [[] for _ in range(len(self._module_copies))]
        for dev_idx, module in enumerate(self._module_copies):
            for p in module.parameters():
                if not p.requires_grad or p.grad is None:
                    continue
                all_grads[dev_idx].append(p.grad.data)

        # reduce grads
        reduced_grads = reduce_add_coalesced(
            all_grads, self.output_device,
            self.nccl_reduce_bucket_size)

        # update grads with reduced grads
        for grad, reduced in zip(all_grads[0], reduced_grads):
            grad.copy_(reduced)

        # clear the gradients and parameters across all replicas
        for module in self._module_copies[1:]:
            for param in module.parameters():
                if param.requires_grad:
                    param.grad = None
                    with torch.no_grad():
                        param.set_()

    if self.nprocs_per_node > 1:
        grads = []
        for p in self.module.parameters():
            if not p.requires_grad or p.grad is None:
                continue
            p.grad.data.div_(
                self.nprocs_per_node_device.type(p.grad.data.dtype))
            grads.append(p.grad.data)

        communication_op = functools.partial(
            dist.all_reduce, group=self.local_node_group)
        communicate(grads, communication_op)

    # convert model back to ps-numerator
    self.ps_numerator()
def backward(self):
    group_params = [[(n, m[n]) for n, p in self.param_group
                     if m[n].grad is not None]
                    for m in self.named_params]
    grad_params = [g for g in group_params if len(g) > 0]
    assert all([len(g) == len(grad_params[0]) for g in grad_params]), \
        [len(g) for g in grad_params]

    grad = [[p.grad for n, p in g] for g in grad_params]
    reduced_grad = reduce_add_coalesced(grad, self.device)
    grads = dict([(n, g) for ((n, p), g) in zip(grad_params[0], reduced_grad)])

    l2norm = 0
    for n, p in self.param_group:
        if n in grads:
            p.grad = grads[n].float() if grads[n].dtype == torch.half else grads[n]
            l2norm += p.grad.norm().item() ** 2
        else:
            assert p.grad is None, n
    return l2norm
def backward(self, *grad_outputs):
    grad_outputs = [grad_outputs[i:i + self.num_inputs]
                    for i in range(0, len(grad_outputs), self.num_inputs)]
    return comm.reduce_add_coalesced(grad_outputs, self.input_device)
def forward(ctx, destination, num_inputs, *grads):
    ctx.target_gpus = [grads[i].get_device()
                       for i in range(0, len(grads), num_inputs)]
    grads = [grads[i:i + num_inputs]
             for i in range(0, len(grads), num_inputs)]
    return comm.reduce_add_coalesced(grads, destination)
def collect_gradients(self):
    if self.optm_splt is not None:
        if self.is_contiguous_parameters:
            for i, (net, device,) in enumerate(zip(self.nets, self.device_ids)):
                _dev_grads = [[para.grad for para in get_contiguous_parameters_m(_net, index=i)]
                              for _net in self.nets[:self.ngradev]]
                if i > 0:
                    _dev_grads.insert(
                        0,
                        _dev_grads.pop(i) if i < self.ngradev else
                        [para.grad for para in get_contiguous_parameters_m(net, index=i)])
                _dev_grads = comm.reduce_add_coalesced(_dev_grads, device)
                for mp, grad in zip(get_contiguous_parameters_m(net, index=i), _dev_grads):
                    mp.grad.copy_(grad)
        else:
            grads = [[para.grad for para in filter_para_grad(net.parameters())]
                     for net in self.nets[:self.ngradev]]
            for i, (net, device, (lind, rind,),) in enumerate(
                    zip(self.nets, self.device_ids, self.optm_splt)):
                _dev_grads = [gradu[lind:rind] for gradu in grads]
                if i > 0:
                    _dev_grads.insert(
                        0,
                        _dev_grads.pop(i) if i < self.ngradev else
                        [_pg.new_zeros(_pg.size(), device=device) for _pg in _dev_grads[0]])
                _dev_grads = comm.reduce_add_coalesced(_dev_grads, device)
                for mp, grad in zip(
                        range_parameter_iter(net, lind, rind, func=filter_para_grad_iter),
                        _dev_grads):
                    mp.grad = grad
    elif self.ngradev > 1:
        if self.is_contiguous_parameters:
            grads = comm.reduce_add_coalesced(
                [[para.grad for para in get_all_contiguous_parameters_m(net)]
                 for net in self.nets[:self.ngradev]],
                self.output_device)
            for mp, grad in zip(get_all_contiguous_parameters_m(self.module), grads):
                mp.grad.copy_(grad)
        else:
            # Some parameters might not be used during the forward pass on some GPUs.
            # If so, use `p.data.new_zeros(p.data.size()) if p.grad is None else p.grad`
            # instead of `p.grad`. In most cases, keeping the plain `p.grad` is preferable,
            # since it surfaces an error that warns you when a parameter was missed in the
            # forward computation.
            grads = comm.reduce_add_coalesced(
                [[para.grad for para in filter_para_grad(net.parameters())]
                 for net in self.nets[:self.ngradev]],
                self.output_device
            )  # if self.ngradev > 1 else [p.grad for p in filter_para_grad(self.nets[0].parameters())]
            for mp, grad in zip(filter_para_grad(self.module.parameters()), grads):
                mp.grad = grad