def broadcast_parameters(params, root_rank):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `model.state_dict()`,
    `model.named_parameters()`, or `model.parameters()`.

    Arguments:
        params: One of the following:
            - list of parameters to broadcast
            - dict of parameters to broadcast
        root_rank: The rank of the process from which parameters will be
            broadcast to all other processes.
    """
    if isinstance(params, dict):
        params = sorted(params.items())
    elif isinstance(params, list):
        # support both named_parameters() and regular parameters()
        params = [p if isinstance(p, tuple) else (None, p) for p in params]
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run synchronous broadcasts.
    for name, p in params:
        # Broadcast is implemented as push + pull in BytePS.
        # To make it a real broadcast, we set the non-root tensors all to 0.
        if rank() != root_rank:
            p.fill_(0)
        # Remember to disable averaging because we are doing a broadcast.
        if name:
            handle = byteps_push_pull(p, average=False, name="Parameter." + name)
        else:
            handle = byteps_push_pull(p, average=False)
        synchronize(handle)
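# Usage sketch for broadcast_parameters, assuming the standard byteps.torch
# entry points (init, local_rank, broadcast_parameters) and a script launched
# with the BytePS launcher; the model below is illustrative only.
import torch
import byteps.torch as bps

bps.init()
torch.cuda.set_device(bps.local_rank())

model = torch.nn.Linear(784, 10).cuda()

# Make every worker start from rank 0's initial weights: non-root ranks zero
# their tensors (see above), so the push-pulled sum equals rank 0's values.
bps.broadcast_parameters(model.state_dict(), root_rank=0)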
def _push_pull_grad_async(self, p):
    name = self._parameter_names.get(p)
    tensor = p.grad
    tensor_compressed, ctx = self._compression.compress(tensor)
    handle = byteps_push_pull(tensor_compressed, average=True,
                              name="Gradient." + name)
    return handle, ctx
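# self._compression above is assumed to follow the BytePS/Horovod-style
# Compression interface: compress() returns (compressed_tensor, ctx) and
# decompress(tensor, ctx) restores the original dtype. A minimal fp16
# compressor under that assumption (illustrative, not the library class):
import torch


class FP16CompressionSketch(object):
    @staticmethod
    def compress(tensor):
        # Remember the original dtype so decompress can cast back.
        ctx = tensor.dtype
        if tensor.dtype.is_floating_point:
            tensor = tensor.to(torch.float16)
        return tensor, ctx

    @staticmethod
    def decompress(tensor, ctx):
        dtype = ctx
        if dtype.is_floating_point:
            tensor = tensor.to(dtype)
        return tensor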
def step(self, closure=None):
    if self._enable_async:
        old_weight_map = {}
        # store the weights before the update
        for p, _ in self._handles.items():
            old_weight_map[p] = p.data.clone().detach()
        # update
        loss = super(self.__class__, self).step(closure)

        for p, (h, _) in self._handles.items():
            # get the diff for each weight (in-place)
            p.data.sub_(old_weight_map.get(p))
            if h is None:
                # create the handle now
                if self._is_tensor_instance:
                    name = self._parameter_names.get(p.__hash__())
                else:
                    name = self._parameter_names.get(p)
                handle = byteps_push_pull(p, average=False,
                                          name="AsyncParam." + name)
                _, ctx = self._compression.compress(p)
                self._handles[p] = (handle, ctx)
        self.synchronize()
        return loss
    else:
        self.synchronize()
        return super(self.__class__, self).step(closure)
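# Usage sketch for the optimizer wrapper that owns the step()/synchronize()
# methods above, assuming the usual byteps.torch.DistributedOptimizer API;
# the model, data, and hyper-parameters are illustrative only.
import torch
import torch.nn.functional as F
import byteps.torch as bps

bps.init()
torch.cuda.set_device(bps.local_rank())

model = torch.nn.Linear(784, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * bps.size())
optimizer = bps.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters(),
    compression=bps.Compression.none)

for _ in range(10):
    data = torch.randn(32, 784).cuda()
    target = torch.randint(0, 10, (32,)).cuda()
    optimizer.zero_grad()
    loss = F.cross_entropy(model(data), target)
    loss.backward()
    optimizer.step()  # synchronize() push-pulls gradients before the update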
def synchronize(self):
    # Previous per-parameter synchronization path, kept for reference:
    # missing_p = self._requires_update - set(self._handles.keys())
    # for p in missing_p:
    #     handle, ctx = self._push_pull_grad_async(p)
    #     self._handles[p] = (handle, ctx)
    #
    # for p, value in self._handles.items():
    #     handle, ctx = value
    #     if handle is None:
    #         handle, ctx = self._push_pull_grad_async(p)
    #         self._handles[p] = (handle, ctx)
    #
    # for p, (handle, _) in self._handles.items():
    #     output = synchronize(handle)
    #     self._push_pull_delay[p] = self.backward_passes_per_step
    #     if not self._enable_async:
    #         p.grad.set_(self._compression.decompress(output, ctx))

    # Push-pull the whole (flattened) gradient as a single tensor.
    # handle, ctx = self._push_pull_grad_async(self.whole_gradient)
    print("=======================================")
    print("Length of self.whole_gradient: %d" % len(self.whole_gradient))
    tensor_compressed, ctx = self._compression.compress(self.whole_gradient)
    handle = byteps_push_pull(tensor_compressed, average=True,
                              name="Whole.Gradient")
    output = synchronize(handle)
    output = self._compression.decompress(output, ctx)

    # for param_group in self.param_groups:
    #     for p in param_group['params']:
    #         if p.requires_grad:
    # Scatter the averaged flat gradient back into each parameter's .grad.
    for name, p in self._named_parameters.items():
        d_p = self._pull_from_buffer(name).view(p.data.shape)
        p.grad.set_(d_p)

    # Reset the merged-gradient buffer and its bookkeeping for the next step.
    self.whole_gradient = torch.tensor([]).cuda()
    self._merged_parameter_offsets = {}
    self._merged_parameter_index = {}
    self._handles.clear()
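# self.whole_gradient, self._merged_parameter_offsets and _pull_from_buffer
# are used in synchronize() above but not defined in this snippet. A plausible
# sketch of the flatten/scatter bookkeeping they imply (hypothetical helper
# names; the real implementation may differ):
import torch


class FlatGradientBufferSketch(object):
    def __init__(self, named_parameters):
        self._named_parameters = dict(named_parameters)
        self.whole_gradient = torch.tensor([]).cuda()
        self._merged_parameter_offsets = {}

    def _append_to_buffer(self, name, grad):
        # Record where this parameter's gradient starts inside the flat
        # buffer, then concatenate its flattened gradient onto the end.
        self._merged_parameter_offsets[name] = self.whole_gradient.numel()
        self.whole_gradient = torch.cat(
            [self.whole_gradient, grad.detach().reshape(-1)])

    def _pull_from_buffer(self, name):
        # Slice this parameter's averaged gradient back out of the flat buffer.
        start = self._merged_parameter_offsets[name]
        numel = self._named_parameters[name].numel()
        return self.whole_gradient.narrow(0, start, numel)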
def _push_pull_grad_async(self, p):
    if self._is_tensor_instance:
        name = self._parameter_names.get(p.__hash__())
    else:
        name = self._parameter_names.get(p)
    if self._enable_async:
        tensor = p
        _, ctx = self._compression.compress(tensor)
        handle = byteps_push_pull(p, average=False,
                                  name="AsyncParam." + name)
    else:
        tensor = p.grad
        tensor_compressed, ctx = self._compression.compress(tensor)
        handle = byteps_push_pull(tensor_compressed, average=True,
                                  name="Gradient." + name)
    return handle, ctx
def _push_pull_grad_async(self, p):
    if self._is_tensor_instance:
        name = self._parameter_names.get(p.__hash__())
    else:
        name = self._parameter_names.get(p)
    if self._enable_async:
        # the real handle will be created in step()
        handle, ctx = None, None
    else:
        tensor = p.grad
        tensor_compressed, ctx = self._compression.compress(tensor)
        handle = byteps_push_pull(tensor_compressed, average=True,
                                  name="Gradient." + name)
    return handle, ctx
def _push_pull_grad_async(self, p):
    if self._is_tensor_instance:
        name = self._parameter_names.get(p.__hash__())
        fp16_p = self._fp32_to_fp16_map.get(p.__hash__())
    else:
        name = self._parameter_names.get(p)
        fp16_p = self._fp32_to_fp16_map.get(p)
    tensor = fp16_p.grad
    tensor_compressed, ctx = self._compression.compress(tensor)
    # Assign priorities in the order gradients first become ready (i.e. the
    # backward-pass order), so BytePS can schedule their communication.
    if fp16_p not in self.priorities:
        self.priorities[fp16_p] = self.gradient_count
        self.gradient_count += 1
    handle = byteps_push_pull(tensor_compressed, average=False,
                              name="Gradient." + name,
                              priority=self.priorities[fp16_p])
    return handle, ctx
def _push_pull_grad_async(self, p): """Call byteps API to push-pull gradient asynchronously Arguments: tensor: The tensor to push-pull. name: The name of the tensor. Returns: an push-pull handle and context """ name = self._parameter_names.get(p) tensor = p.grad tensor_compressed, ctx = self._compression.compress(tensor) self._locks[p].acquire() handle = byteps_push_pull(tensor_compressed, average=True, name="Gradient."+name) self._logger.debug("{} calls byteps_push_pull for {}".format(self._desc, self._parameter_names[p])) # Add to queue to poll completion self._event_queue.put((p, handle, ctx)) return handle, ctx