def __init__(self, model, byteps_opt, num_steps=10**6):
    """Construct a new ScheduledOptimizer, which uses byteps optimizer under
    the hood for averaging gradients across all workers.

    Args:
        model: The training model. ByteScheduler uses the model object to
            register hooks.
        byteps_opt: Optimizer to use for averaging gradients and applying
            updates.
        num_steps: The maximum number of training steps. ByteScheduler needs
            to know when to stop cross-iteration scheduling.
    """
    self._model = model
    self._opt = byteps_opt
    self._logger = logging.getLogger("ByteScheduler")
    self._logger.debug("byteps size {}, rank {}".format(size(), rank()))
    self._desc = "rank {}".format(rank())

    # Track training steps so we know when the final step is reached.
    self._step = 0
    self._final_step = num_steps

    # One lock per parameter; used to block the forward propagation of each
    # parameter until its update is ready.
    self._locks = {}
    for param_group in self.param_groups:
        for p in param_group['params']:
            self._locks[p] = threading.Lock()

    # Hooks (and therefore scheduling) are only needed for distributed runs.
    if size() > 1:
        self._register_forward_hooks()
        self._register_hooks()

    # Background thread that polls whether the tensor push-pull is finished.
    # It consumes events from _event_queue and is joined at the final step
    # (see step()), which enqueues a (None, None, None) sentinel to stop it.
    self._event_queue = queue.Queue()
    self._poller = threading.Thread(target=self._poll, args=())
    self._poller.start()
def step(self, closure=None):
    """Override the default step function.

    In distributed mode (size() > 1) this synchronizes scheduled push-pulls
    before evaluating the optional *closure*; at the final training step it
    additionally drains the event queue and shuts down the poller thread.

    Args:
        closure: Optional callable that re-evaluates the model and returns
            the loss (standard torch.optim.Optimizer.step contract).

    Returns:
        The loss from *closure* in the distributed branch, otherwise None.
    """
    self._logger.debug("{} calls step() {}".format(self._desc, self._step))

    # Step 0 is called for parameter initialization after parameter broadcast
    if size() > 1 and self._step > 0:
        self._synchronize()

        # if it is the final training step, wait for the completion of all tensors
        if self._step == self._final_step:
            self._logger.debug(
                "final step {}, waiting for push-pull completion.".format(
                    self._final_step))
            # Busy-wait until all queued push-pull events have been handled,
            # then send the (None, None, None) sentinel so the poller thread
            # exits its loop and can be joined.
            while not self._event_queue.empty():
                time.sleep(0.001)
            self._event_queue.put((None, None, None))
            self._poller.join()
            self._logger.info("training finished!")
        loss = None
        if closure is not None:
            loss = closure()
        self._step += 1
        return loss
    else:
        # Optimizer.step() will be triggered when user calls
        # byteps.broadcast_optimizer_state().
        # NOTE(review): this branch ignores `closure` and returns None —
        # confirm that is intended for the single-worker / step-0 path.
        super(self._opt.__class__, self._opt).step()
        self._step += 1
def zero_grad(self):
    """Override the default zero_grad function.

    Clears the gradients of all optimized tensors, except in distributed
    mode after step 0, where gradient clearing is skipped.
    """
    self._logger.debug("{} calls zero_grad() of step {}".format(self._desc, self._step))
    # In a distributed run past initialization, leave gradients untouched.
    if not (size() > 1 and self._step > 0):
        self._opt.zero_grad()
def step(self, closure=None, wait_for_finish=True):
    """Perform a single optimization step.

    In distributed mode, synchronizes any outstanding gradients (optionally
    waiting for all of them) before evaluating *closure*; otherwise defers
    to the parent optimizer's step().

    Args:
        closure: Optional callable re-evaluating the model, returning loss.
        wait_for_finish: When True, block until all gradients have arrived.

    Returns:
        The loss from *closure* in the distributed branch, otherwise None.
    """
    if size() <= 1:
        # Single worker: plain optimizer behavior.
        super(self.__class__, self).step()
        return None

    self._sync_missing_gradients()
    if wait_for_finish:
        self._wait_for_all()
    return closure() if closure is not None else None
def _try_to_synchronize(self, p):
    """Attempt a non-blocking completion of the push-pull for parameter *p*.

    If the handle has finished, decompresses the result into the fp16 shadow
    gradient, mirrors it into the fp32 master gradient, rescales, applies the
    optimizer update for this single parameter, and copies the updated master
    weights back to the fp16 parameter.

    Returns:
        True if the push-pull was complete and the update was applied,
        False if it is still in flight (caller should retry later).
    """
    handle, ctx = self._handles[p]
    if poll(handle):
        output = synchronize(handle)
        # Reset the delay counter for gradient accumulation.
        self._push_pull_delay[p] = self.backward_passes_per_step
        # Tensors may be keyed by hash (see __init__) due to
        # https://github.com/pytorch/pytorch/issues/7733
        if self._is_tensor_instance:
            fp16_p = self._fp32_to_fp16_map.get(p.__hash__())
        else:
            fp16_p = self._fp32_to_fp16_map.get(p)
        fp16_p.grad.set_(self._compression.decompress(output, ctx))
        # fp16 -> fp32 master gradient, then undo loss scaling and average
        # across workers. Order matters: rescale AFTER the copy.
        p.grad.data.copy_(fp16_p.grad.data)
        p.grad.data = p.grad.data / (self.loss_scale * size())
        self._step_one_param(p)
        # Write the updated fp32 master weights back to the fp16 parameter.
        fp16_p.data.copy_(p.data)
        self._handles.pop(p)
        return True
    else:
        return False
def __init__(self, params, named_parameters, compression, backward_passes_per_step=1):
    """Set up distributed-optimizer bookkeeping.

    Validates *named_parameters*, builds the tensor-to-name mapping and the
    per-tensor push-pull delay counters, and registers gradient hooks when
    running with more than one worker.
    """
    super(self.__class__, self).__init__(params)
    self._compression = compression

    named_parameters = [] if named_parameters is None else list(named_parameters)

    # Every entry must be a (name, parameter) tuple.
    if not all(isinstance(entry, tuple) for entry in named_parameters):
        raise ValueError('named_parameters should be a sequence of '
                         'tuples (name, parameter), usually produced by '
                         'model.named_parameters().')

    duplicates = _DistributedOptimizer.find_duplicates(
        [name for name, _ in named_parameters])
    if len(duplicates) > 0:
        raise ValueError(
            'Parameter names in named_parameters must be unique. '
            'Found duplicates: %s' % ', '.join(duplicates))

    if named_parameters:
        self._parameter_names = {
            tensor: name for name, tensor in sorted(named_parameters)
        }
    else:
        # No names supplied: fall back to positional placeholder names.
        self._parameter_names = {
            tensor: 'push_pull.noname.%s' % index
            for group in self.param_groups
            for index, tensor in enumerate(group['params'])
        }

    self.backward_passes_per_step = backward_passes_per_step
    # Countdown before a tensor's push-pull is issued (gradient accumulation).
    self._push_pull_delay = {
        tensor: self.backward_passes_per_step
        for _, tensor in sorted(named_parameters)
    }
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    if size() > 1:
        self._register_hooks()
def __init__(self, params, named_parameters, compression, backward_passes_per_step=1):
    """Set up distributed-optimizer state for mixed tensor/parameter inputs.

    Validates *named_parameters*, builds a tensor(or hash)-to-name mapping,
    initializes per-tensor push-pull delay counters, registers gradient hooks
    for multi-worker runs, and declares Gradient/Parameter tensors to BytePS.
    """
    super(self.__class__, self).__init__(params)
    self._compression = compression
    if named_parameters is not None:
        named_parameters = list(named_parameters)
    else:
        named_parameters = []
    # make sure that named_parameters are tuples
    if any([not isinstance(p, tuple) for p in named_parameters]):
        raise ValueError('named_parameters should be a sequence of '
                         'tuples (name, parameter), usually produced by '
                         'model.named_parameters().')
    dups = _DistributedOptimizer.find_duplicates(
        [k for k, _ in named_parameters])
    if len(dups) > 0:
        raise ValueError(
            'Parameter names in named_parameters must be unique. '
            'Found duplicates: %s' % ', '.join(dups))
    if len(named_parameters) > 0:
        if isinstance(named_parameters[0][1], torch.Tensor):
            # Mixed tuple kinds are rejected: all values must be Tensors.
            if any([
                not isinstance(p, torch.Tensor)
                for name, p in named_parameters
            ]):
                raise ValueError(
                    'named_parameters should consistently be a sequence of '
                    'tuples (name, torch.Tensor)')
            self._is_tensor_instance = True
            # there is an issue when using torch.Tensor as key, so use its hash instead
            # https://github.com/pytorch/pytorch/issues/7733
            self._parameter_names = {
                v.__hash__(): k for k, v in sorted(named_parameters)
            }
            self._tensor_list = [
                tensor for name, tensor in named_parameters
            ]
        else:
            self._is_tensor_instance = False
            self._parameter_names = {
                v: k for k, v in sorted(named_parameters)
            }
    else:
        self._is_tensor_instance = False
        # No names supplied: fall back to positional placeholder names.
        self._parameter_names = {
            v: 'push_pull.noname.%s' % i
            for param_group in self.param_groups
            for i, v in enumerate(param_group['params'])
        }
    self.backward_passes_per_step = backward_passes_per_step
    # Countdown before a tensor's push-pull is issued (gradient accumulation).
    self._push_pull_delay = {
        v: self.backward_passes_per_step
        for _, v in sorted(named_parameters)
    }
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    if size() > 1:
        self._register_hooks()
    # declare tensors
    for name in sorted(self._parameter_names.values()):
        declare("Gradient." + name)
    # We use two loops for load-balancing
    for name in sorted(self._parameter_names.values()):
        declare("Parameter." + name)
def __init__(self, module, device_ids=None, broadcast_buffers=True,
             compression=Compression.none):
    """Wrap *module* for distributed data-parallel training with BytePS.

    Args:
        module: The model to wrap; its parameters and buffers are tracked.
        device_ids: Must contain exactly one device entry.
        broadcast_buffers: Whether to sync module buffers before forward.
        compression: Gradient compression scheme used by the hooks.
    """
    super(DistributedDataParallel, self).__init__()
    # Fix: message previously read "exactlyone" (missing space).
    assert device_ids and len(device_ids) == 1, (
        "DistributedDataParallel device_ids contain exactly one entry,"
        " but got {}.").format(device_ids)
    self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids))
    self.module = module
    self.broadcast_buffers = broadcast_buffers
    self.require_forward_param_sync = broadcast_buffers
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    self._num_grads = 1
    self.modules_buffers = [list(self.module.buffers())]
    self._compression = compression
    self._enable_async = False
    self._require_backward_grad_sync = True

    named_parameters = list(self.module.named_parameters())
    if len(named_parameters) > 0:
        if isinstance(named_parameters[0][1], torch.Tensor):
            # Mixed tuple kinds are rejected: all values must be Tensors.
            if any([not isinstance(p, torch.Tensor)
                    for name, p in named_parameters]):
                raise ValueError('named_parameters should consistently be a sequence of '
                                 'tuples (name, torch.Tensor)')
            self._is_tensor_instance = True
            # there is an issue when using torch.Tensor as key, so use its hash instead
            # https://github.com/pytorch/pytorch/issues/7733
            self._parameter_names = {v.__hash__(): k
                                     for k, v in sorted(named_parameters)}
            self._tensor_list = [tensor for name, tensor in named_parameters]
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: k for k, v in sorted(named_parameters)}
    else:
        self._is_tensor_instance = False
        # NOTE(review): nn.Module has no `param_groups` attribute; this
        # branch (module with zero parameters) looks unreachable without an
        # AttributeError — confirm against the class hierarchy.
        self._parameter_names = {v: 'push_pull.noname.%s' % i
                                 for param_group in self.param_groups
                                 for i, v in enumerate(param_group['params'])}

    if size() > 1:
        self._register_hooks()
        # Tell BytePS how many gradients to expect per backward pass.
        named_params = self.module.named_parameters()
        self._num_grads = sum(p.requires_grad for _, p in named_params)
        byteps_torch_set_num_grads(self._num_grads)

    # declare tensors
    for name in sorted(self._parameter_names.values()):
        declare("Gradient." + name)
    # We use two loops for load-balancing
    for name in sorted(self._parameter_names.values()):
        declare("Parameter." + name)

    # broadcast model state from rank 0 so all workers start identical
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        bps.torch.broadcast_parameters(self.module.state_dict(), root_rank=0)
def __init__(self, params, named_parameters, compression, backward_passes_per_step=1):
    """Set up distributed-optimizer state with optional tensor fusion.

    Reads BYTEPS_FUSION_THRESHOLD (bytes) and BYTEPS_ENABLE_ASYNC from the
    environment, validates *named_parameters*, builds name mappings and delay
    counters, optionally generates merged (fused) parameters, registers
    gradient hooks for multi-worker runs, and declares tensors to BytePS.
    """
    super(self.__class__, self).__init__(params)
    self._compression = compression
    if named_parameters is not None:
        named_parameters = list(named_parameters)
    else:
        named_parameters = []
    # Preserve the original parameter order and a name->tensor lookup.
    self._sequential_keys = [k for k, v in named_parameters]
    self._named_parameters = {k: v for k, v in named_parameters}
    self._tensor_fusion_threshold = int(os.environ.get('BYTEPS_FUSION_THRESHOLD', '0'))  # in bytes
    self._enable_async = (int(os.getenv('BYTEPS_ENABLE_ASYNC', 0)) != 0)
    if self._enable_async:
        assert int(os.getenv('DMLC_NUM_WORKER')) > 1, \
            "Async is only valid for distributed training"
        print('BytePS: enable asynchronous training')
    # make sure that named_parameters are tuples
    if any([not isinstance(p, tuple) for p in named_parameters]):
        raise ValueError('named_parameters should be a sequence of '
                         'tuples (name, parameter), usually produced by '
                         'model.named_parameters().')
    dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters])
    if len(dups) > 0:
        raise ValueError('Parameter names in named_parameters must be unique. '
                         'Found duplicates: %s' % ', '.join(dups))
    if len(named_parameters) > 0:
        if isinstance(named_parameters[0][1], torch.Tensor):
            # Mixed tuple kinds are rejected: all values must be Tensors.
            if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                raise ValueError('named_parameters should consistently be a sequence of '
                                 'tuples (name, torch.Tensor)')
            self._is_tensor_instance = True
            # there is an issue when using torch.Tensor as key, so use its hash instead
            # https://github.com/pytorch/pytorch/issues/7733
            self._parameter_names = {v.__hash__(): k for k, v in sorted(named_parameters)}
            self._tensor_list = [tensor for name, tensor in named_parameters]
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: k for k, v in sorted(named_parameters)}
    else:
        self._is_tensor_instance = False
        # No names supplied: fall back to positional placeholder names.
        self._parameter_names = {v: 'push_pull.noname.%s' % i
                                 for param_group in self.param_groups
                                 for i, v in enumerate(param_group['params'])}
    self.backward_passes_per_step = backward_passes_per_step
    # Countdown before a tensor's push-pull is issued (gradient accumulation).
    self._push_pull_delay = {v: self.backward_passes_per_step
                             for _, v in sorted(named_parameters)}
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    self._should_sync = True
    # Tensor fusion: merge small tensors up to the byte threshold.
    if self._tensor_fusion_threshold > 0:
        self._generate_merged_parameters()
    else:
        self._groups = []
    if size() > 1:
        self._register_hooks()
    # declare tensors
    for name in sorted(self._parameter_names.values()):
        declare("Gradient." + name)
    # We use two loops for load-balancing
    for name in sorted(self._parameter_names.values()):
        declare("Parameter." + name)