def __init__(self, module, device_ids=None, broadcast_buffers=True,
             compression=Compression.none):
    super(DistributedDataParallel, self).__init__()
    assert device_ids and len(device_ids) == 1, (
        "DistributedDataParallel device_ids must contain exactly one entry,"
        " but got {}.").format(device_ids)
    self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids))
    self.module = module
    self.broadcast_buffers = broadcast_buffers
    self.require_forward_param_sync = broadcast_buffers
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    self._num_grads = 1
    self.modules_buffers = [list(self.module.buffers())]
    self._compression = compression
    self._enable_async = False
    self._require_backward_grad_sync = True

    named_parameters = self.module.named_parameters()
    named_parameters = list(named_parameters)
    if len(named_parameters) > 0:
        if isinstance(named_parameters[0][1], torch.Tensor):
            if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                raise ValueError('named_parameters should consistently be a sequence of '
                                 'tuples (name, torch.Tensor)')
            self._is_tensor_instance = True
            # there is an issue when using torch.Tensor as key, so use its hash instead
            # https://github.com/pytorch/pytorch/issues/7733
            self._parameter_names = {v.__hash__(): k for k, v in sorted(named_parameters)}
            self._tensor_list = [tensor for name, tensor in named_parameters]
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: k for k, v in sorted(named_parameters)}
    else:
        self._is_tensor_instance = False
        self._parameter_names = {v: 'push_pull.noname.%s' % i
                                 for param_group in self.param_groups
                                 for i, v in enumerate(param_group['params'])}

    if size() > 1:
        self._register_hooks()
        named_params = self.module.named_parameters()
        self._num_grads = sum(p.requires_grad for _, p in named_params)
        byteps_torch_set_num_grads(self._num_grads)

    # declare tensors
    for name in sorted(self._parameter_names.values()):
        declare("Gradient." + name)
    # We use two loops for load-balancing
    for name in sorted(self._parameter_names.values()):
        declare("Parameter." + name)

    # broadcast model state
    module_states = list(self.module.state_dict().values())
    if len(module_states) > 0:
        bps.torch.broadcast_parameters(self.module.state_dict(), root_rank=0)
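# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original source). It shows how a
# model might be wrapped with the DistributedDataParallel class whose __init__
# appears above. It assumes the byteps.torch module exposes init() and
# local_rank() as documented in the BytePS README; the helper name below is
# made up for illustration.
def _example_wrap_model_with_ddp():
    import torch
    import byteps.torch as bps

    bps.init()                               # initialize the BytePS context
    torch.cuda.set_device(bps.local_rank())  # one GPU per process

    model = torch.nn.Linear(128, 10).cuda()
    # device_ids must contain exactly one entry, per the assertion above
    ddp_model = DistributedDataParallel(model, device_ids=[bps.local_rank()])
    return ddp_model
# ---------------------------------------------------------------------------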
def __init__(self, params, named_parameters, compression,
             backward_passes_per_step=1):
    super(self.__class__, self).__init__(params)
    self._compression = compression

    if named_parameters is not None:
        named_parameters = list(named_parameters)
    else:
        named_parameters = []

    # make sure that named_parameters are tuples
    if any([not isinstance(p, tuple) for p in named_parameters]):
        raise ValueError('named_parameters should be a sequence of '
                         'tuples (name, parameter), usually produced by '
                         'model.named_parameters().')

    dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters])
    if len(dups) > 0:
        raise ValueError('Parameter names in named_parameters must be unique. '
                         'Found duplicates: %s' % ', '.join(dups))

    if len(named_parameters) > 0:
        if isinstance(named_parameters[0][1], torch.Tensor):
            if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                raise ValueError('named_parameters should consistently be a sequence of '
                                 'tuples (name, torch.Tensor)')
            self._is_tensor_instance = True
            # there is an issue when using torch.Tensor as key, so use its hash instead
            # https://github.com/pytorch/pytorch/issues/7733
            self._parameter_names = {v.__hash__(): k for k, v in sorted(named_parameters)}
            self._tensor_list = [tensor for name, tensor in named_parameters]
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: k for k, v in sorted(named_parameters)}
    else:
        self._is_tensor_instance = False
        self._parameter_names = {v: 'push_pull.noname.%s' % i
                                 for param_group in self.param_groups
                                 for i, v in enumerate(param_group['params'])}

    self.backward_passes_per_step = backward_passes_per_step
    self._push_pull_delay = {v: self.backward_passes_per_step
                             for _, v in sorted(named_parameters)}
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    if size() > 1:
        self._register_hooks()

    # declare tensors
    for name in sorted(self._parameter_names.values()):
        declare("Gradient." + name)
    # We use two loops for load-balancing
    for name in sorted(self._parameter_names.values()):
        declare("Parameter." + name)
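# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original source). It shows how the
# public optimizer wrapper built on this class is typically used, following the
# Horovod-style API described in the BytePS README (DistributedOptimizer,
# Compression, broadcast_parameters); treat those names as assumptions here,
# and the helper name below is made up for illustration.
def _example_wrap_optimizer():
    import torch
    import byteps.torch as bps

    bps.init()
    model = torch.nn.Linear(128, 10).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

    # named_parameters gives each tensor a stable name for push_pull;
    # duplicates or non-(name, tensor) tuples are rejected by __init__ above.
    optimizer = bps.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=bps.Compression.none)

    # make sure all workers start from the same model state
    bps.broadcast_parameters(model.state_dict(), root_rank=0)
    return model, optimizer
# ---------------------------------------------------------------------------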
def __init__(self, params, named_parameters, compression,
             backward_passes_per_step=1):
    super(self.__class__, self).__init__(params)
    self._compression = compression

    if named_parameters is not None:
        named_parameters = list(named_parameters)
    else:
        named_parameters = []

    self._sequential_keys = [k for k, v in named_parameters]
    self._named_parameters = {k: v for k, v in named_parameters}
    self._tensor_fusion_threshold = int(os.environ.get('BYTEPS_FUSION_THRESHOLD', '0'))  # in bytes
    self._enable_async = (int(os.getenv('BYTEPS_ENABLE_ASYNC', 0)) != 0)
    if self._enable_async:
        assert int(os.getenv('DMLC_NUM_WORKER')) > 1, \
            "Async is only valid for distributed training"
        print('BytePS: enable asynchronous training')

    # make sure that named_parameters are tuples
    if any([not isinstance(p, tuple) for p in named_parameters]):
        raise ValueError('named_parameters should be a sequence of '
                         'tuples (name, parameter), usually produced by '
                         'model.named_parameters().')

    dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters])
    if len(dups) > 0:
        raise ValueError('Parameter names in named_parameters must be unique. '
                         'Found duplicates: %s' % ', '.join(dups))

    if len(named_parameters) > 0:
        if isinstance(named_parameters[0][1], torch.Tensor):
            if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                raise ValueError('named_parameters should consistently be a sequence of '
                                 'tuples (name, torch.Tensor)')
            self._is_tensor_instance = True
            # there is an issue when using torch.Tensor as key, so use its hash instead
            # https://github.com/pytorch/pytorch/issues/7733
            self._parameter_names = {v.__hash__(): k for k, v in sorted(named_parameters)}
            self._tensor_list = [tensor for name, tensor in named_parameters]
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: k for k, v in sorted(named_parameters)}
    else:
        self._is_tensor_instance = False
        self._parameter_names = {v: 'push_pull.noname.%s' % i
                                 for param_group in self.param_groups
                                 for i, v in enumerate(param_group['params'])}

    self.backward_passes_per_step = backward_passes_per_step
    self._push_pull_delay = {v: self.backward_passes_per_step
                             for _, v in sorted(named_parameters)}
    self._handles = {}
    self._grad_accs = []
    self._requires_update = set()
    self._should_sync = True
    if self._tensor_fusion_threshold > 0:
        self._generate_merged_parameters()
    else:
        self._groups = []
    if size() > 1:
        self._register_hooks()

    # declare tensors
    for name in sorted(self._parameter_names.values()):
        declare("Gradient." + name)
    # We use two loops for load-balancing
    for name in sorted(self._parameter_names.values()):
        declare("Parameter." + name)
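# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original source). The actual grouping
# logic lives in _generate_merged_parameters(), which is not shown here; this
# standalone, hypothetical helper only demonstrates the general idea behind a
# fusion threshold: pack consecutive tensors into groups whose combined size
# stays within BYTEPS_FUSION_THRESHOLD bytes, so each group can be pushed and
# pulled as one merged tensor.
def _example_group_by_fusion_threshold(named_parameters, threshold_bytes):
    """Greedily pack (name, tensor) pairs into groups of at most
    `threshold_bytes` total size, preserving their sequential order."""
    groups, current, current_bytes = [], [], 0
    for name, tensor in named_parameters:
        nbytes = tensor.numel() * tensor.element_size()
        # start a new group once adding this tensor would exceed the threshold
        if current and current_bytes + nbytes > threshold_bytes:
            groups.append(current)
            current, current_bytes = [], 0
        current.append(name)
        current_bytes += nbytes
    if current:
        groups.append(current)
    return groups
# ---------------------------------------------------------------------------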