Example #1
    def __init__(self, module, device_ids=None,
            broadcast_buffers=True,
            compression=Compression.none
            ):
        super(DistributedDataParallel, self).__init__()

        assert device_ids and len(device_ids) == 1, (
                "DistributedDataParallel device_ids contain exactlyone entry,"
                " but got {}.").format(device_ids)
        self.device_ids = list(map(lambda x: _get_device_index(x, True), device_ids))
        self.module = module
        self.broadcast_buffers = broadcast_buffers
        self.require_forward_param_sync = broadcast_buffers
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        self._num_grads = 1

        self.modules_buffers = [list(self.module.buffers())]
        self._compression = compression
        self._enable_async = False
        self._require_backward_grad_sync = True
        named_parameters = self.module.named_parameters()
        named_parameters = list(named_parameters)
        if len(named_parameters) > 0:
            if isinstance(named_parameters[0][1], torch.Tensor):
                if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                    raise ValueError('named_parameters should consistently be a sequence of '
                                     'tuples (name, torch.Tensor)')
                self._is_tensor_instance = True
                # there is an issue when using torch.Tensor as key, so use its hash instead
                # https://github.com/pytorch/pytorch/issues/7733
                self._parameter_names = {v.__hash__(): k for k, v
                                         in sorted(named_parameters)}
                self._tensor_list = [tensor for name, tensor in named_parameters]
            else:
                self._is_tensor_instance = False
                self._parameter_names = {v: k for k, v
                                         in sorted(named_parameters)}
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: 'push_pull.noname.%s' % i
                                     for param_group in self.param_groups
                                     for i, v in enumerate(param_group['params'])}
        if size() > 1:
            self._register_hooks()
            named_params = self.module.named_parameters()
            self._num_grads = sum(p.requires_grad for _, p in named_params)
            byteps_torch_set_num_grads(self._num_grads)

        # declare tensors
        for name in sorted(self._parameter_names.values()):
            declare("Gradient."+name)
        # We use two loops for load-balancing
        for name in sorted(self._parameter_names.values()):
            declare("Parameter."+name)

        # broadcast model state
        module_states = list(self.module.state_dict().values())
        if len(module_states) > 0:
            bps.torch.broadcast_parameters(self.module.state_dict(), root_rank=0)
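On its own, this constructor registers gradient hooks, declares the named Gradient/Parameter tensors, and broadcasts the initial model state from rank 0. A minimal usage sketch follows; the import path byteps.torch.parallel and the bps.init()/bps.local_rank() entry points are assumptions about the surrounding BytePS API, not part of the example above.

# Minimal sketch, not taken from the example above: the wrapper's import
# path and the bps.init()/bps.local_rank() calls are assumed from the
# usual BytePS PyTorch API.
import torch
import byteps.torch as bps
from byteps.torch.parallel import DistributedDataParallel  # assumed import path

bps.init()
torch.cuda.set_device(bps.local_rank())

model = torch.nn.Linear(10, 2).cuda()
# device_ids must contain exactly one entry, as asserted in __init__ above.
model = DistributedDataParallel(model, device_ids=[bps.local_rank()])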
Example #2
    def __init__(self,
                 params,
                 named_parameters,
                 compression,
                 backward_passes_per_step=1):
        super(self.__class__, self).__init__(params)
        self._compression = compression

        if named_parameters is not None:
            named_parameters = list(named_parameters)
        else:
            named_parameters = []

        # make sure that named_parameters are tuples
        if any([not isinstance(p, tuple) for p in named_parameters]):
            raise ValueError('named_parameters should be a sequence of '
                             'tuples (name, parameter), usually produced by '
                             'model.named_parameters().')

        dups = _DistributedOptimizer.find_duplicates(
            [k for k, _ in named_parameters])
        if len(dups) > 0:
            raise ValueError(
                'Parameter names in named_parameters must be unique. '
                'Found duplicates: %s' % ', '.join(dups))

        if len(named_parameters) > 0:
            if isinstance(named_parameters[0][1], torch.Tensor):
                if any([
                        not isinstance(p, torch.Tensor)
                        for name, p in named_parameters
                ]):
                    raise ValueError(
                        'named_parameters should consistently be a sequence of '
                        'tuples (name, torch.Tensor)')
                self._is_tensor_instance = True
                # there is an issue when using torch.Tensor as key, so use its hash instead
                # https://github.com/pytorch/pytorch/issues/7733
                self._parameter_names = {
                    v.__hash__(): k
                    for k, v in sorted(named_parameters)
                }
                self._tensor_list = [
                    tensor for name, tensor in named_parameters
                ]
            else:
                self._is_tensor_instance = False
                self._parameter_names = {
                    v: k
                    for k, v in sorted(named_parameters)
                }
        else:
            self._is_tensor_instance = False
            self._parameter_names = {
                v: 'push_pull.noname.%s' % i
                for param_group in self.param_groups
                for i, v in enumerate(param_group['params'])
            }
        self.backward_passes_per_step = backward_passes_per_step
        self._push_pull_delay = {
            v: self.backward_passes_per_step
            for _, v in sorted(named_parameters)
        }
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        if size() > 1:
            self._register_hooks()

        # declare tensors
        for name in sorted(self._parameter_names.values()):
            declare("Gradient." + name)
        # We use two loops for load-balancing
        for name in sorted(self._parameter_names.values()):
            declare("Parameter." + name)
Example #3
    def __init__(self, params, named_parameters, compression,
                 backward_passes_per_step=1):
        super(self.__class__, self).__init__(params)
        self._compression = compression

        if named_parameters is not None:
            named_parameters = list(named_parameters)
        else:
            named_parameters = []
        self._sequential_keys = [k for k, v in named_parameters]
        self._named_parameters = {k: v for k, v
                                in named_parameters}

        self._tensor_fusion_threshold = int(os.environ.get('BYTEPS_FUSION_THRESHOLD', '0')) # in bytes
        self._enable_async = (int(os.getenv('BYTEPS_ENABLE_ASYNC', 0)) != 0)
        if self._enable_async:
            assert int(os.getenv('DMLC_NUM_WORKER')) > 1, \
                "Async is only valid for distributed training"
            print('BytePS: enable asynchronous training')

        # make sure that named_parameters are tuples
        if any([not isinstance(p, tuple) for p in named_parameters]):
            raise ValueError('named_parameters should be a sequence of '
                             'tuples (name, parameter), usually produced by '
                             'model.named_parameters().')

        dups = _DistributedOptimizer.find_duplicates([k for k, _ in named_parameters])
        if len(dups) > 0:
            raise ValueError('Parameter names in named_parameters must be unique. '
                             'Found duplicates: %s' % ', '.join(dups))

        if len(named_parameters) > 0:
            if isinstance(named_parameters[0][1], torch.Tensor):
                if any([not isinstance(p, torch.Tensor) for name, p in named_parameters]):
                    raise ValueError('named_parameters should consistently be a sequence of '
                                     'tuples (name, torch.Tensor)')
                self._is_tensor_instance = True
                # there is an issue when using torch.Tensor as key, so use its hash instead
                # https://github.com/pytorch/pytorch/issues/7733
                self._parameter_names = {v.__hash__(): k for k, v
                                         in sorted(named_parameters)}
                self._tensor_list = [tensor for name, tensor in named_parameters]
            else:
                self._is_tensor_instance = False
                self._parameter_names = {v: k for k, v
                                         in sorted(named_parameters)}
        else:
            self._is_tensor_instance = False
            self._parameter_names = {v: 'push_pull.noname.%s' % i
                                     for param_group in self.param_groups
                                     for i, v in enumerate(param_group['params'])}
        self.backward_passes_per_step = backward_passes_per_step
        self._push_pull_delay = {v: self.backward_passes_per_step
                                 for _, v in sorted(named_parameters)}
        self._handles = {}
        self._grad_accs = []
        self._requires_update = set()
        self._should_sync = True
        if self._tensor_fusion_threshold > 0:
            self._generate_merged_parameters()
        else:
            self._groups = []
        if size() > 1:
            self._register_hooks()

        # declare tensors
        for name in sorted(self._parameter_names.values()):
            declare("Gradient."+name)
        # We use two loops for load-balancing
        for name in sorted(self._parameter_names.values()):
            declare("Parameter."+name)