Code Example #1
File: __init__.py Project: ChrisQiqiang/allocation
    def __init__(self, params, optimizer, optimizer_params=None, root_rank=0):
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn(
                "DistributedTrainer does not take DistributedOptimizer "
                "as its optimizer. We have unwrapped it for you.")

        param_list = []
        if isinstance(params, mx.gluon.ParameterDict):
            for key in sorted(list(params.keys())):
                param_list.append(params[key])

        super(DistributedTrainer,
              self).__init__(param_list,
                             optimizer,
                             optimizer_params=optimizer_params,
                             kvstore=None)

        # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
        # function. Normalizing it by BytePS size, which is equivalent to performing
        # average in push_pull, has better performance.
        self._scale /= size()
        self.root_rank = root_rank
        for i, param in enumerate(self._params):
            byteps_declare_tensor("parameter_" + str(i))
            if param.grad_req != 'null':
                byteps_declare_tensor("gradient_" + str(i))
Code Example #2
File: __init__.py Project: zzmcdc/byteps
    def _do_push_pull(self, index, grad):
        if isinstance(index, (tuple, list)):
            for i in range(len(index)):
                byteps_declare_tensor(grad[i], "gradient_" + str(index[i]))
                byteps_push_pull(grad[i], version=0, priority=-index[i],
                                 name="gradient_" + str(index[i]),
                                 is_average=True)
        else:
            byteps_declare_tensor(grad, "gradient_" + str(index))
            byteps_push_pull(grad, version=0, priority=-index,
                             name="gradient_" + str(index), is_average=True)
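For reference, the push_pull primitive used above can also be driven directly on an NDArray. A sketch, assuming byteps_declare_tensor and byteps_push_pull are importable from byteps.mxnet.ops (the import path is an assumption) and using the declare signature of this snippet (tensor first, then name; other snippets on this page use a name-only variant):

import mxnet as mx
import byteps.mxnet as bps
from byteps.mxnet.ops import byteps_declare_tensor, byteps_push_pull

bps.init()
grad = mx.nd.ones((2, 2)) * bps.rank()

# A tensor is declared once under a stable name before its first push_pull;
# is_average=True divides the summed result by the number of workers.
byteps_declare_tensor(grad, "gradient_demo")
byteps_push_pull(grad, version=0, priority=0, name="gradient_demo",
                 is_average=True)
grad.wait_to_read()  # now holds the mean of all workers' tensors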
Code Example #3
File: __init__.py Project: zmxdream/byteps
    def _allreduce_grads(self):
        for i, param in enumerate(self._params):
            if param.grad_req != 'null':
                byteps_declare_tensor(param.list_grad()[0],
                                      "gradient_" + str(i))
                byteps_push_pull(param.list_grad()[0],
                                 is_average=False,
                                 name="gradient_" + str(i),
                                 priority=-i)
Code Example #4
File: __init__.py Project: KqSMea8/byteps
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()` or the
    `Block.collect_params()`.
    Arguments:
        params: One of the following:
            - dict of parameters to broadcast
            - ParameterDict to broadcast
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
    """
    tensors = []
    if isinstance(params, dict):
        tensors = [p for _, p in sorted(params.items())]
    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        for _, p in sorted(params.items()):
            try:
                tensors.append(p.data())
            except mx.gluon.parameter.DeferredInitializationError:
                # Inject wrapper method with post-initialization broadcast to
                # handle parameters with deferred initialization
                global parameter_index
                byteps_declare_tensor(p.data(),
                                      "parameter_" + str(parameter_index))
                new_init = _append_broadcast_init(p, root_rank,
                                                  parameter_index)
                parameter_index += 1
                p._init_impl = types.MethodType(new_init, p)
    else:
        raise ValueError('invalid params of type: %s' % type(params))

    # Run tensor initialization
    for i in range(len(tensors)):
        byteps_declare_tensor(tensors[i], "parameter_" + str(parameter_index))
        # Broadcast is implemented as push + pull in BytePS
        # To broadcast: we should zero-out all non-root tensors, and disable push_pull average
        if rank() != root_rank:
            tensors[i].__imul__(0)
        byteps_push_pull(tensors[i],
                         version=0,
                         priority=0,
                         name="parameter_" + str(parameter_index),
                         is_average=False)
        parameter_index += 1

    # Make sure tensors pushed to MXNet engine get processed such that all
    # workers are synced before starting training.
    for tensor in tensors:
        tensor.wait_to_read()
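A usage sketch for the Module API mentioned in the docstring, assuming byteps.mxnet is importable as bps and that bps.init() has been called; the Module construction below is only illustrative:

import mxnet as mx
import byteps.mxnet as bps

bps.init()

# Build and initialize a small Module purely for illustration.
data = mx.sym.Variable('data')
fc = mx.sym.FullyConnected(data, num_hidden=10)
model = mx.mod.Module(symbol=fc, data_names=['data'], label_names=None)
model.bind(data_shapes=[('data', (32, 100))])
model.init_params()

# get_params() returns two plain dicts of NDArrays; each can be broadcast.
arg_params, aux_params = model.get_params()
bps.broadcast_parameters(arg_params, root_rank=0)
bps.broadcast_parameters(aux_params, root_rank=0)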
Code Example #5
File: __init__.py Project: ChrisQiqiang/allocation
    def _do_push_pull_param(self, index, delta_weight):
        if isinstance(index, (tuple, list)):
            for i in range(len(index)):
                byteps_declare_tensor("weight_" + str(index[i]))
                byteps_push_pull(delta_weight[i],
                                 version=0,
                                 priority=-index[i],
                                 name="weight_" + str(index[i]),
                                 is_average=False)
        else:
            byteps_declare_tensor("weight_" + str(index))
            byteps_push_pull(delta_weight,
                             version=0,
                             priority=-index,
                             name="weight_" + str(index),
                             is_average=False)
Code Example #6
File: __init__.py Project: zprhhs/byteps
    def __init__(self,
                 params,
                 optimizer,
                 optimizer_params=None,
                 root_rank=0,
                 compression_params=None):
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn(
                "DistributedTrainer does not take DistributedOptimizer "
                "as its optimizer. We have unwrapped it for you.")

        param_list = []
        if isinstance(params, mx.gluon.ParameterDict):
            for key in sorted(list(params.keys())):
                param_list.append(params[key])

        self._intra_compressor = self._register_compressor(
            params, optimizer_params, compression_params)

        super(DistributedTrainer,
              self).__init__(param_list,
                             optimizer,
                             optimizer_params=optimizer_params,
                             kvstore=None)

        if local_rank() == 0:
            self._f = open("lr.s", "wb")
            self._f.truncate(8)

        self._bps_size = size()
        self.root_rank = root_rank
        self._intra_compressors = {}
        for i, param in enumerate(self._params):
            byteps_declare_tensor("parameter_" + str(i))
            self._intra_compressors[param.name] = copy.deepcopy(
                self._intra_compressor)
            if param.grad_req != 'null':
                byteps_params = dict(
                    filter(lambda attr: attr[0].startswith("byteps_", ),
                           param.__dict__.items()))
                byteps_declare_tensor("gradient_" + str(i), **byteps_params)
Code Example #7
File: __init__.py Project: zmxdream/byteps
def broadcast_parameters(params, root_rank=0):
    """
    Broadcasts the parameters from root rank to all other processes.
    Typical usage is to broadcast the `Module.get_params()`.

    Arguments:
        params: dict of parameters to broadcast
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
    """
    global parameter_index

    if isinstance(params, dict):
        tensors = [p for _, p in sorted(params.items())]

        # Run tensor initialization
        for i in range(len(tensors)):
            byteps_declare_tensor(tensors[i],
                                  "parameter_" + str(parameter_index))
            # Broadcast is implemented as push + pull in BytePS
            # To broadcast: we should zero-out all non-root tensors, and disable push_pull average
            if rank() != root_rank:
                tensors[i].__imul__(0)
            byteps_push_pull(tensors[i],
                             version=0,
                             priority=0,
                             name="parameter_" + str(parameter_index),
                             is_average=False)
            parameter_index += 1

        # Make sure tensors pushed to MXNet engine get processed such that all
        # workers are synced before starting training.
        for tensor in tensors:
            tensor.wait_to_read()

    elif isinstance(params, mx.gluon.parameter.ParameterDict):
        raise TypeError("For gluon users, you should not call this function. "
                        "DistributedTrainer will broadcast all parameters at "
                        "the first training step.")

    else:
        raise ValueError('Invalid params of type: %s' % type(params))
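A sketch of the calling contract in this newer variant, assuming byteps.mxnet is importable as bps and that bps.init() has run: plain dicts of NDArrays are broadcast, while a Gluon ParameterDict is rejected because DistributedTrainer now performs the broadcast itself at the first step.

import mxnet as mx
import byteps.mxnet as bps

bps.init()

weights = {'fc_weight': mx.nd.random.uniform(shape=(10, 100)),
           'fc_bias': mx.nd.zeros((10,))}
bps.broadcast_parameters(weights, root_rank=0)   # OK: dict of NDArrays

net = mx.gluon.nn.Dense(10)
net.initialize()
try:
    bps.broadcast_parameters(net.collect_params())  # ParameterDict
except TypeError as err:
    print(err)  # Gluon users rely on DistributedTrainer instead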
Code Example #8
File: __init__.py Project: zmxdream/byteps
    def _init_params(self):
        tensors = []
        for param in self._params_to_init:
            if param._deferred_init:
                tensors.append(param)
            else:
                param_arrays = param._check_and_get(param._data, list)
                idx = self._param2idx[param.name]
                byteps_declare_tensor(param_arrays[0], "parameter_" + str(idx))

                if rank() != self.root_rank:
                    param_arrays[0].__imul__(0)
                byteps_push_pull(param_arrays[0],
                                 version=0,
                                 priority=0,
                                 name="parameter_" + str(idx),
                                 is_average=False)
                param_arrays[0].wait_to_read()

        self._params_to_init = tensors
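The zero-out-then-sum trick used in _init_params (and in broadcast_parameters above) turns a broadcast into the existing non-averaging push_pull. A small NumPy illustration of why every worker ends up with the root rank's values:

import numpy as np

root_value = np.array([1.0, 2.0, 3.0])
num_workers = 4

# Each worker contributes either the root's tensor or zeros ...
contributions = [root_value if r == 0 else np.zeros_like(root_value)
                 for r in range(num_workers)]

# ... so the summed (is_average=False) push_pull reproduces the root's tensor.
result = sum(contributions)
assert np.allclose(result, root_value)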