Example #1
    def __init__(self, params, optimizer, optimizer_params=None, root_rank=0):
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn(
                "DistributedTrainer does not take DistributedOptimizer "
                "as its optimizer. We have unwrapped it for you.")

        # Convert a ParameterDict into a list with a deterministic (sorted)
        # order so parameter/gradient tensor indices match across workers.
        param_list = []
        if isinstance(params, mx.gluon.ParameterDict):
            for key in sorted(list(params.keys())):
                param_list.append(params[key])

        super(DistributedTrainer,
              self).__init__(param_list,
                             optimizer,
                             optimizer_params=optimizer_params,
                             kvstore=None)

        # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
        # function. Normalizing it by BytePS size, which is equivalent to performing
        # average in push_pull, has better performance.
        self._scale /= size()
        self.root_rank = root_rank
        for i, param in enumerate(self._params):
            byteps_declare_tensor("parameter_" + str(i))
            if param.grad_req != 'null':
                byteps_declare_tensor("gradient_" + str(i))
Example #2
    def __init__(self, params, optimizer, optimizer_params=None):
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn(
                "DistributedTrainer does not take DistributedOptimizer "
                "as its optimizer. We have unwrapped it for you.")

        super(DistributedTrainer,
              self).__init__(params,
                             optimizer,
                             optimizer_params=optimizer_params,
                             kvstore=None)

        # _scale is used to check and set rescale_grad for optimizer in Trainer.step()
        # function. Normalizing it by BytePS size, which is equivalent to performing
        # average in push_pull, has better performance.
        self._scale /= size()
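The same construction works for this variant, assuming a BytePS build that exposes this simpler signature; here params is passed straight through to mx.gluon.Trainer, so a plain list of Parameter objects is also accepted (model and hyperparameters are again illustrative):

import mxnet as mx
import byteps.mxnet as bps

bps.init()
net = mx.gluon.nn.Dense(10)             # hypothetical model
net.initialize()

# A list of Parameters works because gluon.Trainer accepts lists as well as ParameterDicts.
param_list = list(net.collect_params().values())
trainer = bps.DistributedTrainer(param_list, "sgd", {"learning_rate": 0.01})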
Example #3
    def __init__(self,
                 params,
                 optimizer,
                 optimizer_params=None,
                 root_rank=0,
                 compression_params=None):
        if isinstance(optimizer, DistributedOptimizer):
            optimizer = optimizer._optimizer
            warnings.warn(
                "DistributedTrainer does not take DistributedOptimizer "
                "as its optimizer. We have unwrapped it for you.")

        param_list = []
        if isinstance(params, mx.gluon.ParameterDict):
            for key in sorted(list(params.keys())):
                param_list.append(params[key])

        self._intra_compressor = self._register_compressor(
            params, optimizer_params, compression_params)

        super(DistributedTrainer,
              self).__init__(param_list,
                             optimizer,
                             optimizer_params=optimizer_params,
                             kvstore=None)

        if local_rank() == 0:
            self._f = open("lr.s", "wb")
            self._f.truncate(8)

        self._bps_size = size()
        self.root_rank = root_rank
        self._intra_compressors = {}
        for i, param in enumerate(self._params):
            byteps_declare_tensor("parameter_" + str(i))
            self._intra_compressors[param.name] = copy.deepcopy(
                self._intra_compressor)
            if param.grad_req != 'null':
                # Forward any per-parameter byteps_* attributes (e.g. compression
                # settings attached to the Parameter) when declaring its gradient.
                byteps_params = dict(
                    filter(lambda attr: attr[0].startswith("byteps_"),
                           param.__dict__.items()))
                byteps_declare_tensor("gradient_" + str(i), **byteps_params)