Example #1
    def compress(self, sync_buffer):
        # get the selected values/indices for the tensor (to be transmitted).
        selected_values, selected_indices = [], []

        for half_param, hat_param in zip(sync_buffer["flatten_half_params"],
                                         sync_buffer["flatten_params"]):
            _selected_values, _selected_indices = self.compressor_fn.compress(
                half_param - hat_param,
                self.comm_op,
                self.compress_ratio,
                self.is_biased,
            )
            selected_values.append(_selected_values)
            selected_indices.append(_selected_indices)

        # get selected shapes.
        selected_shapes = [len(_value) for _value in selected_values]

        # flatten selected values/indices.
        flatten_selected_values = TensorBuffer(selected_values)
        flatten_selected_indices = TensorBuffer(selected_indices)

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_selected_values.buffer) + get_n_bits(
            flatten_selected_indices.buffer)

        # update shared dict.
        sync_buffer["selected_shapes"] = selected_shapes
        sync_buffer["flatten_selected_values"] = flatten_selected_values
        sync_buffer["flatten_selected_indices"] = flatten_selected_indices
        sync_buffer["n_bits"] = n_bits
Example #2
    def compress(self, grads_tb):
        # get the sign/magnitude for the tensor (to be transmitted).
        sync_buffer = dict()

        # flatten selected values/indices.
        grad_norms_tb = TensorBuffer([grad.norm(p=1) for grad in grads_tb])
        signs, sign_size = self.compressor_fn.compress(grads_tb.buffer)

        # get compressed grad.
        synced_grads_tb = copy.deepcopy(grads_tb)
        for synced_grad, grad_norm, grad in zip(synced_grads_tb, grad_norms_tb,
                                                grads_tb):
            synced_grad.data.copy_(grad_norm * torch.sign(grad) /
                                   grad.nelement())

        # get n_bits to transmit.
        n_bits = get_n_bits(grad_norms_tb.buffer) + get_n_bits(signs)

        # update shared dict.
        sync_buffer["grad_norms_tb"] = grad_norms_tb
        sync_buffer["grads_tb"] = grads_tb
        sync_buffer["synced_grads_tb"] = synced_grads_tb
        sync_buffer["signs"] = signs
        sync_buffer["sign_size"] = sign_size
        sync_buffer["n_bits"] = n_bits
        return sync_buffer
Example #3
    def compress(self, sync_buffer):
        # flatten selected values/indices.
        param_norms_tb = TensorBuffer(
            [param.norm(p=1) for param in sync_buffer["params_tb"]]
        )
        signs, sign_size = self.compressor_fn.compress(sync_buffer["params_tb"].buffer)

        # get compressed model.
        local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
        for local_compressed_param, param_norm, param in zip(
            local_compressed_params_tb, param_norms_tb, sync_buffer["params_tb"]
        ):
            local_compressed_param.data.copy_(
                param_norm * torch.sign(param) / param.nelement()
            )

        # get n_bits to transmit.
        n_bits = get_n_bits(param_norms_tb.buffer) + get_n_bits(signs)

        # update shared dict.
        sync_buffer["param_norms_tb"] = param_norms_tb
        sync_buffer["signs"] = signs
        sync_buffer["sign_size"] = sign_size
        sync_buffer["n_bits"] = n_bits
        return local_compressed_params_tb
Example #4
    def step(self, closure=None, **kargs):
        if self.conf.is_centralized:
            with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
                # Get data.
                grads, _ = comm.get_data(self.param_groups,
                                         self.param_names,
                                         is_get_grad=True)
                flatten_grads = TensorBuffer(grads)

            with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
                # Aggregate the gradients.
                flatten_grads.buffer = self.world_aggregator._agg(
                    flatten_grads.buffer,
                    op="avg",
                    distributed=self.conf.distributed)

            with kargs["timer"]("sync/unflatten_grad", epoch=self.conf.epoch_):
                # unflatten grads.
                flatten_grads.unpack(grads)

            with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            # Get n_bits to transmit.
            n_bits = get_n_bits(flatten_grads.buffer)
        else:
            with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
                # first get and flatten all params.
                params, _ = comm.get_data(self.param_groups,
                                          self.param_names,
                                          is_get_grad=False)
                flatten_params = TensorBuffer(params)

            with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
                # prepare the sync.
                if self.conf.comm_device == "cpu":
                    flatten_params.buffer = flatten_params.buffer.cpu().detach_()

                # then sync.
                flatten_params.buffer = self.decentralized_aggregator._agg(
                    flatten_params.buffer, op="weighted")

            with kargs["timer"]("sync/update_model", epoch=self.conf.epoch_):
                # finally unflatten.
                flatten_params.unpack(params)

            # Get n_bits to transmit.
            n_bits = get_n_bits(flatten_params.buffer)
        return n_bits
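
The step() methods above receive a timer through kargs["timer"] and use it as a context manager that takes a label and an epoch keyword. A minimal stand-in with that call signature, assuming it only needs to accumulate wall-clock time per label, could look like this (hypothetical, for running the snippets outside the full training framework):

import contextlib
import time
from collections import defaultdict

class Timer(object):
    # Hypothetical timer compatible with `kargs["timer"]("label", epoch=...)`.
    def __init__(self):
        self.totals = defaultdict(float)

    @contextlib.contextmanager
    def __call__(self, label, epoch=None):
        start = time.time()
        try:
            yield
        finally:
            self.totals[label] += time.time() - start
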
Example #5
def compress_or_quantize(grad, comm_op, compressor_fn, compress_ratio,
                         quantize_level, is_biased):
    if "compress" in comm_op:
        values, indices = compressor_fn.compress(grad, comm_op, compress_ratio,
                                                 is_biased)

        n_bits = get_n_bits(values) + get_n_bits(indices)
    elif "quantize" in comm_op:
        values = compressor_fn.compress(grad, comm_op, quantize_level,
                                        is_biased)
        indices = None

        n_bits = get_n_bits(values) * quantize_level / 32
    else:
        raise NotImplementedError
    return values, indices, n_bits
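
get_n_bits is used throughout to account for the communication volume. A plausible definition, assuming it simply counts the bits of a dense tensor at its native element width, is:

def get_n_bits(tensor):
    # number of bits occupied by a dense tensor at its native precision.
    return 8 * tensor.nelement() * tensor.element_size()
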
Example #6
    def compress(self, sync_buffer):
        # get the quantized values for the tensor (to be transmitted).
        quantized_values = []

        # compress and get compressed model.
        local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
        local_compressed_params_tb.buffer = torch.zeros_like(
            local_compressed_params_tb.buffer
        )
        for param, local_compressed_param in zip(
            sync_buffer["params_tb"], local_compressed_params_tb
        ):
            # quantize.
            _quantized_values = self.compressor_fn.compress(
                param, self.comm_op, self.quantize_level, self.is_biased
            )
            quantized_values.append(_quantized_values)

            # update the local compressed params.
            local_compressed_param.data.copy_(_quantized_values)

        # flatten selected values/indices.
        flatten_updates = TensorBuffer(quantized_values)

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

        # update shared dict.
        sync_buffer["flatten_updates"] = flatten_updates
        sync_buffer["n_bits"] = n_bits
        return local_compressed_params_tb
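
The quantization branches call compressor_fn.compress(param, comm_op, quantize_level, is_biased) and expect back a tensor of the same shape, whose transmission cost is then discounted by quantize_level / 32. A QSGD-style stochastic quantizer with that interface could be sketched as follows (hypothetical: comm_op and is_biased are ignored here, and quantize_level is taken to be a bit width):

import torch

def quantize(tensor, comm_op, quantize_level, is_biased=True):
    # QSGD-style stochastic quantization onto s = 2^b - 1 levels.
    s = 2 ** quantize_level - 1
    norm = tensor.norm(p=2)
    if norm == 0:
        return torch.zeros_like(tensor)
    level = tensor.abs() / norm * s
    lower = level.floor()
    # stochastic rounding keeps the quantizer unbiased in expectation.
    rounded = lower + torch.bernoulli(level - lower)
    return torch.sign(tensor) * rounded * norm / s
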
Example #7
    def step(self, closure=None, **kargs):
        with kargs['timer']('sync', epoch=self.conf.epoch_):
            # do the local update steps.
            with kargs["timer"]("local_update", epoch=self.conf.epoch_):
                utils.apply_gradient(self.param_groups,
                                     self.state,
                                     apply_grad_to_model=True)

            # enter the global sync if it satisfies the condition.
            if (self.conf.epoch_ < self.turn_on_local_step_from_epoch
                    or self.conf.local_index % self.local_step == 0):
                with kargs["timer"]("get_params", epoch=self.conf.epoch_):
                    # get params.
                    params, _ = comm.get_data(self.param_groups,
                                              self.param_names,
                                              is_get_grad=False)
                    params_tb = TensorBuffer(params)
                with kargs['timer']('memory_and_compress',
                                    epoch=self.conf.epoch_):
                    # get the params difference w.r.t. previous synced model.
                    local_scale, local_sign = [], []
                    for consensus_param, param, memory in zip(
                            self.consensus_params_tb, params_tb,
                            self.memory_tb):
                        memory.data.copy_(consensus_param - param + memory)
                with kargs["timer"]("directions", epoch=self.conf.epoch_):
                    direction = exchange(self.memory_tb.buffer)  # signum
                with kargs['timer']('memory_and_compress',
                                    epoch=self.conf.epoch_):
                    for consensus_param, param, memory in zip(
                            self.consensus_params_tb, params_tb,
                            self.memory_tb):
                        _local_scale, _local_sign = scaled_sign(memory)
                        local_scale.append(_local_scale)
                        local_sign.append(_local_sign)
                        memory.data.copy_(memory - _local_scale * _local_sign)
                with kargs["timer"]("directions", epoch=self.conf.epoch_):
                    global_direction = TB(self.memory_tb, direction)
                with kargs["timer"]("magnitudes", epoch=self.conf.epoch_):
                    magnitudes_tb = TensorBuffer(local_scale)
                    magnitudes_tb.buffer = self.world_aggregator._agg(
                        magnitudes_tb.buffer,
                        "avg",
                        distributed=self.conf.distributed)
                # unpack the synced info and update the consensus params.
                with kargs["timer"]("update_consensus",
                                    epoch=self.conf.epoch_):
                    for update_magnitude, update_direction, consensus_param in zip(
                            magnitudes_tb, global_direction,
                            self.consensus_params_tb):
                        consensus_param.add_(
                            -1.0, update_direction.mul(update_magnitude))

                # make the local models consistent by assigning the consensus params.
                self.consensus_params_tb.unpack(params)
                n_bits = get_n_bits(magnitudes_tb.buffer)
            else:
                n_bits = 0
            return n_bits
Example #8
    def step(self, closure=None, **kargs):
        with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_):
            utils.apply_gradient(self.param_groups,
                                 self.state,
                                 apply_grad_to_model=True)

        with kargs["timer"]("sync.sync_and_update", epoch=self.conf.epoch_):
            # enter the global sync if it satisfies the condition.
            if (self.conf.epoch_ < self.turn_on_local_step_from_epoch
                    or self.conf.local_index % self.local_step == 0):
                # get params.
                params, _ = comm.get_data(self.param_groups,
                                          self.param_names,
                                          is_get_grad=False)
                params_tb = TensorBuffer(params)

                # get params_diff.
                param_diff = self.consensus_params_tb.buffer - params_tb.buffer
                # sync the directions.
                param_diff = self.world_aggregator._agg(
                    param_diff, "avg", distributed=self.conf.distributed)

                # unpack the synced info and update the consensus params.
                self.consensus_params_tb.buffer.add_(-1.0, param_diff)

                # make the local models consistent by assigning the consensus params.
                self.consensus_params_tb.unpack(params)

                # Get n_bits to transmit.
                n_bits = get_n_bits(param_diff)
            else:
                n_bits = 0
        return n_bits
Example #9
    def compress(self, sync_buffer):
        # get the selected values/indices for the tensor (to be transmitted).
        selected_values, selected_indices = [], []

        # compress and get compressed model.
        local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
        local_compressed_params_tb.buffer = torch.zeros_like(
            local_compressed_params_tb.buffer
        )
        for param, local_compressed_param in zip(
            sync_buffer["params_tb"], local_compressed_params_tb
        ):
            _selected_values, _selected_indices = self.compressor_fn.compress(
                param, self.comm_op, self.compress_ratio, self.is_biased
            )
            selected_values.append(_selected_values)
            selected_indices.append(_selected_indices)

            # update the local compressed params.
            local_compressed_param.data = local_compressed_param.data.view(-1)
            local_compressed_param.data[_selected_indices] = _selected_values
            local_compressed_param.data = local_compressed_param.data.view(*param.size())

        # get selected shapes.
        selected_shapes = [len(_value) for _value in selected_values]

        # flatten selected values/indices.
        flatten_selected_values = TensorBuffer(selected_values)
        flatten_selected_indices = TensorBuffer(selected_indices)

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_selected_values.buffer) + get_n_bits(
            flatten_selected_indices.buffer
        )

        # update shared dict.
        sync_buffer["selected_shapes"] = selected_shapes
        sync_buffer["flatten_selected_values"] = flatten_selected_values
        sync_buffer["flatten_selected_indices"] = flatten_selected_indices
        sync_buffer["n_bits"] = n_bits
        return local_compressed_params_tb
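
The sparsification branches call compressor_fn.compress(tensor, comm_op, compress_ratio, is_biased) and expect flat value/index tensors back, which is also what the indexing in the loop above assumes. A simple top-k sparsifier with that interface might look like this (hypothetical: comm_op is ignored and is_biased is assumed to mean biased top-k selection, with compress_ratio the fraction of entries kept):

import torch

def sparsify(tensor, comm_op, compress_ratio, is_biased=True):
    # keep the k largest-magnitude entries of the flattened tensor.
    flat = tensor.reshape(-1)
    k = max(1, int(flat.numel() * compress_ratio))
    _, indices = torch.topk(flat.abs(), k)
    values = flat[indices]
    return values, indices
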
Example #10
    def compress(self, sync_buffer):
        # get the sign/magnitude for the tensor (to be transmitted).
        norms, updates = [], []
        for flatten_updated_param in sync_buffer["flatten_updated_params"]:
            _update = flatten_updated_param
            updates += [_update]
            norms += [_update.norm(p=1)]

        # flatten selected values/indices.
        flatten_norms = TensorBuffer(norms)
        flatten_updates = TensorBuffer(updates)
        signs, sign_size = self.compressor_fn.compress(flatten_updates.buffer)

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_norms.buffer) + get_n_bits(signs)

        # update shared dict.
        sync_buffer["flatten_norms"] = flatten_norms
        sync_buffer["flatten_updates"] = flatten_updates
        sync_buffer["signs"] = signs
        sync_buffer["sign_size"] = sign_size
        sync_buffer["n_bits"] = n_bits
Example #11
    def compress(self, grads_tb):
        # get the sign/magnitude for the tensor (to be transmitted).
        sync_buffer = dict()

        # concat the update magnitude and directions.
        signs, sign_size = self.compressor_fn.compress(grads_tb.buffer)

        # get n_bits to transmit.
        n_bits = get_n_bits(signs)

        # update shared dict.
        sync_buffer["grads_tb"] = grads_tb
        sync_buffer["signs"] = signs
        sync_buffer["sign_size"] = sign_size
        sync_buffer["n_bits"] = n_bits
        return sync_buffer
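
Several examples call compressor_fn.compress(buffer) and get back (signs, sign_size), which suggests the signs are bit-packed before transmission. One way such a compressor could be implemented (a sketch, not the actual library code) is to pack one sign bit per entry into uint8 bytes:

import torch

class SignCompressor(object):
    # Hypothetical bit-packing sign compressor with the
    # compress(buffer) -> (signs, sign_size) interface used above.
    def compress(self, tensor):
        bits = (tensor >= 0).to(torch.uint8)
        pad = (-bits.numel()) % 8
        if pad:
            bits = torch.cat([bits, bits.new_zeros(pad)])
        weights = (2 ** torch.arange(8, device=tensor.device)).to(torch.uint8)
        packed = (bits.view(-1, 8) * weights).sum(dim=1).to(torch.uint8)
        return packed, tensor.numel()

    def uncompress(self, packed, sign_size):
        weights = (2 ** torch.arange(8, device=packed.device)).to(torch.uint8)
        bits = ((packed.unsqueeze(1) & weights) > 0).to(torch.float32)
        # map {0, 1} back to {-1, +1} and drop the padding.
        return bits.view(-1)[:sign_size] * 2 - 1
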
Example #12
    def compress(self, sync_buffer):
        # get the quantized values for the tensor (to be transmitted).
        quantized_values = []

        for flatten_updated_param in sync_buffer["flatten_updated_params"]:
            _quantized_values = self.compressor_fn.compress(
                flatten_updated_param, self.comm_op, self.quantize_level,
                self.is_biased)
            quantized_values.append(_quantized_values)

        # flatten selected values/indices.
        flatten_updates = TensorBuffer(quantized_values)

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

        # update shared dict.
        sync_buffer["flatten_updates"] = flatten_updates
        sync_buffer["n_bits"] = n_bits
Example #13
    def compress(self, sync_buffer):
        quantized_values = []

        for half_param, hat_param in zip(sync_buffer["flatten_params"],
                                         sync_buffer["flatten_hat_params"]):
            _quantized_values = self.compressor_fn.compress(
                half_param - hat_param,
                self.comm_op,
                self.quantize_level,
                self.is_biased,
            )
            quantized_values.append(_quantized_values)

        # flatten selected values/indices.
        flatten_updates = TensorBuffer(quantized_values)

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

        # update shared dict.
        sync_buffer["flatten_updates"] = flatten_updates
        sync_buffer["n_bits"] = n_bits

    def step(self, closure=None, **kargs):
        # do the local update steps.
        with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_):
            for group in self.param_groups:
                weight_decay = group["weight_decay"]
                momentum = group["momentum"]
                dampening = group["dampening"]
                nesterov = group["nesterov"]

                for p in group["params"]:
                    # get param_state
                    param_state = self.state[p]

                    # get the gradient
                    if p.grad is None:
                        continue
                    d_p = p.grad.data

                    # add the weight decay and apply the momentum.
                    if weight_decay != 0:
                        d_p.add_(weight_decay, p.data)
                    # apply the momentum.
                    if momentum != 0:
                        if "momentum_buffer" not in param_state:
                            buf = param_state["momentum_buffer"] = torch.zeros_like(
                                p.data
                            )
                            buf.mul_(momentum).add_(d_p)
                        else:
                            buf = param_state["momentum_buffer"]
                            buf.mul_(momentum).add_(1 - dampening, d_p)
                        if nesterov:
                            d_p = d_p.add(momentum, buf)
                        else:
                            d_p = buf

                    # get the local sign and apply to the local model.
                    p.data.add_(-group["lr"], torch.sign(d_p))

        # enter the global sync if it satisfies the condition.
        if (
            self.conf.epoch_ < self.turn_on_local_step_from_epoch
            or self.conf.local_index % self.local_step == 0
        ):
            with kargs["timer"]("sync.get_params", epoch=self.conf.epoch_):
                # get params.
                params, _ = comm.get_data(
                    self.param_groups, self.param_names, is_get_grad=False
                )
                params_tb = TensorBuffer(params)

            # get the params difference w.r.t. previous synced model.
            local_scale, local_sign = [], []
            for consensus_param, param in zip(self.consensus_params_tb, params_tb):
                _local_scale, _local_sign = scaled_sign(consensus_param - param)
                local_scale.append(_local_scale)
                local_sign.append(_local_sign)

            # concat the update magnitude and directions.
            magnitudes_tb = TensorBuffer(local_scale)
            directions_tb = TensorBuffer(local_sign)

            # sync and decompress.
            with kargs["timer"]("sync.sync_and_decompress", epoch=self.conf.epoch_):
                # sync the directions.
                directions_tb.buffer = self.world_aggregator._agg(
                    directions_tb.buffer, "avg", distributed=self.conf.distributed
                )
                magnitudes_tb.buffer = self.world_aggregator._agg(
                    magnitudes_tb.buffer, "avg", distributed=self.conf.distributed
                )

            # unpack the synced info and update the consensus params.
            with kargs["timer"]("sync.update_consensus", epoch=self.conf.epoch_):
                for update_magnitude, update_direction, consensus_param in zip(
                    magnitudes_tb, directions_tb, self.consensus_params_tb
                ):
                    consensus_param.add_(-1.0, update_direction.mul(update_magnitude))

            # make the local models consistent by assigning the consensus params.
            self.consensus_params_tb.unpack(params)
            n_bits = get_n_bits(directions_tb.buffer) + get_n_bits(magnitudes_tb.buffer)
        else:
            n_bits = 0
        return n_bits
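
Examples #7 and #13 compress via a scaled_sign helper, while Examples #2 and #3 apply the equivalent update norm(p=1) * sign(x) / nelement() inline, so a consistent definition (an assumption based on those call sites) would be:

import torch

def scaled_sign(x):
    # scale = average absolute value, direction = element-wise sign,
    # so that scale * sign reproduces the update used in Examples #2/#3.
    scale = x.norm(p=1) / x.nelement()
    return scale, torch.sign(x)
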