def compress(self, sync_buffer):
    # select the values/indices of the tensor (to be transmitted).
    selected_values, selected_indices = [], []

    for half_param, hat_param in zip(
        sync_buffer["flatten_half_params"], sync_buffer["flatten_params"]
    ):
        _selected_values, _selected_indices = self.compressor_fn.compress(
            half_param - hat_param,
            self.comm_op,
            self.compress_ratio,
            self.is_biased,
        )
        selected_values.append(_selected_values)
        selected_indices.append(_selected_indices)

    # get selected shapes.
    selected_shapes = [len(_value) for _value in selected_values]

    # flatten selected values/indices.
    flatten_selected_values = TensorBuffer(selected_values)
    flatten_selected_indices = TensorBuffer(selected_indices)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_selected_values.buffer) + get_n_bits(
        flatten_selected_indices.buffer
    )

    # update shared dict.
    sync_buffer["selected_shapes"] = selected_shapes
    sync_buffer["flatten_selected_values"] = flatten_selected_values
    sync_buffer["flatten_selected_indices"] = flatten_selected_indices
    sync_buffer["n_bits"] = n_bits
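# The functions in this section rely on `TensorBuffer` and `get_n_bits`.
# Below is a minimal sketch of what they are assumed to do (not the actual
# repository implementation): `TensorBuffer` flattens a list of tensors into
# one contiguous buffer (and can iterate over / unpack it back), while
# `get_n_bits` counts the bits needed to transmit a tensor.
import torch


class TensorBuffer:
    """Pack a list of tensors into one flat buffer; iterate over it as views."""

    def __init__(self, tensors):
        self._shapes = [t.shape for t in tensors]
        self._sizes = [t.numel() for t in tensors]
        self.buffer = torch.cat([t.contiguous().view(-1) for t in tensors])

    def __iter__(self):
        offset = 0
        for shape, size in zip(self._shapes, self._sizes):
            yield self.buffer[offset : offset + size].view(shape)
            offset += size

    def unpack(self, tensors):
        # copy the (possibly updated) buffer back into the original tensors.
        for tensor, view in zip(tensors, self):
            tensor.data.copy_(view)


def get_n_bits(tensor):
    # element_size() is in bytes, hence the factor of 8.
    return 8 * tensor.nelement() * tensor.element_size()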
def compress(self, grads_tb):
    # get the scale (l1 norm) and sign of the gradient (to be transmitted).
    sync_buffer = dict()

    # get the per-tensor l1 norms and the packed signs.
    grad_norms_tb = TensorBuffer([grad.norm(p=1) for grad in grads_tb])
    signs, sign_size = self.compressor_fn.compress(grads_tb.buffer)

    # get compressed grad.
    synced_grads_tb = copy.deepcopy(grads_tb)
    for synced_grad, grad_norm, grad in zip(synced_grads_tb, grad_norms_tb, grads_tb):
        synced_grad.data.copy_(grad_norm * torch.sign(grad) / grad.nelement())

    # get n_bits to transmit.
    n_bits = get_n_bits(grad_norms_tb.buffer) + get_n_bits(signs)

    # update shared dict.
    sync_buffer["grad_norms_tb"] = grad_norms_tb
    sync_buffer["grads_tb"] = grads_tb
    sync_buffer["synced_grads_tb"] = synced_grads_tb
    sync_buffer["signs"] = signs
    sync_buffer["sign_size"] = sign_size
    sync_buffer["n_bits"] = n_bits
    return sync_buffer
def compress(self, sync_buffer):
    # get the per-tensor l1 norms and the packed signs.
    param_norms_tb = TensorBuffer(
        [param.norm(p=1) for param in sync_buffer["params_tb"]]
    )
    signs, sign_size = self.compressor_fn.compress(sync_buffer["params_tb"].buffer)

    # get compressed model.
    local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
    for local_compressed_param, param_norm, param in zip(
        local_compressed_params_tb, param_norms_tb, sync_buffer["params_tb"]
    ):
        local_compressed_param.data.copy_(
            param_norm * torch.sign(param) / param.nelement()
        )

    # get n_bits to transmit.
    n_bits = get_n_bits(param_norms_tb.buffer) + get_n_bits(signs)

    # update shared dict.
    sync_buffer["param_norms_tb"] = param_norms_tb
    sync_buffer["signs"] = signs
    sync_buffer["sign_size"] = sign_size
    sync_buffer["n_bits"] = n_bits
    return local_compressed_params_tb
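# In the two sign-based compressors above, `compressor_fn.compress(buffer)` is
# assumed to turn a flat tensor into its signs plus the original size (needed
# for decompression). A minimal sketch under that assumption; a real
# implementation would additionally bit-pack the signs before transmission.
import torch


class SignCompressor:
    def compress(self, buffer):
        # one entry per element: 1 for non-negative, 0 for negative.
        signs = (buffer >= 0).to(torch.uint8)
        sign_size = buffer.nelement()
        return signs, sign_size

    def uncompress(self, signs, sign_size):
        # map {0, 1} back to {-1.0, +1.0}.
        return signs[:sign_size].float().mul_(2.0).sub_(1.0)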
def step(self, closure=None, **kargs):
    if self.conf.is_centralized:
        with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
            # get and flatten the gradients.
            grads, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=True
            )
            flatten_grads = TensorBuffer(grads)

        with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
            # aggregate (average) the gradients across workers.
            flatten_grads.buffer = self.world_aggregator._agg(
                flatten_grads.buffer, op="avg", distributed=self.conf.distributed
            )

        with kargs["timer"]("sync/unflatten_grad", epoch=self.conf.epoch_):
            # unflatten grads.
            flatten_grads.unpack(grads)

        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_grads.buffer)
    else:
        with kargs["timer"]("sync/apply_grad", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        with kargs["timer"]("sync/get_data", epoch=self.conf.epoch_):
            # first get and flatten all params.
            params, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=False
            )
            flatten_params = TensorBuffer(params)

        with kargs["timer"]("sync/sync", epoch=self.conf.epoch_):
            # prepare the sync: move the buffer to cpu if we communicate on cpu.
            if self.conf.comm_device == "cpu":
                flatten_params.buffer = flatten_params.buffer.cpu().detach()

            # then sync with the neighborhood (weighted average).
            flatten_params.buffer = self.decentralized_aggregator._agg(
                flatten_params.buffer, op="weighted"
            )

        with kargs["timer"]("sync/update_model", epoch=self.conf.epoch_):
            # finally unflatten.
            flatten_params.unpack(params)

        # get n_bits to transmit.
        n_bits = get_n_bits(flatten_params.buffer)
    return n_bits
def compress_or_quantize(
    grad, comm_op, compressor_fn, compress_ratio, quantize_level, is_biased
):
    if "compress" in comm_op:
        values, indices = compressor_fn.compress(
            grad, comm_op, compress_ratio, is_biased
        )
        n_bits = get_n_bits(values) + get_n_bits(indices)
    elif "quantize" in comm_op:
        values = compressor_fn.compress(grad, comm_op, quantize_level, is_biased)
        indices = None
        n_bits = get_n_bits(values) * quantize_level / 32
    else:
        raise NotImplementedError
    return values, indices, n_bits
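# For the "compress" branch above, `compressor_fn.compress(grad, comm_op,
# compress_ratio, is_biased)` is assumed to be a sparsifier that keeps only a
# `compress_ratio` fraction of the entries and returns their values and
# indices. A minimal top-k / random-k sketch under that assumption:
import torch


class SparsificationCompressor:
    def compress(self, grad, comm_op, compress_ratio, is_biased):
        flat_grad = grad.view(-1)
        n = flat_grad.nelement()
        k = max(1, int(n * compress_ratio))

        if "top_k" in comm_op:
            # biased: keep the k entries with the largest magnitude.
            _, indices = torch.topk(flat_grad.abs(), k)
            values = flat_grad[indices]
        elif "random_k" in comm_op:
            # keep k uniformly random entries; rescale for an unbiased estimate.
            indices = torch.randperm(n, device=flat_grad.device)[:k]
            values = flat_grad[indices]
            if not is_biased:
                values = values * (n / k)
        else:
            raise NotImplementedError(comm_op)
        return values, indices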
def compress(self, sync_buffer):
    # get the quantized tensor (to be transmitted).
    quantized_values = []

    # compress and get compressed model.
    local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
    local_compressed_params_tb.buffer = torch.zeros_like(
        local_compressed_params_tb.buffer
    )

    for param, local_compressed_param in zip(
        sync_buffer["params_tb"], local_compressed_params_tb
    ):
        # quantize.
        _quantized_values = self.compressor_fn.compress(
            param, self.comm_op, self.quantize_level, self.is_biased
        )
        quantized_values.append(_quantized_values)

        # update the local compressed params.
        local_compressed_param.data.copy_(_quantized_values)

    # flatten the quantized values.
    flatten_updates = TensorBuffer(quantized_values)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

    # update shared dict.
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["n_bits"] = n_bits
    return local_compressed_params_tb
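# In the quantization-based compressors, `compressor_fn.compress(param,
# comm_op, quantize_level, is_biased)` is assumed to be a QSGD-style quantizer
# that maps each entry onto one of `2 ** quantize_level` levels and returns the
# values already dequantized (which matches how the callers copy them straight
# into the compressed model). A minimal sketch under those assumptions:
import torch


class QuantizationCompressor:
    def compress(self, param, comm_op, quantize_level, is_biased):
        flat = param.view(-1)
        norm = flat.norm(p=float("inf"))
        if norm == 0:
            return torch.zeros_like(param)

        # number of quantization levels per sign.
        s = 2 ** quantize_level - 1
        scaled = flat.abs() / norm * s
        if is_biased:
            rounded = scaled.round()
        else:
            # stochastic rounding keeps the quantization unbiased in expectation.
            lower = scaled.floor()
            rounded = lower + torch.bernoulli(scaled - lower)

        quantized = torch.sign(flat) * rounded * norm / s
        return quantized.view_as(param)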
def step(self, closure=None, **kargs):
    with kargs["timer"]("sync", epoch=self.conf.epoch_):
        # do the local update steps.
        with kargs["timer"]("local_update", epoch=self.conf.epoch_):
            utils.apply_gradient(
                self.param_groups, self.state, apply_grad_to_model=True
            )

        # enter the global sync if it satisfies the condition.
        if (
            self.conf.epoch_ < self.turn_on_local_step_from_epoch
            or self.conf.local_index % self.local_step == 0
        ):
            with kargs["timer"]("get_params", epoch=self.conf.epoch_):
                # get params.
                params, _ = comm.get_data(
                    self.param_groups, self.param_names, is_get_grad=False
                )
                params_tb = TensorBuffer(params)

            with kargs["timer"]("memory_and_compress", epoch=self.conf.epoch_):
                # get the params difference w.r.t. the previously synced model,
                # accumulated into the error-feedback memory.
                local_scale, local_sign = [], []
                for consensus_param, param, memory in zip(
                    self.consensus_params_tb, params_tb, self.memory_tb
                ):
                    memory.data.copy_(consensus_param - param + memory)

            # compress.
            with kargs["timer"]("directions", epoch=self.conf.epoch_):
                # `exchange` is assumed to sync the sign direction across workers.
                direction = exchange(self.memory_tb.buffer)  # signum

            with kargs["timer"]("memory_and_compress", epoch=self.conf.epoch_):
                for consensus_param, param, memory in zip(
                    self.consensus_params_tb, params_tb, self.memory_tb
                ):
                    _local_scale, _local_sign = scaled_sign(memory)
                    local_scale.append(_local_scale)
                    local_sign.append(_local_sign)

                    # update the error-feedback memory.
                    memory.data.copy_(memory - _local_scale * _local_sign)

            with kargs["timer"]("directions", epoch=self.conf.epoch_):
                # `TB` is assumed to wrap the synced direction buffer into a
                # TensorBuffer shaped like `memory_tb`.
                global_direction = TB(self.memory_tb, direction)

            with kargs["timer"]("magnitudes", epoch=self.conf.epoch_):
                magnitudes_tb = TensorBuffer(local_scale)
                magnitudes_tb.buffer = self.world_aggregator._agg(
                    magnitudes_tb.buffer, "avg", distributed=self.conf.distributed
                )

            # unpack the synced info and update the consensus params.
            with kargs["timer"]("update_consensus", epoch=self.conf.epoch_):
                for update_magnitude, update_direction, consensus_param in zip(
                    magnitudes_tb, global_direction, self.consensus_params_tb
                ):
                    consensus_param.add_(
                        -1.0, update_direction.mul(update_magnitude)
                    )

            # keep the local models consistent by assigning the consensus params.
            self.consensus_params_tb.unpack(params)
            n_bits = get_n_bits(magnitudes_tb.buffer)
        else:
            n_bits = 0
    return n_bits
def step(self, closure=None, **kargs): with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_): utils.apply_gradient(self.param_groups, self.state, apply_grad_to_model=True) with kargs["timer"]("sync.sync_and_update", epoch=self.conf.epoch_): # enter the global sync if it satisfies the condition. if (self.conf.epoch_ < self.turn_on_local_step_from_epoch or self.conf.local_index % self.local_step == 0): # get parmas. params, _ = comm.get_data(self.param_groups, self.param_names, is_get_grad=False) params_tb = TensorBuffer(params) # get params_diff. param_diff = self.consensus_params_tb.buffer - params_tb.buffer # sync the directions. param_diff = self.world_aggregator._agg( param_diff, "avg", distributed=self.conf.distributed) # unpack the synced info and update the consensus params. self.consensus_params_tb.buffer.add_(-1.0, param_diff) # consistent the local models by assigning the consensus params. self.consensus_params_tb.unpack(params) # Get n_bits to transmit. n_bits = get_n_bits(param_diff) else: n_bits = 0 return n_bits
def compress(self, sync_buffer):
    # select the values/indices of the tensor (to be transmitted).
    selected_values, selected_indices = [], []

    # compress and get compressed model.
    local_compressed_params_tb = deepcopy(sync_buffer["params_tb"])
    local_compressed_params_tb.buffer = torch.zeros_like(
        local_compressed_params_tb.buffer
    )

    for param, local_compressed_param in zip(
        sync_buffer["params_tb"], local_compressed_params_tb
    ):
        _selected_values, _selected_indices = self.compressor_fn.compress(
            param, self.comm_op, self.compress_ratio, self.is_biased
        )
        selected_values.append(_selected_values)
        selected_indices.append(_selected_indices)

        # update the local compressed params (scatter the selected values).
        local_compressed_param.data = local_compressed_param.data.view(-1)
        local_compressed_param.data[_selected_indices] = _selected_values
        local_compressed_param.data = local_compressed_param.data.view(*param.size())

    # get selected shapes.
    selected_shapes = [len(_value) for _value in selected_values]

    # flatten selected values/indices.
    flatten_selected_values = TensorBuffer(selected_values)
    flatten_selected_indices = TensorBuffer(selected_indices)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_selected_values.buffer) + get_n_bits(
        flatten_selected_indices.buffer
    )

    # update shared dict.
    sync_buffer["selected_shapes"] = selected_shapes
    sync_buffer["flatten_selected_values"] = flatten_selected_values
    sync_buffer["flatten_selected_indices"] = flatten_selected_indices
    sync_buffer["n_bits"] = n_bits
    return local_compressed_params_tb
def compress(self, sync_buffer):
    # get the norm (magnitude) and sign of the updates (to be transmitted).
    norms, updates = [], []
    for flatten_updated_param in sync_buffer["flatten_updated_params"]:
        _update = flatten_updated_param
        updates += [_update]
        norms += [_update.norm(p=1)]

    # flatten the norms/updates and compress the updates to signs.
    flatten_norms = TensorBuffer(norms)
    flatten_updates = TensorBuffer(updates)
    signs, sign_size = self.compressor_fn.compress(flatten_updates.buffer)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_norms.buffer) + get_n_bits(signs)

    # update shared dict.
    sync_buffer["flatten_norms"] = flatten_norms
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["signs"] = signs
    sync_buffer["sign_size"] = sign_size
    sync_buffer["n_bits"] = n_bits
def compress(self, grads_tb):
    # get the sign of the gradient (to be transmitted).
    sync_buffer = dict()

    # compress the flattened gradients to signs.
    signs, sign_size = self.compressor_fn.compress(grads_tb.buffer)

    # get n_bits to transmit.
    n_bits = get_n_bits(signs)

    # update shared dict.
    sync_buffer["grads_tb"] = grads_tb
    sync_buffer["signs"] = signs
    sync_buffer["sign_size"] = sign_size
    sync_buffer["n_bits"] = n_bits
    return sync_buffer
def compress(self, sync_buffer):
    # get the quantized updates (to be transmitted).
    quantized_values = []
    for flatten_updated_param in sync_buffer["flatten_updated_params"]:
        _quantized_values = self.compressor_fn.compress(
            flatten_updated_param, self.comm_op, self.quantize_level, self.is_biased
        )
        quantized_values.append(_quantized_values)

    # flatten the quantized values.
    flatten_updates = TensorBuffer(quantized_values)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

    # update shared dict.
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["n_bits"] = n_bits
def compress(self, sync_buffer):
    # quantize the difference between the local and the hat params (to be transmitted).
    quantized_values = []
    for half_param, hat_param in zip(
        sync_buffer["flatten_params"], sync_buffer["flatten_hat_params"]
    ):
        _quantized_values = self.compressor_fn.compress(
            half_param - hat_param,
            self.comm_op,
            self.quantize_level,
            self.is_biased,
        )
        quantized_values.append(_quantized_values)

    # flatten the quantized values.
    flatten_updates = TensorBuffer(quantized_values)

    # get n_bits to transmit.
    n_bits = get_n_bits(flatten_updates.buffer) * self.quantize_level / 32

    # update shared dict.
    sync_buffer["flatten_updates"] = flatten_updates
    sync_buffer["n_bits"] = n_bits
def step(self, closure=None, **kargs):
    # do the local update steps.
    with kargs["timer"]("sync.local_update", epoch=self.conf.epoch_):
        for group in self.param_groups:
            weight_decay = group["weight_decay"]
            momentum = group["momentum"]
            dampening = group["dampening"]
            nesterov = group["nesterov"]

            for p in group["params"]:
                # get param_state.
                param_state = self.state[p]

                # get the gradient.
                if p.grad is None:
                    continue
                d_p = p.grad.data

                # add the weight decay.
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)

                # apply the momentum.
                if momentum != 0:
                    if "momentum_buffer" not in param_state:
                        buf = param_state["momentum_buffer"] = torch.zeros_like(
                            p.data
                        )
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state["momentum_buffer"]
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf

                # get the local sign and apply it to the local model.
                p.data.add_(-group["lr"], torch.sign(d_p))

    # enter the global sync if it satisfies the condition.
    if (
        self.conf.epoch_ < self.turn_on_local_step_from_epoch
        or self.conf.local_index % self.local_step == 0
    ):
        with kargs["timer"]("sync.get_params", epoch=self.conf.epoch_):
            # get params.
            params, _ = comm.get_data(
                self.param_groups, self.param_names, is_get_grad=False
            )
            params_tb = TensorBuffer(params)

            # get the params difference w.r.t. the previously synced model.
            local_scale, local_sign = [], []
            for consensus_param, param in zip(self.consensus_params_tb, params_tb):
                _local_scale, _local_sign = scaled_sign(consensus_param - param)
                local_scale.append(_local_scale)
                local_sign.append(_local_sign)

            # concat the update magnitudes and directions.
            magnitudes_tb = TensorBuffer(local_scale)
            directions_tb = TensorBuffer(local_sign)

        # sync and decompress.
        with kargs["timer"]("sync.sync_and_decompress", epoch=self.conf.epoch_):
            # sync the directions and magnitudes.
            directions_tb.buffer = self.world_aggregator._agg(
                directions_tb.buffer, "avg", distributed=self.conf.distributed
            )
            magnitudes_tb.buffer = self.world_aggregator._agg(
                magnitudes_tb.buffer, "avg", distributed=self.conf.distributed
            )

        # unpack the synced info and update the consensus params.
        with kargs["timer"]("sync.update_consensus", epoch=self.conf.epoch_):
            for update_magnitude, update_direction, consensus_param in zip(
                magnitudes_tb, directions_tb, self.consensus_params_tb
            ):
                consensus_param.add_(-1.0, update_direction.mul(update_magnitude))

            # keep the local models consistent by assigning the consensus params.
            self.consensus_params_tb.unpack(params)

        n_bits = get_n_bits(directions_tb.buffer) + get_n_bits(magnitudes_tb.buffer)
    else:
        n_bits = 0
    return n_bits
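# `scaled_sign` above is assumed to return an l1-norm-based magnitude together
# with the sign direction of a tensor, matching the `norm(p=1) / nelement()`
# scaling used by the sign compressors earlier in this section. A minimal
# sketch under that assumption:
import torch


def scaled_sign(x):
    # scale so that `scale * sign(x)` has the same l1 norm as `x`.
    scale = x.norm(p=1) / x.nelement()
    sign = torch.sign(x)
    return scale, sign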