def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p``'s flattened gradient and launch asynchronous allgathers.

    Args:
        p: parameter whose ``.data`` holds the (merged) gradient to reduce.
        name: unique tensor name used to tag the collective ops.
        density: target compression ratio passed to the compressor.

    Returns:
        ``((handle, handle_idx), ctx)`` — ``handle_idx`` is ``None`` when the
        compressor returns no index set (ctx is None), e.g. quantization.
    """
    stime = time.time()
    tensor = p.data.view(-1)
    tensor_compressed, ctx, selected_values = self._compression.compress(
        tensor, name, ratio=density)
    # NOTE(review): removed a permanently-disabled ("if False and ...")
    # branch that dumped raw gradients via np.save every 200 iterations;
    # it could never run. Recover from version control if needed.
    indexes = ctx
    if indexes is None:
        # No index set: the whole compressed payload is gathered.
        handle = allgather_async(tensor_compressed, name)
        handle_idx = None  # quantization uses all indices
    else:
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name + '_indexes')
    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - stime)
    return (handle, handle_idx), ctx
def _sparse_allreduce_async(self, p, name):
    """Compress the flattened gradient of ``p`` at ``self._density`` and
    start allgather collectives for the selected values and their indexes.

    Returns ``((values_handle, indexes_handle), ctx)`` where ``ctx`` is the
    index tensor produced by the compressor.
    """
    start = time.time()
    flat = p.data.view(-1)
    _, ctx, picked = self._compression.compress(flat, name,
                                                ratio=self._density)
    value_handle = allgather_async(picked, name)
    index_handle = allgather_async(ctx.int(), name + '_indexes')
    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - start)
    return (value_handle, index_handle), ctx
def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p``'s flattened gradient at ``density``, record how many
    elements were selected, optionally dump the raw gradient on rank 0,
    and launch allgathers for the selected values and their indexes."""
    begin = time.time()
    flat = p.data.view(-1)
    _, ctx, picked = self._compression.compress(flat, name, ratio=density)
    self._selected_num_gradients.append(int(ctx.numel()))
    if settings.LOGGING_GRADIENTS and rank() == 0:
        # Persist the full, uncompressed gradient for offline analysis.
        np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(),
                                              self.train_iter),
                flat.cpu().numpy())
    val_handle = allgather_async(picked, name)
    idx_handle = allgather_async(ctx.int(), name + '_indexes')
    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - begin)
    return (val_handle, idx_handle), ctx
def synchronize(self):
    """Wait for every outstanding collective, optionally clip by global
    norm, write results into the merged parameters, and (in merged-tensor
    mode) scatter them back into the per-layer ``.grad`` tensors."""
    for p, value in self._handles.items():
        name = self._merged_parameter_names.get(p)
        handle, ctx, density = value
        stime = time.time()
        # Block until the collective for this merged parameter completes.
        output = synchronize(handle)
        if self._profiling:
            utils.force_insert_item(self._allreduce_timers, name,
                                    time.time() - stime)
        stime = time.time()
        if self._norm_clip is not None:
            # Threshold is scaled by 1/sqrt(world_size); presumably to
            # account for averaging across workers — TODO confirm.
            norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
            norm_type = 2.0
            param_norm = output.norm(norm_type)
            total_norm = param_norm.item()
            clip_coef = norm_clip / (total_norm + 1e-6)
            if clip_coef < 1:
                output.mul_(clip_coef)
        # Overwrite the merged parameter's storage with the reduced tensor.
        p.set_(output)
        if self._profiling:
            utils.force_insert_item(self._update_times, name,
                                    time.time() - stime)
    if len(self._groups) != len(self._sequential_keys):
        # Merged-tensor mode: split each merged buffer back into the
        # original per-layer tensors and install them as gradients.
        for merged_p, value in self._handles.items():
            new_name = self._merged_parameter_names.get(merged_p)
            tensors = self._pull_from_buffer(new_name, merged_p)
            for n in tensors:
                p = self._named_parameters.get(n)
                if settings.FP16:
                    # Cast back to the gradient's dtype under FP16 training.
                    p.grad.set_(tensors[n].data.type(p.grad.type()))
                else:
                    p.grad.set_(tensors[n].data)
    self.train_iter += 1
    self._handles.clear()
    self._print_profiling()
def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p``'s flattened gradient — on the CPU when
    ``settings.CPU_COMPRESS`` is set — and launch asynchronous allgathers
    for the selected values and their indexes.

    Returns ``((values_handle, indexes_handle), ctx)``.
    """
    stime = time.time()
    tensor = p.data.view(-1)
    # The compressor tuning knobs are identical on both paths; build them once
    # instead of duplicating the long call (was repeated verbatim before).
    compress_kwargs = dict(ratio=density, tb=self._tb, i_ratio=self.iratio,
                           stages=self.stages, ec_grad_w=self.ec_gradw,
                           ec_mem_w=self.ec_memw)
    if settings.CPU_COMPRESS:
        # density may arrive negative on this path; abs() recovers the
        # ratio — TODO confirm the caller's sign convention.
        compress_kwargs['ratio'] = abs(density)
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor.to('cpu'), name, **compress_kwargs)
        # Move compressor outputs back to the GPU for communication.
        selected_values = selected_values.to('cuda')
        ctx = ctx.to('cuda')
    else:
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor, name, **compress_kwargs)
    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - stime)
    self._selected_num_gradients.append(int(ctx.numel()))
    handle = allgather_async(selected_values, name)
    handle_idx = allgather_async(ctx.int(), name + '_indexes')
    return (handle, handle_idx), ctx
def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p``'s flattened gradient — on the CPU when
    ``settings.CPU_COMPRESS`` is set — and launch asynchronous allgathers
    for the selected values and their indexes.

    Returns ``((values_handle, indexes_handle), ctx)``.

    NOTE(review): several commented-out experimental branches (warm-up via
    compressors['topkec'], gradient-memory accumulation, zero/periodic
    gradient dumping, and a dense allreduce path) were deleted; recover
    them from version control if they are still wanted.
    """
    stime = time.time()
    tensor = p.data.view(-1)
    if settings.CPU_COMPRESS:
        # density may arrive negative on this path; abs() recovers the
        # ratio — TODO confirm the caller's sign convention.
        density = abs(density)
        tensor_cpy = tensor.to('cpu')
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor_cpy, name, ratio=density, tb=self._tb, i_ratio=self.iratio,
            stages=self.stages, ec_grad_w=self.ec_gradw,
            ec_mem_w=self.ec_memw)
        # Move compressor outputs back to the GPU for communication.
        selected_values = selected_values.to('cuda')
        ctx = ctx.to('cuda')
    else:
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor, name, ratio=density, tb=self._tb, i_ratio=self.iratio,
            stages=self.stages, ec_grad_w=self.ec_gradw,
            ec_mem_w=self.ec_memw)
    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - stime)
    self._selected_num_gradients.append(int(ctx.numel()))
    indexes = ctx
    handle = allgather_async(selected_values, name)
    handle_idx = allgather_async(indexes.int(), name + '_indexes')
    return (handle, handle_idx), ctx
def synchronize(self):
    """Complete all outstanding collectives and install the results.

    Sparse tensors (``density < 1``) were sent as allgathers of (values,
    indexes) pairs: each worker's slice is decompressed and scatter-added
    into a zeroed gradient, then averaged. Dense tensors take the plain
    allreduce path with optional global-norm clipping.
    """
    num_of_workers = size()
    for p, value in self._handles.items():
        name = self._merged_parameter_names.get(p)
        handle, ctx, density = value
        if self._sparse and density < 1:
            stime = time.time()
            handle_idx = None
            all_indexes = None
            # Sparse sends return a (values_handle, indexes_handle) pair.
            if type(handle) is tuple:
                handle, handle_idx = handle[0], handle[1]
            output = synchronize(handle)
            if handle_idx is not None:
                all_indexes = synchronize(handle_idx)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name,
                                        time.time() - stime)
            stime = time.time()
            # Accumulate directly into the parameter's flattened storage.
            new_grad = p.data.view(-1)
            new_grad.fill_(0.0)
            numel = output.size(0)
            # Allgather concatenates equal-size slices, one per worker.
            real_num_values = numel // num_of_workers
            for i in range(num_of_workers):
                values_and_indexes = output.data[
                    i * real_num_values:(i + 1) * real_num_values]
                if all_indexes is None:
                    # No index set (e.g. quantization): decompress the whole
                    # slice and add it densely.
                    values = values_and_indexes
                    indexes = None
                    per_values = values
                    per_values = self._compression.decompress(
                        per_values, p.size())
                    new_grad += per_values.view(-1)
                else:
                    values = values_and_indexes
                    indexes = all_indexes.data[
                        i * real_num_values:(i + 1) * real_num_values].long()
                    # Slices may be padded; only the first indexes.numel()
                    # values are real — presumably, TODO confirm padding.
                    per_values = values[0:indexes.numel()]
                    per_values = self._compression.decompress(
                        per_values, p.size())
                    new_grad[indexes[0:indexes.numel()]] += per_values
            # Average the summed contributions across workers.
            new_grad /= num_of_workers
            if self._profiling:
                utils.force_insert_item(self._update_times, name,
                                        time.time() - stime)
        else:
            # Dense path: plain allreduce result.
            stime = time.time()
            output = synchronize(handle)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name,
                                        time.time() - stime)
            stime = time.time()
            if self._norm_clip is not None:
                # Clip threshold scaled by 1/sqrt(world_size).
                norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                norm_type = 2.0
                param_norm = output.norm(norm_type)
                total_norm = param_norm.item()
                clip_coef = norm_clip / (total_norm + 1e-6)
                if clip_coef < 1:
                    output.mul_(clip_coef)
            if self._compression:
                output = self._compression.decompress(output, p.size())
            p.set_(output)
            if self._profiling:
                utils.force_insert_item(self._update_times, name,
                                        time.time() - stime)
    if len(self._groups) != len(self._sequential_keys):
        # Merged-tensor mode: split merged buffers back into per-layer
        # tensors and install them as gradients.
        for merged_p, value in self._handles.items():
            new_name = self._merged_parameter_names.get(merged_p)
            tensors = self._pull_from_buffer(new_name, merged_p)
            for n in tensors:
                p = self._named_parameters.get(n)
                if self._fp16:
                    # Cast back to the gradient's dtype under FP16 training.
                    p.grad.set_(tensors[n].data.type(p.grad.type()))
                else:
                    p.grad.set_(tensors[n].data)
    self.train_iter += 1
    self._handles.clear()
    self._print_profiling()
def synchronize(self):
    """Complete all outstanding collectives, install results, and record
    compression-ratio / data-volume statistics to TensorBoard.

    Three paths per merged parameter, keyed on ``density``:
      * ``density < 1``  — sparse allgather; compressor decompresses
        directly into the gradient.
      * ``density == 1`` — dense allreduce with optional norm clipping.
      * ``density > 1``  — dense allgather averaged by hand.

    NOTE(review): the original formatting was lost; the placement of the
    ratio/volume accounting (inside the loop) and of the DISPLAY-gated
    TensorBoard logging (after the loop, using the last iteration's
    ``density``/``ratio``) is a reconstruction — confirm against history.
    """
    global SPEED
    num_of_workers = size()
    ratio = 0
    i = 0
    for p, value in self._handles.items():
        name = self._merged_parameter_names.get(p)
        handle, ctx, density = value
        if self._sparse and density < 1:
            stime = time.time()
            handle_idx = None
            all_indexes = None
            # Sparse sends return a (values_handle, indexes_handle) pair.
            if type(handle) is tuple:
                handle, handle_idx = handle[0], handle[1]
            output = synchronize(handle)
            if handle_idx is not None:
                all_indexes = synchronize(handle_idx)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name,
                                        time.time() - stime)
            stime = time.time()
            new_grad = p.data.view(-1)
            # The compressor owns reconstruction: it receives the gathered
            # values/indexes plus the worker count and writes the averaged
            # gradient.
            dectx = output, all_indexes, num_of_workers
            new_grad = self._compression.decompress(new_grad, dectx)
            if self._profiling:
                utils.force_insert_item(self._update_times, name,
                                        time.time() - stime)
        elif density == 1:
            # Dense allreduce path.
            stime = time.time()
            output = synchronize(handle)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name,
                                        time.time() - stime)
            stime = time.time()
            if self._norm_clip is not None:
                # Clip threshold scaled by 1/sqrt(world_size).
                norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                norm_type = 2.0
                param_norm = output.norm(norm_type)
                total_norm = param_norm.item()
                clip_coef = norm_clip / (total_norm + 1e-6)
                if clip_coef < 1:
                    output.mul_(clip_coef)
            p.set_(output)
            if self._profiling:
                utils.force_insert_item(self._update_times, name,
                                        time.time() - stime)
        elif density > 1:
            # allgather instead of allreduce of sparse tensor
            stime = time.time()
            output = synchronize(handle)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name,
                                        time.time() - stime)
            stime = time.time()
            new_grad = p.data.view(-1)
            new_grad.fill_(0.0)
            numel = output.size(0)
            # Allgather concatenates equal-size slices, one per worker;
            # sum them and average by hand.
            real_num_values = numel // num_of_workers
            for i in range(num_of_workers):
                values = output.data[
                    i * real_num_values:(i + 1) * real_num_values]
                new_grad += values
            new_grad /= num_of_workers
            if self._norm_clip is not None:
                norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                norm_type = 2.0
                param_norm = new_grad.norm(norm_type)
                total_norm = param_norm.item()
                clip_coef = norm_clip / (total_norm + 1e-6)
                if clip_coef < 1:
                    new_grad.mul_(clip_coef)
            p.set_(new_grad)
            if self._profiling:
                utils.force_insert_item(self._update_times, name,
                                        time.time() - stime)
        # Ahmed - track number of elments
        if ctx is not None:
            ratio += ctx.numel() / p.data.numel()
        else:
            ratio += 1
        self._avg_ratio += ratio
        self._num_avg_sample += 1
        if density < 1:
            # Volume for all-gather compression (data + indexes) - %TODO
            # should multiply (1-1/num-of-workers) (to remove portion of
            # local node)
            self._sum_volume += output.numel() * output.element_size(
            ) + all_indexes.numel() * all_indexes.element_size()
        elif density == 1:
            # Volume for all-reduce no-compression
            self._sum_volume += 2 * output.numel() * output.element_size()
        elif density == 2:
            # Volume for all-gather no compression (data) - %TODO should
            # multiply (1-1/num-of-workers) (to remove portion of local
            # node)
            self._sum_volume += output.numel() * output.element_size()
        self._num_vol_sample += 1
    if rank() == 0 and self.train_iter % settings.DISPLAY == 0:
        # Periodic TensorBoard reporting (rank 0 only).
        self._tb.log('datavol/cum_vol_bytes', self._sum_volume)
        self._tb.log('datavol/avg_vol_bytes',
                     self._sum_volume / self._num_vol_sample)
        if self._compression is not compressors['none']:  # and ratio > 0:
            # target_k = (self.model_elemnum * density)
            self._tb.log('compress/comp_ratio', ratio)
            self._tb.log('compress/est_compratio', ratio / density)
            self._tb.log('compress/avg_est_compratio',
                         (1.0 * self._avg_ratio / self._num_avg_sample) /
                         density)
            if self.stages < 0:
                self._tb.log('compress/num_stages',
                             self._compression.cur_stages)
            else:
                self._tb.log('compress/num_stages', self.stages)
            if self.stages == 0:
                self._tb.log('compress/first_ratio', self.iratio)
            else:
                self._tb.log('compress/first_ratio',
                             self._compression.first_ratio)
    self._num_sample = 0
    self._sum_elems = 0
    if len(self._groups) != len(self._sequential_keys):
        # Merged-tensor mode: split merged buffers back into per-layer
        # tensors and install them as gradients.
        for merged_p, value in self._handles.items():
            new_name = self._merged_parameter_names.get(merged_p)
            tensors = self._pull_from_buffer(new_name, merged_p)
            for n in tensors:
                p = self._named_parameters.get(n)
                p.grad.set_(tensors[n].data.type(p.grad.type()))
    self.train_iter += 1
    self._handles.clear()
    self._print_profiling()