def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p.data`` and start the allgather(s) that exchange it.

    The flattened tensor is handed to ``self._compression.compress`` with
    ``ratio=density``.  The second value returned by the compressor
    (``ctx``) is used as the selected indices:

    * ``ctx is None``: the whole compressed tensor is gathered under
      ``name`` and no index gather is started.
    * otherwise: the selected values are gathered under ``name`` and the
      indices (cast with ``.int()``) under ``name + '_indexes'``.

    When ``self._profiling`` is truthy, the elapsed wall-clock time of the
    whole call is recorded in ``self._compression_timers`` under ``name``.

    Args:
        p: Object whose ``.data`` tensor is compressed.
        name: Key passed to the compressor and used to label the gathers.
        density: Forwarded to the compressor as ``ratio``.

    Returns:
        ``((handle, handle_idx), ctx)`` where ``handle_idx`` is ``None``
        when ``ctx`` is ``None``.
    """
    start = time.time()
    flat = p.data.view(-1)
    tensor_compressed, ctx, selected_values = self._compression.compress(
        flat, name, ratio=density)

    if ctx is None:
        handle = allgather_async(tensor_compressed, name)
        handle_idx = None  # quantization uses all indices
    else:
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(ctx.int(), name + '_indexes')

    if self._profiling:
        utils.force_insert_item(self._compression_timers, name,
                                time.time() - start)
    return (handle, handle_idx), ctx
def _sparse_allreduce_async(self, p, name):
    """Compress ``p.data`` at ``self._density`` and start two allgathers.

    The flattened tensor is passed to ``self._compression.compress`` with
    ``ratio=self._density``.  The selected values are gathered under
    ``name`` and the compressor's context (used as indices, cast with
    ``.int()``) under ``name + '_indexes'``.

    When ``self._profiling`` is truthy, the elapsed wall-clock time of the
    whole call is recorded in ``self._compression_timers`` under ``name``.

    Args:
        p: Object whose ``.data`` tensor is compressed.
        name: Key passed to the compressor and used to label the gathers.

    Returns:
        ``((handle, handle_idx), ctx)``.
    """
    started_at = time.time()
    flat = p.data.view(-1)

    # The first return value (the compressed tensor) is not needed here.
    _, ctx, picked_values = self._compression.compress(
        flat, name, ratio=self._density)

    value_handle = allgather_async(picked_values, name)
    index_handle = allgather_async(ctx.int(), name + '_indexes')

    if self._profiling:
        elapsed = time.time() - started_at
        utils.force_insert_item(self._compression_timers, name, elapsed)

    return (value_handle, index_handle), ctx
def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p.data``, optionally dump it, and start two allgathers.

    Steps, in order:

    1. Flatten ``p.data`` and compress it with ``ratio=density``.
    2. Append the number of elements in the compressor context to
       ``self._selected_num_gradients``.
    3. If ``settings.LOGGING_GRADIENTS`` is truthy and ``rank() == 0``,
       save the *uncompressed* flattened tensor with ``np.save``.
    4. Gather the selected values under ``name`` and the context (used as
       indices, cast with ``.int()``) under ``name + '_indexes'``.
    5. If ``self._profiling`` is truthy, record the elapsed time of the
       whole call in ``self._compression_timers`` under ``name``.

    Args:
        p: Object whose ``.data`` tensor is compressed.
        name: Key passed to the compressor and used to label the gathers.
        density: Forwarded to the compressor as ``ratio``.

    Returns:
        ``((handle, handle_idx), ctx)``.
    """
    started_at = time.time()
    flat = p.data.view(-1)

    # The first return value (the compressed tensor) is not needed here.
    _, ctx, picked_values = self._compression.compress(
        flat, name, ratio=density)
    self._selected_num_gradients.append(int(ctx.numel()))

    if settings.LOGGING_GRADIENTS and rank() == 0:
        snapshot = flat.cpu().numpy()
        # NOTE(review): the path holds only rank and iteration, not `name`;
        # if this runs for several tensors in one iteration they would all
        # be written to the same path -- confirm that is intended.
        np.save(
            '%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter),
            snapshot)

    value_handle = allgather_async(picked_values, name)
    index_handle = allgather_async(ctx.int(), name+'_indexes')

    if self._profiling:
        elapsed = time.time() - started_at
        utils.force_insert_item(self._compression_timers, name, elapsed)

    return (value_handle, index_handle), ctx
def _allgather_grad_async(self, p, name):
    """Start an allgather of the flattened, uncompressed ``p.data``.

    No compression is applied: the tensor is sent as-is and the returned
    context is always ``None``.  If ``settings.LOGGING_GRADIENTS`` is
    truthy and ``rank() == 0``, the flattened tensor is first saved with
    ``np.save``.

    Args:
        p: Object whose ``.data`` tensor is gathered.
        name: Name passed to ``allgather_async``.

    Returns:
        ``(handle, None)``.
    """
    flat = p.data.view(-1)
    # Compression is bypassed here, so there is no compressor context.
    ctx = None

    if settings.LOGGING_GRADIENTS and rank() == 0:
        snapshot = flat.cpu().numpy()
        # NOTE(review): the path holds only rank and iteration, not `name`;
        # several calls in one iteration would share a path -- confirm.
        np.save(
            '%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter),
            snapshot)

    return allgather_async(flat, name=name), ctx
def _allgather_factors(self):
    """Allgather the factors for all layers.

    Runs in two phases over ``self.modules``:

    1. For every module, the non-zero entries of ``self.m_A[m]`` and
       ``self.m_G[m]`` (values plus flat positions) are sent with four
       ``allgather_async`` calls labelled ``<module_name>_A_value``,
       ``_A_idx``, ``_G_value`` and ``_G_idx``.
    2. For every module, both factors are zeroed in place, the gathered
       values are scatter-added at the gathered positions, and the result
       is divided in place by ``hvd.size()``.

    The factors are modified in place through flattened views; nothing is
    returned.
    """
    def _nonzero_entries(factor):
        # Values and flat positions (as int) of the non-zero elements.
        flat = factor.data.view(-1)
        positions = (flat != 0).nonzero().data.squeeze().view(-1)
        return flat.data[positions], positions.int()

    def _start_gather(factor, value_name, index_name):
        # Launch the value gather first, then the index gather.
        values, positions = _nonzero_entries(factor)
        value_handle = allgather_async(values, value_name)
        index_handle = allgather_async(positions, index_name)
        return value_handle, index_handle

    def _accumulate_mean(flat, value_handle, index_handle):
        # Wait for both gathers, sum into `flat`, then divide by hvd.size().
        values = hvd.synchronize(value_handle)
        positions = hvd.synchronize(index_handle).long()
        flat.scatter_add_(0, positions, values)
        flat.div_(hvd.size())

    # Phase 1: launch every gather before waiting on any of them.
    pending = []
    for idx, module in enumerate(self.modules):
        layer = self.module_names[idx]
        a_value_handle, a_index_handle = _start_gather(
            self.m_A[module].data, layer + '_A_value', layer + '_A_idx')
        g_value_handle, g_index_handle = _start_gather(
            self.m_G[module].data, layer + '_G_value', layer + '_G_idx')
        pending.append(
            (a_value_handle, a_index_handle, g_value_handle, g_index_handle))

    # Phase 2: rebuild each factor from the gathered sparse pieces.
    for module, handles in zip(self.modules, pending):
        a_value_handle, a_index_handle, g_value_handle, g_index_handle = handles

        flat_a = self.m_A[module].view(-1)
        flat_a.fill_(0.0)
        flat_g = self.m_G[module].view(-1)
        flat_g.fill_(0.0)

        _accumulate_mean(flat_a, a_value_handle, a_index_handle)
        _accumulate_mean(flat_g, g_value_handle, g_index_handle)
def _sparse_allreduce_async(self, p, density=0.5):
    """Compress ``p.grad`` and start one allgather carrying values + indices.

    The parameter's name is looked up in ``self._parameter_names``.  The
    flattened gradient is compressed with ``ratio=density``; the
    compressor's context is used as the selected indices.  The selected
    values and the indices (cast to float) are concatenated into a single
    1-D tensor -- values first, indices second -- and gathered under the
    parameter's name.

    Args:
        p: Parameter whose ``.grad.data`` is compressed.
        density: Forwarded to the compressor as ``ratio``.  Defaults to
            ``0.5``, the value that was previously hard-coded.

    Returns:
        ``(handle, ctx)``.
    """
    name = self._parameter_names.get(p)
    flat_grad = p.grad.data.view(-1)
    tensor_compressed, ctx = self._compression.compress(
        flat_grad, name, ratio=density)

    indexes = ctx
    selected_values = tensor_compressed[indexes]

    # Values and indices share one message, so the indices are sent as floats.
    # NOTE(review): `.float()` is float32, which represents integers exactly
    # only up to 2**24; confirm tensors stay below that size, or change the
    # encoding on both the sending and receiving side.
    payload = torch.cat((selected_values, indexes.float()), 0)
    handle = allgather_async(payload, name)
    return handle, ctx
def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p.data`` (optionally on CPU) and start two allgathers.

    When ``settings.CPU_COMPRESS`` is truthy, ``density`` is replaced by
    its absolute value, the flattened tensor is moved to ``'cpu'`` before
    compression, and the selected values and context are moved to
    ``'cuda'`` afterwards.  Otherwise the tensor is compressed where it
    is and ``density`` is passed through unchanged.

    When ``self._profiling`` is truthy, the time spent up to the end of
    compression (including those device moves) is recorded in
    ``self._compression_timers`` under ``name``; the gathers are started
    after that measurement.  The number of elements in the context is
    appended to ``self._selected_num_gradients``.

    Args:
        p: Object whose ``.data`` tensor is compressed.
        name: Key passed to the compressor and used to label the gathers.
        density: Forwarded to the compressor as ``ratio``.

    Returns:
        ``((handle, handle_idx), ctx)``; values are gathered under
        ``name`` and ``ctx.int()`` under ``name + '_indexes'``.
    """
    started_at = time.time()
    flat = p.data.view(-1)

    compress_on_cpu = settings.CPU_COMPRESS
    if compress_on_cpu:
        density = abs(density)
        source = flat.to('cpu')
    else:
        source = flat

    # The first return value (the compressed tensor) is not needed here.
    _, ctx, picked_values = self._compression.compress(
        source, name,
        ratio=density,
        tb=self._tb,
        i_ratio=self.iratio,
        stages=self.stages,
        ec_grad_w=self.ec_gradw,
        ec_mem_w=self.ec_memw)

    if compress_on_cpu:
        # Results were produced on the CPU; move them back for the gathers.
        picked_values = picked_values.to('cuda')
        ctx = ctx.to('cuda')

    if self._profiling:
        elapsed = time.time() - started_at
        utils.force_insert_item(self._compression_timers, name, elapsed)

    self._selected_num_gradients.append(int(ctx.numel()))

    value_handle = allgather_async(picked_values, name)
    index_handle = allgather_async(ctx.int(), name + '_indexes')
    return (value_handle, index_handle), ctx
def forward(self, input, weight, bias, running_mean, running_var, eps, momentum):
    """Normalize ``input`` using batch statistics gathered from all workers.

    Local mean / inverse-std are computed with ``torch.batch_norm_stats``,
    then the local element count, mean and inverse-std are each
    allgathered.  The gathered statistics are combined with
    ``torch.batch_norm_gather_stats_with_counts`` (which is also given
    ``running_mean``, ``running_var`` and ``momentum``), the tensors needed
    later are stored via ``self.save_for_backward``, and the element-wise
    normalization is applied with ``torch.batch_norm_elemt``.

    Returns:
        The result of ``torch.batch_norm_elemt``.
    """
    input = input.contiguous()

    # Element count divided by the size of dim 1 (presumably the channel
    # dimension -- confirm against the caller).
    local_count = torch.tensor([input.numel() // input.size(1)])

    # Statistics of this worker's batch only.
    local_mean, local_invstd = torch.batch_norm_stats(input, eps)

    # Launch all three gathers before blocking on any of them.
    pending = [
        allgather_async(local_count.unsqueeze(0), name='sync_batch_norm.count'),
        allgather_async(local_mean.unsqueeze(0), name='sync_batch_norm.mean'),
        allgather_async(local_invstd.unsqueeze(0), name='sync_batch_norm.invstd'),
    ]
    count_all, mean_all, invstd_all = [synchronize(h) for h in pending]

    flat_counts = count_all.view(-1)
    if not _SYNC_BN_V2:
        # backwards compatibility: pass the counts as a Python list
        counts_arg = flat_counts.tolist()
    else:
        counts_arg = flat_counts.float().to(input.device)

    # Combine the per-worker statistics into global mean / inverse-std.
    global_mean, global_invstd = torch.batch_norm_gather_stats_with_counts(
        input, mean_all, invstd_all,
        running_mean, running_var,
        momentum, eps, counts_arg)

    self.save_for_backward(input, weight, global_mean, global_invstd, count_all)

    return torch.batch_norm_elemt(input, weight, bias, global_mean, global_invstd, eps)
def _broadcast_grad_async(self, p):
    """Start the collective operation(s) that exchange ``p.grad``.

    Two paths, chosen by whether ``self._compressors`` has an entry for
    ``p``:

    * No entry: the gradient is compressed with ``self._compression`` and
      a single in-place averaging allreduce is started under the
      parameter's name.  The compressor's context is returned.
    * Entry present: that compressor's ``compress`` yields several
      tensors; each is allgathered under ``name + " " + str(i)``.  The
      returned context is ``None``.

    Args:
        p: Parameter whose ``.grad`` is exchanged.

    Returns:
        ``(handles, ctx)`` where ``handles`` is a list.
    """
    name = self._parameter_names.get(p)
    per_param_compressor = self._compressors.get(p)
    grad = p.grad

    if per_param_compressor is not None:
        handles = []
        for i, piece in enumerate(per_param_compressor.compress(grad)):
            handles.append(allgather_async(piece, name=name+" "+str(i)))
        return handles, None

    compressed, ctx = self._compression.compress(grad)
    return [allreduce_async_(compressed, average=True, name=name)], ctx
def _sparse_allreduce_async(self, p, name, density):
    """Compress ``p.data`` (optionally on CPU) and start two allgathers.

    With ``settings.CPU_COMPRESS`` truthy, ``abs(density)`` is used as the
    ratio, the flattened tensor is moved to ``'cpu'`` for compression, and
    the selected values and context are moved to ``'cuda'`` afterwards.
    Otherwise compression runs on the tensor as-is with ``density``
    unchanged.

    If ``self._profiling`` is truthy, the time taken up to the end of
    compression (device moves included, gathers excluded) is recorded in
    ``self._compression_timers`` under ``name``.  The context's element
    count is appended to ``self._selected_num_gradients``.

    Args:
        p: Object whose ``.data`` tensor is compressed.
        name: Key passed to the compressor and used to label the gathers.
        density: Forwarded to the compressor as ``ratio``.

    Returns:
        ``((handle, handle_idx), ctx)``; values are gathered under
        ``name`` and ``ctx.int()`` under ``name + '_indexes'``.
    """
    t0 = time.time()
    flat = p.data.view(-1)
    compressor = self._compression

    def _compress(source, ratio):
        # Single place that forwards the tuning attributes to the compressor.
        return compressor.compress(
            source, name, ratio=ratio, tb=self._tb, i_ratio=self.iratio,
            stages=self.stages, ec_grad_w=self.ec_gradw, ec_mem_w=self.ec_memw)

    if settings.CPU_COMPRESS:
        density = abs(density)
        host_copy = flat.to('cpu')
        _, ctx, values = _compress(host_copy, density)
        # Compression ran on the CPU; bring the results back for the gathers.
        values = values.to('cuda')
        ctx = ctx.to('cuda')
    else:
        _, ctx, values = _compress(flat, density)

    if self._profiling:
        utils.force_insert_item(self._compression_timers, name, time.time() - t0)

    self._selected_num_gradients.append(int(ctx.numel()))

    handle = allgather_async(values, name)
    handle_idx = allgather_async(ctx.int(), name+'_indexes')
    return (handle, handle_idx), ctx