Example #1
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor, name, ratio=density)

        # disabled debug path: dump raw gradients every 200 early iterations
        if False and rank() == 0 and self.train_iter % 200 == 0 and self.train_iter < 3000:
            grads = tensor.cpu().numpy()
            layer_idx = self._sequential_keys.index(name)
            np.save(
                '%s/r%d_gradients_iter_%d::%s::%d' %
                (self._gradient_path, rank(), self.train_iter, name,
                 layer_idx), grads)
        indexes = ctx
        if indexes is None:
            handle = allgather_async(tensor_compressed, name)
            handle_idx = None  # quantization uses all indices
        else:
            handle = allgather_async(selected_values, name)
            handle_idx = allgather_async(indexes.int(), name + '_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name,
                                    time.time() - stime)
        return (handle, handle_idx), ctx
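Example #1 (like most examples on this page) assumes a compressor whose compress(tensor, name, ratio) call returns (tensor_compressed, ctx, selected_values), with ctx carrying the indices of the kept entries. A minimal top-k sketch of that interface, for illustration only; the class name and return convention are assumptions, not the library's actual compressor:

    import torch

    class TopKCompressor:
        """Hypothetical top-k sparsifier matching the compress()
        signature assumed by the examples on this page."""

        def compress(self, tensor, name, ratio=0.01):
            numel = tensor.numel()
            k = max(1, int(numel * ratio))
            # keep the k largest-magnitude entries of the flat tensor
            _, indexes = torch.topk(tensor.abs(), k)
            selected_values = tensor[indexes]
            # callers treat ctx as the index tensor
            return tensor, indexes, selected_values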
Example #2
    def _sparse_allreduce_async(self, p, name):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor, name, ratio=self._density)
        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name + '_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name,
                                    time.time() - stime)
        return (handle, handle_idx), ctx
Example #3
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(tensor, name, ratio=density)
        self._selected_num_gradients.append(int(ctx.numel()))

        if settings.LOGGING_GRADIENTS and rank() == 0:
            grads = tensor.cpu().numpy()
            np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name+'_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name, time.time()-stime)
        return (handle, handle_idx), ctx 
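The (handle, handle_idx) pair returned above is only the send side; a matching wait step has to synchronize both gathers and scatter the values back into a dense buffer. A sketch of that step, assuming hvd.synchronize returns the per-rank selections concatenated (the same convention Example #5 relies on; the helper name is hypothetical):

    import horovod.torch as hvd

    def sparse_synchronize(handle, handle_idx, numel):
        values = hvd.synchronize(handle)
        indexes = hvd.synchronize(handle_idx).long()
        dense = values.new_zeros(numel)
        # entries selected by several ranks are summed, then averaged
        dense.scatter_add_(0, indexes, values)
        dense.div_(hvd.size())
        return dense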
Example #4
    def _allgather_grad_async(self, p, name):
        tensor = p.data.view(-1)
        # compression disabled here; the raw tensor is gathered as-is
        tensor_compressed, ctx = tensor, None  # self._compression.compress(tensor, name)
        if settings.LOGGING_GRADIENTS and rank() == 0:
            grads = tensor.cpu().numpy()
            np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        handle = allgather_async(tensor_compressed, name=name)
        return handle, ctx
Example #5
    def _allgather_factors(self):
        """Allgather the factors for all layers"""
        handles = []

        def _get_value_and_idx(sparse_tensor):
            tensor = sparse_tensor.data.view(-1)
            one_indexes = tensor != 0
            indexes = one_indexes.nonzero().data.squeeze().view(-1)
            values = tensor.data[indexes]
            return values, indexes.int()

        for i, m in enumerate(self.modules):
            module_name = self.module_names[i]

            A_values, A_indexes = _get_value_and_idx(self.m_A[m].data)
            A_value_name = module_name + '_A_value'
            A_idx_name = module_name + '_A_idx'
            h_value = allgather_async(A_values, A_value_name)
            h_idx = allgather_async(A_indexes, A_idx_name)

            G_values, G_indexes = _get_value_and_idx(self.m_G[m].data)
            G_value_name = module_name + '_G_value'
            G_idx_name = module_name + '_G_idx'
            h_value_G = allgather_async(G_values, G_value_name)
            h_idx_G = allgather_async(G_indexes, G_idx_name)
            handles.append((h_value, h_idx, h_value_G, h_idx_G))

        for i, handle in enumerate(handles):
            module_name = self.module_names[i]
            module = self.modules[i]
            m_A = self.m_A[module].view(-1)
            m_A.fill_(0.0)
            m_G = self.m_G[module].view(-1)
            m_G.fill_(0.0)

            h_value_A, h_idx_A, h_value_G, h_idx_G = handle
            A_values = hvd.synchronize(h_value_A)
            A_indexes = hvd.synchronize(h_idx_A).long()
            m_A.scatter_add_(0, A_indexes, A_values)
            m_A.div_(hvd.size())

            G_values = hvd.synchronize(h_value_G)
            G_indexes = hvd.synchronize(h_idx_G).long()
            m_G.scatter_add_(0, G_indexes, G_values)
            m_G.div_(hvd.size())
Example #6
    def _sparse_allreduce_async(self, p):
        name = self._parameter_names.get(p)
        tensor = p.grad.data.view(-1)
        # print('name: ', name, ' input shape: ', tensor.shape)
        ratio = 0.5  # fixed 50% sparsity for every layer
        tensor_compressed, ctx = self._compression.compress(tensor, name, ratio=ratio)
        indexes = ctx
        selected_values = tensor_compressed[indexes]
        # pack values and float-cast indices into one tensor, so a single allgather suffices
        communicated_tensor = torch.cat((selected_values, indexes.float()), 0)
        handle = allgather_async(communicated_tensor, name)
        return handle, ctx
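Because this variant packs values and float-cast indices into a single tensor before the allgather, the receive side has to split each rank's slice back apart. A sketch, assuming every rank contributes the same number k of entries (the helper name is hypothetical):

    import horovod.torch as hvd

    def unpack_gathered(gathered, k):
        per_rank = []
        for r in range(hvd.size()):
            # each rank's slice is k values followed by k indices
            chunk = gathered[r * 2 * k:(r + 1) * 2 * k]
            values, indexes = chunk[:k], chunk[k:].long()
            per_rank.append((indexes, values))
        return per_rank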
Example #7
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)

        if settings.CPU_COMPRESS:
            density = abs(density)
            tensor_cpy = tensor.to('cpu')
            tensor_compressed, ctx, selected_values = self._compression.compress(
                tensor_cpy,
                name,
                ratio=density,
                tb=self._tb,
                i_ratio=self.iratio,
                stages=self.stages,
                ec_grad_w=self.ec_gradw,
                ec_mem_w=self.ec_memw)
            selected_values = selected_values.to('cuda')
            ctx = ctx.to('cuda')
        else:
            tensor_compressed, ctx, selected_values = self._compression.compress(
                tensor,
                name,
                ratio=density,
                tb=self._tb,
                i_ratio=self.iratio,
                stages=self.stages,
                ec_grad_w=self.ec_gradw,
                ec_mem_w=self.ec_memw)
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name,
                                    time.time() - stime)

        self._selected_num_gradients.append(int(ctx.numel()))

        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name + '_indexes')
        return (handle, handle_idx), ctx
Example #8
    def forward(self, input, weight, bias, running_mean, running_var, eps,
                momentum):
        input = input.contiguous()

        size = input.numel() // input.size(1)
        count = torch.tensor([size])

        # calculate mean/invstd for input.
        mean, invstd = torch.batch_norm_stats(input, eps)

        count_handle = allgather_async(count.unsqueeze(0),
                                       name='sync_batch_norm.count')
        mean_handle = allgather_async(mean.unsqueeze(0),
                                      name='sync_batch_norm.mean')
        invstd_handle = allgather_async(invstd.unsqueeze(0),
                                        name='sync_batch_norm.invstd')

        # wait on the async communication to finish
        count_all = synchronize(count_handle)
        mean_all = synchronize(mean_handle)
        invstd_all = synchronize(invstd_handle)

        if _SYNC_BN_V2:
            counts_for_bngswc = count_all.view(-1).float().to(input.device)
        else:
            # backwards compatibility
            counts_for_bngswc = count_all.view(-1).tolist()

        # calculate global mean & invstd
        mean, invstd = torch.batch_norm_gather_stats_with_counts(
            input, mean_all, invstd_all, running_mean, running_var, momentum,
            eps, counts_for_bngswc)

        self.save_for_backward(input, weight, mean, invstd, count_all)

        # apply element-wise normalization
        return torch.batch_norm_elemt(input, weight, bias, mean, invstd, eps)
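The fused torch.batch_norm_gather_stats_with_counts call above folds the gathered per-rank statistics into global ones. The underlying arithmetic is a count-weighted mean plus the law of total variance; a plain-PyTorch sketch for a single channel (illustrative only; the fused kernel also updates the running statistics with momentum):

    import torch

    def combine_stats(means, invstds, counts, eps):
        # means, invstds, counts: 1-D tensors with one entry per rank
        counts = counts.float()
        total = counts.sum()
        global_mean = (counts * means).sum() / total
        # invert invstd = 1/sqrt(var + eps) to recover per-rank variance
        variances = 1.0 / invstds.pow(2) - eps
        # within-rank variance plus between-rank spread of the means
        global_var = (counts * (variances + (means - global_mean).pow(2))).sum() / total
        return global_mean, 1.0 / torch.sqrt(global_var + eps)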
Example #9
    def _broadcast_grad_async(self, p):
        name = self._parameter_names.get(p)
        compressor = self._compressors.get(p)
        tensor = p.grad
        ctx = None

        if compressor is None:
            tensor_compressed, ctx = self._compression.compress(tensor)
            handles = [allreduce_async_(tensor_compressed, average=True, name=name)]
        else:
            tensors_compressed = compressor.compress(tensor)
            handles = [allgather_async(t, name=name+" "+str(i))
                       for i, t in enumerate(tensors_compressed)]

        return handles, ctx
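This method returns either a single allreduce handle or a list of allgather handles, so the wait side has to branch the same way. A sketch of that counterpart, assuming the per-parameter compressor exposes a matching decompress for the gathered parts (Horovod's built-in Compression objects do take decompress(tensor, ctx)):

    import horovod.torch as hvd

    def _broadcast_grad_synchronize(self, p, handles, ctx):
        compressor = self._compressors.get(p)
        if compressor is None:
            # single averaged allreduce result
            output = hvd.synchronize(handles[0])
            p.grad.set_(self._compression.decompress(output, ctx))
        else:
            # one gathered tensor per compressed part (decompress assumed)
            outputs = [hvd.synchronize(h) for h in handles]
            p.grad.set_(compressor.decompress(outputs))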
Example #10
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        #if self.train_iter < 100:
        #    tensor_compressed, ctx, selected_values = compressors['topkec'].compress(tensor, name, ratio=density, tb=self._tb)
        #else:
        if settings.CPU_COMPRESS:
            density = abs(density)
            tensor_cpy = tensor.to('cpu')
            tensor_compressed, ctx, selected_values = self._compression.compress(tensor_cpy, name, ratio=density, tb=self._tb, i_ratio=self.iratio, stages=self.stages, ec_grad_w=self.ec_gradw, ec_mem_w=self.ec_memw)
            selected_values = selected_values.to('cuda')
            ctx = ctx.to('cuda')
        else:
            tensor_compressed, ctx, selected_values = self._compression.compress(tensor, name, ratio=density, tb=self._tb, i_ratio=self.iratio, stages=self.stages, ec_grad_w=self.ec_gradw, ec_mem_w=self.ec_memw)
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name, time.time() - stime)

        # #Ahmed - Add the grads to memory
        # if settings.UPDATE_ITER:
        #     self._grad_memory[name] += (tensor - tensor_compressed)
        #     self._grad_count += 1

        self._selected_num_gradients.append(int(ctx.numel()))

        # if settings.LOGGING_GRADIENTS and ctx.numel() == 0 and rank() == 0:
        #     grads = tensor.cpu().numpy()
        #     np.save('%s/r%d_zero_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        # elif settings.LOGGING_GRADIENTS and settings.UPDATE_ITER and self.train_iter % settings.UPDATE_ITER == 0 and rank() == 0:
        #     grads = tensor.cpu().numpy()
        #     np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        #handle = allreduce_async_(tensor_compressed, average=True, name=name)

        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name+'_indexes')
        return (handle, handle_idx), ctx