Example #1
 def _allreduce_grad_async(self, p, name):
     tensor = p.data.view(-1)
     # Disabled debug branch: would periodically dump gradients to disk on rank 0.
     if False and rank() == 0 and self.train_iter % 200 == 0 and self.train_iter < 3000:
         grads = tensor.cpu().numpy()
         layer_idx = self._sequential_keys.index(name)
         np.save(
             '%s/r%d_gradients_iter_%d::%s::%d' %
             (self._gradient_path, rank(), self.train_iter, name,
              layer_idx), grads)
     allreduce_name = name
     if len(name) > 200:
         allreduce_name = name[0:100] + '...' + name[-100:]
     handle = allreduce_async_(tensor, average=True, name=allreduce_name)
     return handle, None
Example #2
 def _allreduce_grad_async(self, p, name):
     tensor = p.data.view(-1)
     tensor_compressed, ctx = tensor, None  #self._compression.compress(tensor, name)
     if settings.LOGGING_GRADIENTS and rank() == 0 and self.train_iter in [
             0, 10, 100, 200, 300, 500, 700, 900, 2000, 3000, 4000, 5000,
             6000, 7000, 8000, 9000
     ]:
         grads = tensor.cpu().numpy()
         np.save(
             '%s/r%d_gradients_iter_%d' %
             (self._gradient_path, rank(), self.train_iter), grads)
         if self.train_iter == 9000:
             exit()
     handle = allreduce_async_(tensor_compressed, average=True, name=name)
     return handle, ctx
Example #3
    def _broadcast_grad_async(self, p):
        name = self._parameter_names.get(p)
        compressor = self._compressors.get(p)
        tensor = p.grad
        ctx = None

        if compressor is None:
            tensor_compressed, ctx = self._compression.compress(tensor)
            handles = [allreduce_async_(tensor_compressed, average=True, name=name)]
        else:
            tensors_compressed = compressor.compress(tensor)
            handles = [allgather_async(t, name=name+" "+str(i))
                       for i, t in enumerate(tensors_compressed)]

        return handles, ctx
Example #4
    def _allreduce_grad_async(self, p):
        name = self._parameter_names.get(p)
        tensor = p.grad
        tensor_compressed, ctx = self._compression.compress(tensor)

        if self.op == Average:
            # Split average operation across pre/postscale factors
            # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average.
            prescale_factor = 1.0 / self.gradient_predivide_factor
            postscale_factor = self.gradient_predivide_factor
        else:
            prescale_factor = 1.0
            postscale_factor = 1.0

        handle = allreduce_async_(tensor_compressed, name=name, op=self.op,
                                  prescale_factor=prescale_factor,
                                  postscale_factor=postscale_factor)
        return handle, ctx
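
Note on Example #4: with op == Average the backend folds an extra 1 / size() into postscale_factor, so the prescale/postscale pair above still yields a plain average of the per-rank gradients. A quick NumPy sketch of that arithmetic (the rank count and gradients below are made up for illustration):

    import numpy as np

    size = 4                               # assumed number of ranks
    predivide = 2.0                        # assumed gradient_predivide_factor
    grads = [np.array([1.0, 2.0]) * (i + 1) for i in range(size)]

    prescale_factor = 1.0 / predivide
    postscale_factor = predivide / size    # backend adds the 1 / size() for Average

    reduced = postscale_factor * sum(prescale_factor * g for g in grads)
    assert np.allclose(reduced, sum(grads) / size)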
Example #5
    def _allreduce_grad_async(self, p):
        # Delta optimizer implements this logic:
        #  start = current.copy()
        #  step() -> computes 'current - \alpha.f(g)' where f is
        #            optimizer logic and g is the gradient
        #  delta = current-start
        #  allreduce_(delta)
        #  start += delta
        #  current = start
        # To support this logic via a function hook (for performance), we do:
        # delta = (start - \alpha.f(g)) - start
        #       = -\alpha.f(g)
        # set start to zero and step computes -\alpha.f(g)
        # where f is the underlying optimizer logic

        name = self._parameter_names.get(p)
        start = self._starting_models[p]

        stashed_params = []
        for group in self.param_groups:
            stashed_params.append(group['params'])
            # only want to step on p
            if any([p is v for v in group['params']]):
                group['params'] = [p]
            else:
                group['params'] = []

        start.data.copy_(p)

        super(self.__class__, self).step()

        # compute delta = curr - start
        p.data.sub_(start)

        # allreduce as before
        tensor_compressed, ctx = self._compression.compress(p)
        handle = allreduce_async_(tensor_compressed.data, name=name, op=Adasum)

        # reset stashed parameters
        for stashed, group in zip(stashed_params, self.param_groups):
            group['params'] = stashed

        return handle, ctx
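
The delta trick described in the comment of Example #5 can be checked with plain torch.optim.SGD: after copying the parameter into start and stepping only that parameter, subtracting start leaves exactly -lr * f(g), which is the delta the Adasum allreduce then combines. A minimal sketch (single parameter, made-up values):

    import torch

    p = torch.nn.Parameter(torch.randn(3))
    opt = torch.optim.SGD([p], lr=0.1)

    p.grad = torch.ones_like(p)      # stand-in for a gradient from backward()
    start = p.detach().clone()       # start = current.copy()

    opt.step()                       # current = start - lr * f(g)
    delta = p.detach() - start       # delta  = -lr * f(g)

    assert torch.allclose(delta, -0.1 * torch.ones_like(p))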
Example #6
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            if p_size < 1024:
                handle = allreduce_async_(p.grad.data, average=True, name=name)
            else:
                masks = [torch.zeros(p.size()).cuda()]
                masks, compressed_val, compressed_idx = select_top_k_appr(
                    p.grad, 0.001, masks)
                msg = torch.cat([
                    compressed_idx.type('torch.cuda.FloatTensor'),
                    compressed_val
                ])
                # msg_size is undefined in the original snippet; it is presumably
                # the per-rank number of selected elements.
                msg_size = compressed_val.numel()
                handle = hvd.allgather_async(msg)
                for node_idx in range(hvd.size()):
                    p.grad.data[msg[node_idx*msg_size*2 : node_idx*msg_size*2 + msg_size].type('torch.cuda.LongTensor')] += \
                        msg[node_idx*msg_size*2 + msg_size : node_idx*msg_size*2 + 2*msg_size]

            self._handles[p] = handle
Example #7
    def _allreduce_grad_async(self, p):
        if p.grad is None:
            # Gradient was not computed, but we still need to submit a tensor to allreduce
            # as one of the other ranks may have computed it (due to dynamic forward functions).
            #
            # NOTE: this will not work if the gradient is sparse and we perform an allgather.
            # Unfortunately, there doesn't appear to be a good way to detect that the parameter will
            # produce sparse gradients before computing the gradient.
            p.grad = p.data.new(p.size()).zero_()

        name = self._parameter_names.get(p)
        tensor = p.grad

        if p.grad.is_sparse:
            if self.sparse_as_dense:
                tensor = tensor.to_dense()
            else:
                return self._sparse_allreduce_grad_async(p, name)

        tensor_compressed, ctx = self._compression.compress(tensor)

        if self.op == Average:
            # Split average operation across pre/postscale factors
            # C++ backend will apply additional 1 / size() factor to postscale_factor for op == Average.
            prescale_factor = 1.0 / self.gradient_predivide_factor
            postscale_factor = self.gradient_predivide_factor
        else:
            prescale_factor = 1.0
            postscale_factor = 1.0

        handle = allreduce_async_(tensor_compressed,
                                  name=name,
                                  op=self.op,
                                  prescale_factor=prescale_factor,
                                  postscale_factor=postscale_factor,
                                  process_set=self.process_set)
        return handle, ctx
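
Example #7 is what a wrapped optimizer calls per parameter; users normally reach it only through hvd.DistributedOptimizer. A sketch of the usual wiring with the standard horovod.torch API (the model and hyperparameters here are placeholders):

    import torch
    import horovod.torch as hvd

    hvd.init()
    if torch.cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())

    model = torch.nn.Linear(10, 2)                       # placeholder model
    opt = torch.optim.SGD(model.parameters(), lr=0.01 * hvd.size())

    # The wrapper registers per-parameter grad hooks that end up calling
    # _allreduce_grad_async and synchronizes the handles in step().
    opt = hvd.DistributedOptimizer(opt,
                                   named_parameters=model.named_parameters(),
                                   compression=hvd.Compression.none)

    # Keep all ranks consistent before training starts.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(opt, root_rank=0)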
Example #8
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()
            #if self._use_allgather and p_size > 1024 and len(p.size()) == 4:
            if self._use_allgather:
                # fjr compress grad
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    self._V[name] = self._V[name] + self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    self._V[name] = self._V[name] + self._U[name]
                torch.cuda.synchronize()
                begin_select_time = time.time()

                compressed_idx = []
                len_p = len(p)
                chunk_size = len_p // 10
                if self._offset[name] + 2 * chunk_size > len_p:
                    compressed_idx = range(self._offset[name], len_p)
                    self._offset[name] = 0
                else:
                    compressed_idx = range(self._offset[name],
                                           self._offset[name] + chunk_size)
                    self._offset[name] += chunk_size

                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time

                self._masks[name].zero_()
                self._masks[name][compressed_idx] = 1.0
                self._masks[name] = 1.0 - self._masks[name]

                if self._debug:
                    self._v_ref[name] = self._V[name] * (1.0 -
                                                         self._masks[name])
                    allreduce_(self._v_ref[name], average=False)

                #self._V[name] = self._V[name] * (1 - self._masks[name])
                #self._U[name] = self._U[name] * (1 - self._masks[name])
                self._V[name].mul_(self._masks[name])
                self._U[name].mul_(self._masks[name])

                torch.cuda.synchronize()
                begin_pack_time = time.time()

                p.grad.zero_()
                p.grad.data[compressed_idx] = self._V[name][compressed_idx]
                handle = allreduce_async_(p.grad.data[compressed_idx],
                                          average=False,
                                          name=name)
                self._handles[p] = handle

                torch.cuda.synchronize()
                self.pack_time += time.time() - begin_pack_time
            else:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    self._V[name] = self._V[name] + self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    self._V[name] = self._V[name] + self._U[name]
                p.grad.data = self._V[name]
                #compressed_msg = torch.randn(100).cuda()
                #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                handle = allreduce_async_(p.grad.data, average=True, name=name)
                self._handles[p] = handle

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
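
The hook in Example #8 does not rank gradient magnitudes at all; it cycles through fixed chunks of the flattened tensor, reducing a different tenth of the parameter on each iteration. The chunk bookkeeping from that hook, pulled out as a tiny helper for clarity (the helper name is illustrative):

    def next_chunk(offset, length, num_chunks=10):
        # Cycle through the flattened tensor one chunk per call; the final call
        # of a cycle absorbs the remainder so no element is skipped.
        chunk = length // num_chunks
        if offset + 2 * chunk > length:
            idx = range(offset, length)
            offset = 0
        else:
            idx = range(offset, offset + chunk)
            offset += chunk
        return idx, offset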
Example #9
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time =  time.time()

            if self._use_allgather and p_size > 1024:
                weight_decay = self._weight_decay #group['weight_decay']
                momentum = self._momentum #group['momentum']
                dampening = 0.0 #group['dampening']
                nesterov = False #group['nesterov']
                d_p = p.grad.data
                d_p.div_(hvd.size())
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p)
                        #buf.mul_(momentum).add_(1 - dampening, d_p)
                    #TODO
                    # if nesterov:
                    #     d_p = d_p.add(momentum, buf)
                    # else:
                    #     d_p = buf
                if 'residue_buffer' not in param_state:
                    rsd = param_state['residue_buffer'] = torch.zeros_like(p.data)
                    rsd.add_(param_state['momentum_buffer'])

                    if self._use_nesterov:
                        rsd  = rsd.add(momentum, d_p)
                else:
                    rsd = param_state['residue_buffer']
                    rsd.add_(param_state['momentum_buffer'])
                    if self._use_nesterov:
                        rsd  = rsd.add(momentum, d_p)

                compressed_val = []
                compressed_idx = []

                torch.cuda.synchronize()
                begin_select_time =  time.time()
                if 'interval' not in param_state:
                    param_state['interval'] = 1
                it = 0
                sparsity = 0.0
                if param_state['interval'] == 1:
                    compressed_val, compressed_idx, it, _, sparsity = \
                            select_bs_top(param_state['residue_buffer'], 0.001)
                    param_state['interval'] = 0
                else:
                    compressed_val, compressed_idx, it, _, sparsity = \
                            select_bs_bottom(param_state['residue_buffer'], 0.001)
                    param_state['interval'] = 1
                assert(len(compressed_idx) > 0)
                torch.cuda.synchronize()
                end_select_time =  time.time()
                self.select_time += end_select_time - begin_select_time

                masks_size = self._masks[name].size()
                self._masks[name].zero_()
                self._masks[name] = self._masks[name].view(-1)
                self._masks[name][compressed_idx] = 1.0

                self._masks[name] = 1.0 - self._masks[name]
                self._masks[name] = self._masks[name].view(masks_size)

                if self._debug:
                    self._v_ref[name] = torch.mean(compressed_val) \
                            * (1.0 - self._masks[name])
                    allreduce_(self._v_ref[name], average = False)

                if hvd.size() == 1:
                    p.grad.data = torch.mean(compressed_val) \
                            * (1.0 - self._masks[name])

                param_state['residue_buffer'].mul_(self._masks[name])
                param_state['momentum_buffer'].mul_(self._masks[name])

                torch.cuda.synchronize()
                begin_pack_time =  time.time()
                compressed_msg = []

                if hvd.size() > 1:
                    if self._use_gpu:
                        compressed_msg= torch.cat((\
                                torch.tensor([len(compressed_idx)]).type(torch.cuda.LongTensor),\
                                compressed_idx))

                    handle = _allgather_async(compressed_msg, self._compressed_idx[name], name=name + "idx")
                    self._handles[p] = handle

                    handle = _allgather_async(torch.mean(compressed_val), self._compressed_val[name], name=name + "val")
                    self._handles_val[p] = handle

                torch.cuda.synchronize()
                self.pack_time += time.time() - begin_pack_time

            else:
                weight_decay = self._weight_decay #group['weight_decay']
                momentum = self._momentum #group['momentum']
                dampening = 0.0 #group['dampening']
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if self._use_nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                #compressed_msg = torch.randn(100).cuda()
                #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                if hvd.size() > 1:
                    handle = allreduce_async_(p.grad.data, average=True, name=name)
                    self._handles[p] = handle
            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
Example #10
 def hook(*ignore):
     assert p not in self._handles
     assert not p.grad.requires_grad
     name = self._parameter_names.get(p)
     handle = allreduce_async_(p.grad.data, average=True, name=name)
     self._handles[p] = handle
Example #11
    def step(self, closure=None):
        # local clipping
        # DGC
        for group in self.param_groups:
            for p in group['params']:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())

            torch.nn.utils.clip_grad_norm_(group['params'], 0.25 * hvd.size() ** -0.5)
            #torch.nn.utils.clip_grad_norm(group['params'], 0.25)
            #weight_decay = group['weight_decay']
            #momentum = group['momentum']
            torch.cuda.synchronize()
            begin_time =  time.time()

            dampening = 0.0 #group['dampening']
            for p in group['params']:
                assert p not in self._handles
                assert not p.grad.requires_grad
                name = self._parameter_names.get(p)
                p_size = np.prod(p.size())
                if self._use_allgather and p_size > 1024:
                    param_state = self.state[p]
                    self._V[name].add_(p.grad.data)
                    compressed_val = []
                    compressed_idx = []
                    #if p_size < 1000:
                    #self._masks[name], compressed_val, compressed_idx = select_top_k_appr(self._V[name], 0.001, self._masks[name])

                    torch.cuda.synchronize()
                    begin_select_time =  time.time()
                    if 'mid_store' not in param_state:
                        param_state['mid_store'] = 0.0
                    if 'interval' not in param_state:
                        param_state['interval'] = self._interval
                    compressed_val = []
                    compressed_idx = []
                    if param_state['interval'] == self._interval:
                        compressed_val, compressed_idx, it, param_state['mid_store'], sparsity = \
                            select_top_k_thdv3(self._V[name], 0.001)
                        param_state['interval'] = 0
                    else:
                        compressed_val, compressed_idx, sparsity = \
                            select_top_k_fixthd(self._V[name], param_state['mid_store'])
                        param_state['interval'] += 1
                    #masks_size = self._masks[name].size()
                    #self._masks[name].zero_()
                    #self._masks[name] = self._masks[name].view(-1)
                    #self._masks[name][compressed_idx] = 1.0
                    #self._masks[name] = 1.0 - self._masks[name]
                    #self._masks[name] = self._masks[name].view(masks_size)
                    torch.cuda.synchronize()
                    self.select_time += time.time() - begin_select_time

                    if self._debug:
                        self._v_ref[name] = self._V[name] * self._masks[name]
                        allreduce_(self._v_ref[name], average = False)

                    #self._V[name] = self._V[name] * (1 - self._masks[name])
                    #self._U[name] = self._U[name] * (1 - self._masks[name])
                    torch.cuda.synchronize()
                    begin_mask_time =  time.time()
                    V_size = self._masks[name].size()
                    self._V[name] = self._V[name].view(-1)
                    self._V[name][compressed_idx] = 0.0
                    self._V[name] = self._V[name].view(V_size)

                    torch.cuda.synchronize()
                    self.mask_time += time.time() - begin_mask_time
                    begin_pack_time =  time.time()

                    self._compressed_msg_size[name] = len(compressed_idx)
                    if self._use_gpu:
                        compressed_msg = torch.cat([\
                            torch.tensor([len(compressed_idx)]).type('torch.cuda.FloatTensor'),\
                            compressed_idx.type('torch.cuda.FloatTensor'), \
                            compressed_val])

                    handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                    self._handles[p] = handle

                    torch.cuda.synchronize()
                    self.pack_time += time.time() - begin_pack_time

                else:
                    handle = allreduce_async_(p.grad.data, average=True, name=name)
                    self._handles[p] = handle

            torch.cuda.synchronize()
            self.pruning_time += time.time() - begin_time

        self.synchronize()
        return super(self.__class__, self).step(closure)
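
The select_top_k_* / select_bs_* helpers used in these DGC-style examples are project-specific, but the underlying step is ordinary magnitude-based top-k selection with a local residual: accumulate the gradient into a buffer, communicate only the largest entries, and keep the remainder for later iterations. A minimal sketch of that selection step (the helper name and the 0.001 ratio are illustrative):

    import torch

    def top_k_sparsify(residual, grad, ratio=0.001):
        # Accumulate the new gradient into the residual buffer.
        residual.add_(grad)
        flat = residual.view(-1)
        k = max(1, int(flat.numel() * ratio))
        # Pick the k largest-magnitude entries; these are what gets communicated.
        _, idx = torch.topk(flat.abs(), k)
        val = flat[idx].clone()
        # Whatever is sent is cleared locally; the rest stays as residual.
        flat[idx] = 0.0
        return val, idx

    # Usage: residual = torch.zeros_like(p); val, idx = top_k_sparsify(residual, p.grad)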
Example #12
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()

            if self._use_allgather and p_size > 1024:
                weight_decay = self._weight_decay  #group['weight_decay']
                momentum = self._momentum  #group['momentum']
                dampening = 0.0  #group['dampening']
                d_p = p.grad.data
                d_p.div_(hvd.size())
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                if 'residue_buffer' not in param_state:
                    rsd = param_state['residue_buffer'] = torch.zeros_like(
                        p.data)
                    rsd.add_(param_state['momentum_buffer'])
                    if self._use_nesterov:
                        rsd = rsd.add(momentum, d_p)
                else:
                    rsd = param_state['residue_buffer']
                    rsd.add_(param_state['momentum_buffer'])
                    if self._use_nesterov:
                        rsd = rsd.add(momentum, d_p)

                compressed_val = []
                compressed_idx = []

                torch.cuda.synchronize()
                begin_select_time = time.time()
                if 'mid_store' not in param_state:
                    param_state['mid_store'] = 0.0
                if 'interval' not in param_state:
                    param_state['interval'] = 10
                it = 0
                sparsity = 0.0
                compressed_val, compressed_idx, it, _, sparsity = \
                    select_top_k_thdv3(param_state['residue_buffer'], 0.001)
                #                if param_state['interval'] == 10:
                #                    compressed_val, compressed_idx, it, param_state['mid_store'], sparsity = \
                #                            select_top_k_thdv3(param_state['residue_buffer'], 0.001)
                #                    param_state['interval'] = 0
                #                else:
                #                    compressed_val, compressed_idx, sparsity = \
                #                            select_top_k_fixthd(param_state['residue_buffer'], param_state['mid_store'])
                #                    param_state['interval'] += 1
                assert (len(compressed_idx) > 0)
                #if hvd.rank() == 0:
                #    print(name, p.size())
                #if hvd.rank() == 0 and name == "features.27.weight":
                #if name == "features.27.weight":
                #    torch.save(compressed_val, 'compressed_val' + str(local_rank()))
                #    torch.save(compressed_idx, 'compressed_idx' + str(local_rank()))
                #if hvd.rank() == 0 and name == "features.27.weight":
                #    self._it = it
                #    self._mid = param_state['mid_store']
                #    self._sparsity = sparsity
                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time
                #tmp_t = torch.tensor([local_len], dtype=torch.long)
                #                tmp_t = torch.tensor([local_len])
                # print("len list, ", global_len_list)
                #local_len = torch.min(global_len_list)
                ##print("local_len, ", local_len)
                #compressed_val = compressed_val[0:local_len]
                #compressed_idx = compressed_idx[0:local_len]

                masks_size = self._masks[name].size()
                self._masks[name].zero_()
                self._masks[name] = self._masks[name].view(-1)
                self._masks[name][compressed_idx] = 1.0

                self._masks[name] = 1.0 - self._masks[name]
                self._masks[name] = self._masks[name].view(masks_size)

                p.grad.data = param_state['residue_buffer'] * (
                    1.0 - self._masks[name])
                handle = allreduce_async_(p.grad.data, average=False)
                self._handles[p] = handle

                param_state['residue_buffer'].mul_(self._masks[name])
                param_state['momentum_buffer'].mul_(self._masks[name])

                torch.cuda.synchronize()
                begin_pack_time = time.time()
                #compressed_msg = torch.randn(100).cuda()

                torch.cuda.synchronize()
                self.pack_time += time.time() - begin_pack_time
            else:
                weight_decay = self._weight_decay  #group['weight_decay']
                momentum = self._momentum  #group['momentum']
                dampening = 0.0  #group['dampening']
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if self._use_nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf

                if hvd.size() > 1:
                    handle = allreduce_async_(p.grad.data,
                                              average=True,
                                              name=name)
                    self._handles[p] = handle
            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
Example #13
    def step(self, closure=None):
        # local clipping
        # DGC
        for group in self.param_groups:
            for p in group['params']:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())

            torch.nn.utils.clip_grad_norm_(group['params'], 0.25 * hvd.size() ** -0.5)
            #torch.nn.utils.clip_grad_norm(group['params'], 0.25)
            #weight_decay = group['weight_decay']
            #momentum = group['momentum']
            torch.cuda.synchronize()
            begin_time =  time.time()

            dampening = 0.0 #group['dampening']
            for p in group['params']:
                assert p not in self._handles
                assert not p.grad.requires_grad
                name = self._parameter_names.get(p)
                p_size = np.prod(p.size())
                if self._use_allgather and p_size > 1024:
                    param_state = self.state[p]
                    # fjr compress grad
                    if self._use_nesterov:
                        self._U[name] = torch.mul(torch.add(self._U[name], p.grad.data), self._momentum)
                        self._V[name] = self._V[name] + self._U[name] + p.grad.data
                    else:
                        self._U[name] = self._momentum * self._U[name] + p.grad.data
                        self._V[name] = self._V[name] + self._U[name]
                    compressed_val = []
                    compressed_idx = []

                    torch.cuda.synchronize()
                    begin_select_time =  time.time()
                    #if 'interval' not in param_state:
                    #    param_state['interval'] = 1
                    #if param_state['interval'] == 0:
                    #    compressed_val, compressed_idx, _, _, _ = \
                    #        select_bs_top(self._V[name], 0.001)
                    #    param_state['interval'] = 1
                    #else:
                    #    compressed_val, compressed_idx, _, _, _ = \
                    #        select_bs_bottom(self._V[name], 0.001)
                    #    param_state['interval'] = 0

                    compressed_val_top, compressed_idx_top, _, _, _ = \
                        select_bs_top(self._V[name], 0.001)
                    compressed_val_low, compressed_idx_low, _, _, _ = \
                        select_bs_bottom(self._V[name], 0.001)
                    compressed_mean = 0.0
                    if torch.mean(compressed_val_top) > -torch.mean(compressed_val_low):
                        compressed_val = compressed_val_top
                        compressed_idx = compressed_idx_top
                    else:
                        compressed_val = compressed_val_low
                        compressed_idx = compressed_idx_low


                    masks_size = self._masks[name].size()
                    self._masks[name].zero_()
                    self._masks[name] = self._masks[name].view(-1)
                    self._masks[name][compressed_idx] = 1.0
                    self._masks[name] = 1.0 - self._masks[name]
                    self._masks[name] = self._masks[name].view(masks_size)
                    torch.cuda.synchronize()
                    end_select_time =  time.time()
                    self.select_time += end_select_time - begin_select_time

                    if self._debug:
                        self._v_ref[name] = self._V[name] * (1.0 - self._masks[name])
                        allreduce_(self._v_ref[name], average = False)

                    #self._V[name] = self._V[name] * (1 - self._masks[name])
                    #self._U[name] = self._U[name] * (1 - self._masks[name])
                    self._V[name].mul_(self._masks[name])
                    self._U[name].mul_(self._masks[name])

                    torch.cuda.synchronize()
                    begin_comm_time =  time.time()

                    self._compressed_msg_size[name] = len(compressed_idx)
                    if self._use_gpu:
                        compressed_msg = torch.cat([\
                            torch.tensor([len(compressed_idx)]).type('torch.cuda.LongTensor'),\
                            compressed_idx])

                    handle = _allgather_async(compressed_msg, self._compressed_idx[name], name=name+"idx")
                    self._handles[p] = handle
                    handle = _allgather_async(torch.mean(compressed_val), \
                            self._compressed_val[name], name=name+"val")
                    self._handles_val[p] = handle

                    torch.cuda.synchronize()
                    self.comm_time += time.time() - begin_comm_time

                else:
                    #p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                    if self._use_nesterov:
                        self._U[name] = torch.mul(torch.add(self._U[name], p.grad.data), self._momentum)
                        self._V[name] = self._V[name] + self._U[name] + p.grad.data
                    else:
                        self._U[name] = self._momentum * self._U[name] + p.grad.data
                        self._V[name] = self._V[name] + self._U[name]
                    p.grad.data = self._V[name]
                    #compressed_msg = torch.randn(100).cuda()
                    handle = allreduce_async_(p.grad.data, average=True, name=name)
                    self._handles[p] = handle

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time

        self.synchronize()
        return super(self.__class__, self).step(closure)
Example #14
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()
            if self._use_allgather and p_size > 1024:
                # fjr compress grad
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    self._V[name] = self._V[name] + self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    self._V[name] = self._V[name] + self._U[name]
                compressed_val = []
                compressed_idx = []
                #if p_size < 1000:
                #    self._masks[name], compressed_val, compressed_idx = select_top_k_thd(self._V[name], 0.001, self._masks[name])
                #else:
                #self._masks[name], compressed_val, compressed_idx = select_top_k_thd(self._V[name], 0.001, self._masks[name])
                #self._masks[name], compressed_val, compressed_idx = select_top_k_thd(self._V[name], 0.001, self._masks[name])

                torch.cuda.synchronize()
                begin_select_time = time.time()
                local_mean, compressed_idx = select_top_k_thd_mean(
                    self._V[name], 0.001)
                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time

                #tmp_t = torch.tensor([local_len], dtype=torch.long)
                #                tmp_t = torch.tensor([local_len])
                # print("len list, ", global_len_list)
                #local_len = torch.min(global_len_list)
                ##print("local_len, ", local_len)
                #compressed_val = compressed_val[0:local_len]
                #compressed_idx = compressed_idx[0:local_len]
                masks_size = self._masks[name].size()
                self._masks[name].zero_()
                self._masks[name] = self._masks[name].view(-1)
                self._masks[name][compressed_idx] = 1.0

                self._masks[name] = 1.0 - self._masks[name]
                self._masks[name] = self._masks[name].view(masks_size)

                if self._debug:
                    self._v_ref[name] = self._V[name] * (1.0 -
                                                         self._masks[name])
                    allreduce_(self._v_ref[name], average=False)

                #self._V[name] = self._V[name] * (1 - self._masks[name])
                #self._U[name] = self._U[name] * (1 - self._masks[name])
                self._V[name].mul_(self._masks[name])
                self._U[name].mul_(self._masks[name])
                #self._compressed_msg_size[name] = len(compressed_idx)
                if self._use_gpu:
                    compressed_msg = torch.cat(\
                            [torch.tensor([len(compressed_idx)]).type('torch.cuda.FloatTensor'), \
                            torch.tensor([local_mean]).type('torch.cuda.FloatTensor'), \
                            compressed_idx.type('torch.cuda.FloatTensor')])
                else:
                    pass

                handle = _allgather_async(compressed_msg,
                                          self._compressed_msg[name],
                                          name=name)
                #compressed_msg = torch.randn(100).cuda()
                self._handles[p] = handle

            else:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    self._V[name] = self._V[name] + self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    self._V[name] = self._V[name] + self._U[name]
                p.grad.data = self._V[name]
                #compressed_msg = torch.randn(100).cuda()
                #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                handle = allreduce_async_(p.grad.data, average=True, name=name)
                self._handles[p] = handle
            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
Example #15
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()
            if self._use_allgather and p_size > 1024:
                # fjr compress grad
                p.grad.data.div_(hvd.size())
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    self._V[name] = self._V[name] + self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    self._V[name] = self._V[name] + self._U[name]
                compressed_val = []
                compressed_idx = []
                torch.cuda.synchronize()
                begin_select_time = time.time()
                #self._masks[name], compressed_val, compressed_idx = select_top_k_appr(self._V[name], 0.001, self._masks[name])
                self._masks[
                    name], compressed_val, compressed_idx = select_top_k_thd(
                        self._V[name], 0.001, self._masks[name])
                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time

                if self._debug:
                    self._v_ref[name] = self._V[name] * self._masks[name]
                    allreduce_(self._v_ref[name], average=False)

                #self._V[name] = self._V[name] * (1 - self._masks[name])
                #self._U[name] = self._U[name] * (1 - self._masks[name])
                if hvd.size() == 1:
                    p.grad.data = self._V[name] * (1.0 - self._masks[name])

                self._V[name].mul_(self._masks[name])
                self._U[name].mul_(self._masks[name])

                torch.cuda.synchronize()
                begin_comm_time = time.time()

                if hvd.size() > 1:
                    self._compressed_msg_size[name] = len(compressed_idx)
                    if self._use_gpu:
                        compressed_msg = torch.cat([compressed_idx.type('torch.cuda.FloatTensor'), \
                                compressed_val])
                    else:
                        compressed_msg = torch.cat([compressed_idx.type('torch.FloatTensor'), \
                                compressed_val])
                    handle = _allgather_async(compressed_msg,
                                              self._compressed_msg[name],
                                              name=name)
                    self._handles[p] = handle

                torch.cuda.synchronize()
                end_comm_time = time.time()
                self.pack_time += end_comm_time - begin_comm_time

            else:
                p.grad.data.div_(hvd.size())
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    #self._V[name] = self._V[name] + self._U[name] + p.grad.data
                    p.grad.data = self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    #self._V[name] = self._V[name] + self._U[name]
                    p.grad.data = self._U[name]
                #p.grad.data = self._V[name]
                #compressed_msg = torch.randn(100).cuda()
                #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                if hvd.size() > 1:
                    handle = allreduce_async_(p.grad.data,
                                              average=False,
                                              name=name)
                    self._handles[p] = handle

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
Example #16
    def step(self, closure=None):
        # local clipping
        # DGC
        for group in self.param_groups:
            for p in group['params']:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())

            torch.nn.utils.clip_grad_norm_(group['params'],
                                           0.25 * hvd.size()**-0.5)
            #torch.nn.utils.clip_grad_norm(group['params'], 0.25)
            #weight_decay = group['weight_decay']
            #momentum = group['momentum']
            torch.cuda.synchronize()
            begin_time = time.time()

            dampening = 0.0  #group['dampening']
            for p in group['params']:
                assert p not in self._handles
                assert not p.grad.requires_grad
                name = self._parameter_names.get(p)
                p_size = np.prod(p.size())
                if self._use_allgather and p_size > 1024:
                    param_state = self.state[p]
                    self._V[name].add_(p.grad.data)
                    # fjr compress grad
                    compressed_val = []
                    compressed_idx = []

                    torch.cuda.synchronize()
                    begin_select_time = time.time()
                    if 'interval' not in param_state:
                        param_state['interval'] = 1
                    if param_state['interval'] == 0:
                        compressed_val, compressed_idx, _, _, _ = \
                            select_bs_top(self._V[name], 0.001)
                        param_state['interval'] = 1
                    else:
                        compressed_val, compressed_idx, _, _, _ = \
                            select_bs_bottom(self._V[name], 0.001)
                        param_state['interval'] = 0

                    #masks_size = self._masks[name].size()
                    #self._masks[name].zero_()
                    #self._masks[name] = self._masks[name].view(-1)
                    #self._masks[name][compressed_idx] = 1.0
                    #self._masks[name] = 1.0 - self._masks[name]
                    #self._masks[name] = self._masks[name].view(masks_size)
                    torch.cuda.synchronize()
                    end_select_time = time.time()
                    self.select_time += end_select_time - begin_select_time

                    if self._debug:
                        self._v_ref[name] = self._V[name] * (1.0 -
                                                             self._masks[name])
                        allreduce_(self._v_ref[name], average=False)

                    torch.cuda.synchronize()
                    begin_mask_time = time.time()
                    V_size = self._masks[name].size()
                    self._V[name] = self._V[name].view(-1)
                    self._V[name][compressed_idx] = 0.0
                    self._V[name] = self._V[name].view(V_size)

                    torch.cuda.synchronize()
                    self.mask_time += time.time() - begin_mask_time
                    begin_pack_time = time.time()

                    self._compressed_msg_size[name] = len(compressed_idx)
                    if self._use_gpu:
                        compressed_msg = torch.cat([\
                            torch.tensor([len(compressed_idx)]).type('torch.cuda.LongTensor'),\
                            compressed_idx])

                    if p_size == 1500 * 10000:
                        compressed_msg = torch.cat([\
                            torch.tensor([len(compressed_idx)]).type('torch.cuda.FloatTensor'),\
                            compressed_idx.type('torch.cuda.FloatTensor'), \
                            compressed_val])
                        handle = _allgather_async(compressed_msg,
                                                  self._compressed_msg[name],
                                                  name=name)
                        self._handles[p] = handle
                    else:
                        handle = _allgather_async(compressed_msg,
                                                  self._compressed_idx[name],
                                                  name=name + "idx")
                        self._handles[p] = handle
                        handle = _allgather_async(torch.mean(compressed_val), \
                                self._compressed_val[name], name=name+"val")
                        self._handles_val[p] = handle

                    torch.cuda.synchronize()
                    self.pack_time += time.time() - begin_pack_time

                else:
                    handle = allreduce_async_(p.grad.data,
                                              average=True,
                                              name=name)
                    self._handles[p] = handle

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time

        self.synchronize()
        return super(self.__class__, self).step(closure)
Example #17
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()

            if self._use_allgather and p_size > 1024:
                weight_decay = self._weight_decay  #group['weight_decay']
                momentum = self._momentum  #group['momentum']
                dampening = 0.0  #group['dampening']
                nesterov = False  #group['nesterov']
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    #TODO
                    # if nesterov:
                    #     d_p = d_p.add(momentum, buf)
                    # else:
                    #     d_p = buf
                if 'residue_buffer' not in param_state:
                    rsd = param_state['residue_buffer'] = torch.zeros_like(
                        p.data)
                    rsd.add_(param_state['momentum_buffer'])
                else:
                    rsd = param_state['residue_buffer']
                    rsd.add_(param_state['momentum_buffer'])

                compressed_val = []
                compressed_idx = []

                torch.cuda.synchronize()
                begin_select_time = time.time()
                if 'mid_store' not in param_state:
                    param_state['mid_store'] = 0.0
                if 'interval' not in param_state:
                    param_state['interval'] = 10
                it = 0
                sparsity = 0.0
                if param_state['interval'] == 10:
                    compressed_val, compressed_idx, it, param_state['mid_store'], sparsity = \
                            select_top_k_thdv3(param_state['residue_buffer'], 0.001)
                    param_state['interval'] = 0
                else:
                    compressed_val, compressed_idx, sparsity = \
                            select_top_k_fixthd(param_state['residue_buffer'], param_state['mid_store'])
                    param_state['interval'] += 1
                assert (len(compressed_idx) > 0)
                #if hvd.rank() == 0:
                #    print(name, p.size())
                #if hvd.rank() == 0 and name == "features.27.weight":
                #if name == "features.27.weight":
                #    torch.save(compressed_val, 'compressed_val' + str(local_rank()))
                #    torch.save(compressed_idx, 'compressed_idx' + str(local_rank()))
                #if hvd.rank() == 0 and name == "features.27.weight":
                #    self._it = it
                #    self._mid = param_state['mid_store']
                #    self._sparsity = sparsity
                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time
                #tmp_t = torch.tensor([local_len], dtype=torch.long)
                #                tmp_t = torch.tensor([local_len])
                # print("len list, ", global_len_list)
                #local_len = torch.min(global_len_list)
                ##print("local_len, ", local_len)
                #compressed_val = compressed_val[0:local_len]
                #compressed_idx = compressed_idx[0:local_len]

                masks_size = self._masks[name].size()
                self._masks[name].zero_()
                self._masks[name] = self._masks[name].view(-1)
                self._masks[name][compressed_idx] = 1.0

                self._masks[name] = 1.0 - self._masks[name]
                self._masks[name] = self._masks[name].view(masks_size)

                if self._debug:
                    self._v_ref[name] = param_state['residue_buffer'] * (
                        1.0 - self._masks[name])
                    allreduce_(self._v_ref[name], average=False)

                #self._V[name] = self._V[name] * (1 - self._masks[name])
                #self._U[name] = self._U[name] * (1 - self._masks[name])
                param_state['residue_buffer'].mul_(self._masks[name])
                param_state['momentum_buffer'].mul_(self._masks[name])
                #self._compressed_msg_size[name] = len(compressed_idx)

                torch.cuda.synchronize()
                begin_pack_time = time.time()

                #if self._use_gpu:
                #    #compressed_msg = torch.cat([\
                #    #        torch.tensor([len(compressed_idx)]).type('torch.cuda.FloatTensor'),\
                #    #        compressed_idx.type('torch.cuda.FloatTensor'), \
                #    #        compressed_val])
                #    compressed_msg = torch.cat([\
                #            torch.tensor([len(compressed_idx)]).type('torch.cuda.LongTensor'), \
                #            compressed_idx])

                handle = _allgather_async(compressed_idx,
                                          self._compressed_idx[name],
                                          name=name + 'idx')
                #compressed_msg = torch.randn(100).cuda()
                self._handles[p] = handle

                handle = _allgather_async(compressed_val,
                                          self._compressed_val[name],
                                          name=name + 'val')
                #compressed_msg = torch.randn(100).cuda()
                self._handles_val[p] = handle

                handle = _allgather_async(torch.tensor([len(compressed_idx)]),
                                          self._compressed_len[name],
                                          name=name + 'len')
                #handle = _allgather_async(len(compressed_idx), self._compressed_len[name], name=name + 'len')
                #compressed_msg = torch.randn(100).cuda()
                self._handles_len[p] = handle

                torch.cuda.synchronize()
                self.pack_time += time.time() - begin_pack_time

            else:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    self._V[name] = self._V[name] + self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    self._V[name] = self._V[name] + self._U[name]
                p.grad.data = self._V[name]
                #compressed_msg = torch.randn(100).cuda()
                #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                handle = allreduce_async_(p.grad.data, average=True, name=name)
                self._handles[p] = handle
            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
Example #18
    def step(self, closure=None):
        # local clipping
        # DGC
        for group in self.param_groups:
            for p in group['params']:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())

            torch.nn.utils.clip_grad_norm(group['params'],
                                          0.25 * hvd.size()**-0.5)
            #torch.nn.utils.clip_grad_norm(group['params'], 0.25)
            #weight_decay = group['weight_decay']
            #momentum = group['momentum']
            torch.cuda.synchronize()
            begin_time = time.time()

            dampening = 0.0  #group['dampening']
            for p in group['params']:
                assert p not in self._handles
                assert not p.grad.requires_grad
                name = self._parameter_names.get(p)
                p_size = np.prod(p.size())
                if self._use_allgather and p_size > 1024:
                    # fjr compress grad
                    self._V[name].add_(p.grad.data)
                    compressed_val = []
                    compressed_idx = []

                    torch.cuda.synchronize()
                    begin_select_time = time.time()

                    compressed_val, compressed_idx = select_top_k_thd(
                        self._V[name], 0.001)
                    torch.cuda.synchronize()
                    end_select_time = time.time()
                    self.select_time += end_select_time - begin_select_time

                    if self._debug:
                        masks_size = self._masks[name].size()
                        self._masks[name].zero_()
                        self._masks[name] = self._masks[name].view(-1)
                        self._masks[name][compressed_idx] = 1.0
                        self._masks[name] = 1.0 - self._masks[name]
                        self._masks[name] = self._masks[name].view(masks_size)
                        self._v_ref[name] = self._V[name] * self._masks[name]
                        allreduce_(self._v_ref[name], average=False)

                    #self._V[name].mul_(self._masks[name])
                    V_size = self._masks[name].size()
                    self._V[name] = self._V[name].view(-1)
                    self._V[name][compressed_idx] = 0.0
                    self._V[name] = self._V[name].view(V_size)

                    torch.cuda.synchronize()
                    begin_comm_time = time.time()
                    self._compressed_msg_size[name] = len(compressed_idx)
                    if self._use_gpu:
                        compressed_msg = torch.cat([
                            compressed_idx.type('torch.cuda.FloatTensor'),
                            compressed_val
                        ])
                    else:
                        compressed_msg = torch.cat([
                            compressed_idx.type('torch.FloatTensor'),
                            compressed_val
                        ])

                    handle = _allgather_async(compressed_msg,
                                              self._compressed_msg[name],
                                              name=name)
                    self._handles[p] = handle
                else:
                    handle = allreduce_async_(p.grad.data,
                                              average=True,
                                              name=name)
                    self._handles[p] = handle

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time

        self.synchronize()
        return super(self.__class__, self).step(closure)
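
Example #18 packs the float-cast indices and the values into one tensor so a single allgather carries both. A sketch of that packing convention and its inverse, under the assumption that indices and values always have equal length (true for top-k selection):

import torch

def pack(idx, val):
    # Indices are cast to float and concatenated in front of the values.
    # Caveat: float32 represents integers exactly only up to 2**24, so this
    # packing is safe only for layers smaller than that.
    return torch.cat([idx.float(), val])

def unpack(msg):
    half = msg.numel() // 2
    return msg[:half].long(), msg[half:]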
Example #19
0
    def step(self, closure=None):
        # local clipping
        # DGC
        for group in self.param_groups:
            for p in group['params']:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())

            torch.nn.utils.clip_grad_norm_(group['params'],
                                           0.25 * hvd.size()**-0.5)
            #torch.nn.utils.clip_grad_norm(group['params'], 0.25)
            #weight_decay = group['weight_decay']
            #momentum = group['momentum']
            torch.cuda.synchronize()
            begin_time = time.time()

            dampening = 0.0  #group['dampening']
            for p in group['params']:
                assert p not in self._handles
                assert not p.grad.requires_grad
                name = self._parameter_names.get(p)
                p_size = np.prod(p.size())
                if self._use_allgather and p_size > 1024:
                    if self._momentum != 0:
                        if self._use_nesterov:
                            self._U[name] = torch.mul(
                                torch.add(self._U[name], p.grad.data), self._momentum)
                            self._V[name] = self._V[name] + self._U[name] + p.grad.data
                        else:
                            self._U[name] = self._momentum * self._U[name] + p.grad.data
                            self._V[name] = self._V[name] + self._U[name]
                    else:
                        self._V[name].add_(p.grad.data)
                    compressed_val = []
                    compressed_idx = []

                    torch.cuda.synchronize()
                    begin_select_time = time.time()

                    #compressed_val, compressed_idx = select_top_k_thd(self._V[name], 0.001)
                    if self._flag[name] == 0:
                        compressed_val, compressed_idx = \
                                select_lowk_truncated_mean(self._V[name], 0.001)
                        self._flag[name] = 1
                    else:
                        compressed_val, compressed_idx = \
                                select_topk_truncated_mean(self._V[name], 0.001)
                        self._flag[name] = 0
                    #compressed_val_low, compressed_idx_low = \
                    #        select_lowk_truncated_mean(self._V[name], 0.001)
                    #compressed_val_top, compressed_idx_top = \
                    #        select_topk_truncated_mean(self._V[name], 0.001)
                    #compressed_mean = 0
                    #if(-torch.mean(compressed_val_low) > torch.mean(compressed_val_top)):
                    #    compressed_val = compressed_val_low
                    #    compressed_idx = compressed_idx_low
                    #    compressed_mean = torch.mean(compressed_val_low)
                    #else:
                    #    compressed_val = compressed_val_top
                    #    compressed_idx = compressed_idx_top
                    #    compressed_mean = torch.mean(compressed_val_top)

                    torch.cuda.synchronize()
                    end_select_time = time.time()
                    self.select_time += end_select_time - begin_select_time
                    if self._debug:
                        masks_size = self._masks[name].size()
                        self._masks[name].zero_()
                        self._masks[name] = self._masks[name].view(-1)
                        self._masks[name][compressed_idx] = 1.0
                        self._masks[name] = 1.0 - self._masks[name]
                        self._masks[name] = self._masks[name].view(masks_size)
                        self._v_ref[name] = self._V[name] * (1.0 - self._masks[name])
                        allreduce_(self._v_ref[name], average=False)

                    #self._V[name].mul_(self._masks[name])

                    V_size = self._masks[name].size()
                    self._V[name] = self._V[name].view(-1)
                    self._V[name][compressed_idx] = 0.0
                    self._V[name] = self._V[name].view(V_size)

                    if self._momentum != 0.0:
                        self._U[name].mul_(self._masks[name])

                    torch.cuda.synchronize()
                    begin_comm_time = time.time()
                    self._compressed_msg_size[name] = len(compressed_idx)

                    handle = _allgather_async(compressed_idx,
                                              self._compressed_idx[name],
                                              name=name + "idx")
                    self._handles[p] = handle
                    handle = _allgather_async(torch.mean(compressed_val),
                                              self._compressed_val[name],
                                              name=name + "val")
                    self._handles_val[p] = handle
                else:
                    if self._weight_decay != 0.0:
                        p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                    if self._momentum != 0:
                        if self._use_nesterov:
                            self._U[name] = torch.mul(
                                torch.add(self._U[name], p.grad.data), self._momentum)
                            self._V[name] = self._V[name] + self._U[name] + p.grad.data
                        else:
                            self._U[name] = self._momentum * self._U[name] + p.grad.data
                            self._V[name] = self._V[name] + self._U[name]
                        p.grad.data = self._V[name]
                    #compressed_msg = torch.randn(100).cuda()
                    #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                    torch.cuda.synchronize()
                    begin_comm_time = time.time()

                    handle = allreduce_async_(p.grad.data,
                                              average=True,
                                              name=name)
                    self._handles[p] = handle

                    torch.cuda.synchronize()
                    self.comm_time += time.time() - begin_comm_time

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time

        self.synchronize()
        return super(self.__class__, self).step(closure)
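
Example #19 alternates between low-k and top-k truncated-mean selection (the _flag toggle) and allgathers only the selected indices plus torch.mean(compressed_val), i.e. one scalar per worker. A sketch of that "indices plus a single mean value" encoding, with illustrative helper names that are assumptions rather than the snippet's own functions:

import torch

def encode(residual, ratio=0.001):
    # Pick the largest-magnitude coordinates and summarise their values by a
    # single mean, which is all the snippet above transmits per layer.
    flat = residual.view(-1)
    k = max(1, int(flat.numel() * ratio))
    _, idx = torch.topk(flat.abs(), k, sorted=False)
    return idx, flat[idx].mean()

def decode(idx, mean_val, numel):
    # Rebuild a sparse update whose nonzeros all share the mean magnitude.
    out = torch.zeros(numel)
    out[idx] = mean_val
    return out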
Example #20
0
 def _allreduce_grad_async(self, p, name):
     tensor = p.data.view(-1)
     tensor_compressed, ctx = tensor, None  #self._compression.compress(tensor, name)
     handle = allreduce_async_(tensor_compressed, average=True, name=name)
     return handle, ctx
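
Example #20 only launches the collective; the returned handle still has to be completed elsewhere. A usage sketch of that completion step, assuming Horovod's hvd.synchronize (the snippet's own synchronize() method is not shown):

import horovod.torch as hvd

def wait(handle, ctx):
    # hvd.synchronize blocks until the asynchronous in-place allreduce has
    # finished and returns the reduced tensor; ctx would feed a decompress
    # step if a real compressor had been used (here it is None).
    return hvd.synchronize(handle)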
Example #21
0
 def hook(*ignore):
     assert p not in self._handles
     assert not p.grad.requires_grad
     name = self._parameter_names.get(p)
     handle = allreduce_async_(p.grad.data, average=True, name=name)
     self._handles[p] = handle
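
Example #21 is only the hook body; something still has to attach it to each parameter. A sketch of the usual Horovod-style registration, written as a standalone function since the surrounding class is not part of the snippet:

import torch

def register_grad_hooks(model, make_hook):
    # expand_as(p) creates a view whose grad_fn points at the AccumulateGrad
    # node for p, so the registered hook fires once p's gradient is ready.
    grad_accs = []  # keep references alive, or the hooks are garbage collected
    for p in model.parameters():
        if p.requires_grad:
            p_tmp = p.expand_as(p)
            grad_acc = p_tmp.grad_fn.next_functions[0][0]
            grad_acc.register_hook(make_hook(p))
            grad_accs.append(grad_acc)
    return grad_accs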
Example #22
0
 def _allreduce_grad_async(self, p):
     name = self._parameter_names.get(p)
     tensor = p.grad.data
     tensor_compressed, ctx = self._compression.compress(tensor)
     handle = allreduce_async_(tensor_compressed, average=True, name=name)
     return handle, ctx
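
Example #22 expects self._compression to expose a compress(tensor) -> (tensor, ctx) pair. A minimal sketch of such a compressor, modelled on Horovod's fp16 compression; the class name here is illustrative, not taken from the snippet:

import torch

class FP16Compressor:
    @staticmethod
    def compress(tensor):
        # Remember the original dtype in ctx and ship the tensor as fp16.
        ctx = tensor.dtype
        if tensor.dtype.is_floating_point:
            tensor = tensor.to(torch.float16)
        return tensor, ctx

    @staticmethod
    def decompress(tensor, ctx):
        # Cast back to the dtype recorded at compression time.
        return tensor.to(ctx) if ctx is not None else tensor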
Example #23
0
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()

            if self._use_allgather and p_size > self._plan1:
                torch.cuda.synchronize()
                begin_mom_time = time.time()

                weight_decay = self._weight_decay #group['weight_decay']
                momentum = self._momentum #group['momentum']
                dampening = 0.0 #group['dampening']
                d_p = p.grad.data
                d_p.div_(hvd.size())
                if weight_decay != 0:
                    d_p.add_(p.data, alpha=weight_decay)
                param_state = self.state[p]
                if momentum != 0:
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(d_p, alpha=1 - dampening)
                if 'residue_buffer' not in param_state:
                    rsd = param_state['residue_buffer'] = torch.zeros_like(p.data)
                    rsd.add_(param_state['momentum_buffer'])
                    if self._use_nesterov:
                        rsd = rsd.add(d_p, alpha=momentum)
                else:
                    rsd = param_state['residue_buffer']
                    rsd.add_(param_state['momentum_buffer'])
                    if self._use_nesterov:
                        rsd = rsd.add(d_p, alpha=momentum)

                torch.cuda.synchronize()
                self.mom_time += time.time() - begin_mom_time

                compressed_val = []
                compressed_idx = []

                torch.cuda.synchronize()
                begin_select_time = time.time()

                if 'flag' not in param_state:
                    param_state['flag'] = 0
                if 'interval' not in param_state:
                    param_state['interval'] = 10
                it = 0
                sparsity = 0.0

                if p_size > self._plan3:
                    if param_state['flag'] == 1:
                        compressed_val, compressed_idx, it, _, sparsity = \
                            select_bs_top(param_state['residue_buffer'], 0.001)
                        param_state['flag'] = 0
                    else:
                        compressed_val, compressed_idx, it, _, sparsity = \
                            select_bs_bottom(param_state['residue_buffer'], 0.001)
                        param_state['flag'] = 1
                elif p_size > self._plan2:
                    if param_state['flag'] == 1:
                        compressed_val, compressed_idx = \
                            select_trim_topk_mean(param_state['residue_buffer'], 0.001)
                        param_state['flag'] = 0
                    else:
                        compressed_val, compressed_idx = \
                            select_trim_lowk_mean(param_state['residue_buffer'], 0.001)
                        param_state['flag'] = 1
                else:
                    if param_state['flag'] == 1:
                        compressed_val, compressed_idx = \
                            select_topk_mean(param_state['residue_buffer'], 0.001)
                        param_state['flag'] = 0
                    else:
                        compressed_val, compressed_idx = \
                            select_lowk_mean(param_state['residue_buffer'], 0.001)
                        param_state['flag'] = 1

                assert len(compressed_idx) > 0
                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time
                #if param_state['interval'] == 10:
                #    compressed_val, compressed_idx, it, param_state['mid_store'], sparsity = \
                #            select_top_k_thdv3(param_state['residue_buffer'], 0.001)
                #    param_state['interval'] = 0
                #else:
                #    compressed_val, compressed_idx, sparsity = \
                #            select_top_k_fixthd(param_state['residue_buffer'], param_state['mid_store'])
                #    param_state['interval'] += 1
                #if hvd.rank() == 0:
                #    print(name, p.size())
                #if hvd.rank() == 0 and name == "features.27.weight":
                #if name == "features.27.weight":
                #    torch.save(compressed_val, 'compressed_val' + str(local_rank()))
                #    torch.save(compressed_idx, 'compressed_idx' + str(local_rank()))
                #if hvd.rank() == 0 and name == "features.27.weight":
                #    self._it = it
                #    self._mid = param_state['mid_store']
                #    self._sparsity = sparsity
                #tmp_t = torch.tensor([local_len], dtype=torch.long)
#                tmp_t = torch.tensor([local_len])
                # print("len list, ", global_len_list)
                #local_len = torch.min(global_len_list)
                ##print("local_len, ", local_len)
                #compressed_val = compressed_val[0:local_len]
                #compressed_idx = compressed_idx[0:local_len]

                torch.cuda.synchronize()
                begin_mask_time = time.time()

                masks_size = self._masks[name].size()
                self._masks[name].zero_()
                self._masks[name] = self._masks[name].view(-1)
                self._masks[name][compressed_idx] = 1.0

                self._masks[name] = 1.0 - self._masks[name]
                self._masks[name] = self._masks[name].view(masks_size)

                if self._debug:
                    self._v_ref[name] = param_state['residue_buffer'] * (1.0 - self._masks[name])
                    allreduce_(self._v_ref[name], average=False)

                if hvd.size() == 1:
                    p.grad.data = param_state['residue_buffer'] * (1.0 - self._masks[name])

                param_state['residue_buffer'].mul_(self._masks[name])
                param_state['momentum_buffer'].mul_(self._masks[name])

                end_mask_time = time.time()
                self.mask_time += end_mask_time - begin_mask_time

                torch.cuda.synchronize()
                begin_pack_time = time.time()

                if hvd.size() > 1:
                    if self._use_gpu:
                        if p_size > self._plan3:
                            compressed_msg = torch.cat((
                                torch.tensor([len(compressed_idx)]).type(torch.cuda.LongTensor),
                                compressed_idx))
                            handle = _allgather_async(compressed_msg, self._compressed_idx[name], name=name + "idx")
                            self._handles[p] = handle

                            handle = _allgather_async(torch.mean(compressed_val), self._compressed_val[name], name=name + "val")
                            self._handles_val[p] = handle
                        else:
                            self._compressed_msg_size[name] = len(compressed_idx)
                            handle = _allgather_async(compressed_idx,
                                                      self._compressed_idx[name],
                                                      name=name + "idx")
                            self._handles[p] = handle
                            handle = _allgather_async(torch.mean(compressed_val),
                                                      self._compressed_val[name],
                                                      name=name + "val")
                            self._handles_val[p] = handle
                torch.cuda.synchronize()
                self.pack_time += time.time() - begin_pack_time
            else:
                torch.cuda.synchronize()
                begin_allreduce_time = time.time()
                p.grad.data.div_(hvd.size())
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    param_state['momentum_buffer'] = torch.zeros_like(p.data)
                if self._use_nesterov:
                    param_state['momentum_buffer'] = torch.mul(torch.add(param_state['momentum_buffer'], p.grad.data), self._momentum)
                    p.grad.data = param_state['momentum_buffer'] + p.grad.data
                else:
                    param_state['momentum_buffer'] = self._momentum * param_state['momentum_buffer'] + p.grad.data
                    p.grad.data = param_state['momentum_buffer']
                if hvd.size() > 1:
                    handle = allreduce_async_(p.grad.data, average=False, name=name)
                    self._handles[p] = handle
                torch.cuda.synchronize()
                self.allreduce_time += time.time() - begin_allreduce_time

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
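
Example #23 only launches the allgathers; the matching synchronize() step (not included in the snippet) has to unpack each worker's indices and mean value and rebuild a dense gradient. A sketch of that reconstruction, under the assumption that the gathered messages arrive as per-worker lists:

import torch

def decompress_gathered(idx_per_worker, mean_per_worker, numel):
    # idx_per_worker: one 1-D LongTensor of selected indices per worker.
    # mean_per_worker: one scalar tensor per worker (the transmitted mean).
    grad = torch.zeros(numel)
    for idx, mean_val in zip(idx_per_worker, mean_per_worker):
        grad[idx] += mean_val  # accumulate each worker's sparse contribution
    return grad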