Example #1
    def async_send(self, tensors_compressed, name):
        """
        :param tensors_compressed: list of flat tensors to communicate
        :param name: for the all_gather operation
        :return: handles to synchronize, tensor sizes per rank
        """
        tensors_size = [t.numel() for t in tensors_compressed]  # list of tensor sizes for this rank
        if self.compressor.tensors_size_are_same:
            tensors_size_ag = [tensors_size] * self.world_size  # list of tensor sizes per rank
            tensor_sizes = zip(*tensors_size_ag)  # transpose
        else:
            tensors_size = torch.tensor(tensors_size)  # TODO: set device
            gathered = allgather(tensors_size)  # tensor of tensor sizes per rank
            tensor_sizes = gathered.view([self.world_size, -1]).t().tolist()  # transpose, to list

        handles = []
        for tensor_compressed in tensors_compressed:
            handle = allgather_async(tensor_compressed)
            handles.append(handle)

        return handles, tensor_sizes
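A minimal sketch (not from the original) of how the returned values could be consumed on the receiving side, assuming Horovod-style hvd.synchronize and that each gathered tensor is the concatenation of the per-rank chunks; the helper name wait_receive is illustrative:

import horovod.torch as hvd

def wait_receive(handles, tensor_sizes):
    # one handle per compressed tensor; tensor_sizes[i] holds that tensor's size on every rank
    tensors_per_rank = []
    for handle, sizes in zip(handles, tensor_sizes):
        gathered = hvd.synchronize(handle)                    # concatenation of all ranks' chunks
        tensors_per_rank.append(gathered.split(list(sizes)))  # split back into one chunk per rank
    return tensors_per_rank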
Example #2
    def async_send(self, tensors_compressed, ctx):
        if tensors_compressed is None:
            return

        # We use alltoall() + allgather() to implement a parameter server.
        # For quantization-based compression algorithms, the corresponding
        # scalars are allgathered so that each server can decompress its data.
        name, numel = ctx
        handles = []

        if self.compressor.quantization and len(tensors_compressed) == 2:
            handle = alltoall_async(tensors_compressed[0],
                                    splits=self.get_splits(tensors_compressed[0].numel()),
                                    name=name)
            handles.append(handle)
            handle = allgather_async(tensors_compressed[1], name=name)
            handles.append(handle)
        else:
            for i, tensor_compressed in enumerate(tensors_compressed):
                handle = alltoall_async(tensor_compressed, name + str(i))
                handles.append(handle)

        #self.thread = threading.Thread(target=self.ps_synchronize, args=(handles, ctx))
        #self.thread.start()
        self.ps_synchronize(handles, ctx)

        return handles
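To make the comment above concrete, here is a toy illustration (not from the original code) of why the scalar part of a quantized tensor is allgathered while the payload goes through alltoall: each server only receives its shard of the quantized values, but still needs the sender's scale to decompress it.

import torch

def quantize(t):
    # toy scale-based quantization: int8 payload plus one fp32 scale per tensor
    scale = t.abs().max() / 127.0
    return (t / scale).round().to(torch.int8), scale

def dequantize_shard(shard, scale):
    # a server that holds only a shard still needs the original scale,
    # which is why the scales are replicated to every rank via allgather
    return shard.to(torch.float32) * scale

x = torch.randn(8)
payload, scale = quantize(x)
shard = payload[:4]                    # what one server would receive from alltoall
print(dequantize_shard(shard, scale))  # approximately x[:4]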
Example #3
    def allgather_sync(self, tensors, ranks):
        nworkers = hvd.size()
        rank = hvd.rank()
        start = 0
        sub_ranks = ranks[start:start+nworkers]
        sub_tensors = tensors[start:start+nworkers]
        while len(sub_ranks) > 0:
            #print('len(sub_ranks): ', len(sub_ranks))
            #print('len(sub_tensors): ', len(sub_tensors))
            try:
                idx = sub_ranks.index(rank)
            except ValueError:
                idx = -1
            if idx < 0:
                # this rank owns no tensor in this round; contribute an empty tensor
                tensor = sub_tensors[0].new(0)
            else:
                tensor = sub_tensors[idx]
            handle = hvd.allgather_async(tensor.view(-1))
            sync_tensors = hvd.synchronize(handle)
            offset = 0
            for i, r in enumerate(sub_ranks):
                if idx < 0:
                    # this rank contributed nothing in this round, so nothing is copied back
                    continue
                original_t = sub_tensors[r]
                numel = original_t.numel()
                t = sync_tensors[offset:offset+numel]
                original_t.copy_(t.view(original_t.shape))
                offset += numel

            start += nworkers
            sub_ranks = ranks[start:start+nworkers]
            sub_tensors = tensors[start:start+nworkers]
    def fsp_matrix_transfer(self):
        '''
        Obtain the feature maps of the bottlenecks (h*w*m), reshape them to (hw*m),
        compute the FSP matrix via matrix multiplication (m*n), allgather the
        matrices, and apply an L2 loss to them.
        :return: the averaged FSP transfer loss
        '''
        handles = []
        matrix_group = []
        for key in self.activation:
            if 'in' in key:
                fm_in = self.activation[key]
            if 'out' in key:
                fm_out = self.activation[key]

                fm_in = fm_in.view(fm_in.shape[0], fm_in.shape[1], -1)
                fm_out = fm_out.view(fm_out.shape[0], fm_out.shape[1], -1)
                fm_out = torch.transpose(fm_out, 1, 2)
                fsp_matrix = torch.bmm(fm_in, fm_out) / fm_in.shape[-1]
                matrix_group.append(fsp_matrix)
                fsp_matrix = fsp_matrix.unsqueeze(0)
                handle = hvd.allgather_async(fsp_matrix, key)
                handles.append(handle)

        fsp_loss = 0
        for idx, handle in enumerate(handles):
            rec_fsp = hvd.synchronize(handle)
            for i in range(0, hvd.size()):
                if i != self.task_id:
                    fsp_loss += self.norm_loss(matrix_group[idx], rec_fsp[i])
        fsp_loss /= (hvd.size() - 1)
        self.log_dict['transfer_count'] += 1
        return fsp_loss
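    # A small standalone shape sketch (illustrative sizes, not part of the
    # original code) of the FSP computation above: fm_in becomes (B, m, h*w),
    # fm_out becomes (B, h*w, n) after the transpose, so torch.bmm yields a
    # (B, m, n) FSP matrix normalized by the spatial size h*w. For example:
    #   batch, m, n, h, w = 2, 16, 32, 8, 8
    #   fm_in = torch.randn(batch, m, h, w).view(batch, m, -1)    # (B, m, h*w)
    #   fm_out = torch.randn(batch, n, h, w).view(batch, n, -1)   # (B, n, h*w)
    #   fm_out = torch.transpose(fm_out, 1, 2)                    # (B, h*w, n)
    #   fsp_matrix = torch.bmm(fm_in, fm_out) / fm_in.shape[-1]   # (B, m, n)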
    def attention_transfer(self):
        def at(x):
            return F.normalize(x.pow(2).mean(1).view(x.size(0), -1))

        handles = []
        att_group = []
        for key in self.activation:
            at_out = at(self.activation[key])
            att_group.append(at_out)
            at_numpy = at_out.data.unsqueeze(0)  # a tensor despite the name; the extra leading dim lets allgather stack one map per rank
            handle = hvd.allgather_async(at_numpy, key)
            handles.append(handle)
            # self.norm_loss

        att_loss = 0
        for idx, handle in enumerate(handles):
            rec_att = hvd.synchronize(handle)
            # att_loss += self.norm_loss(att_group[idx], rec_att.mean(0).cuda(self.device))
            for i in range(0, hvd.size()):
                if i != self.task_id:
                    att_loss += self.norm_loss(att_group[idx],
                                               rec_att[i].cuda(self.device))
        att_loss /= (hvd.size() - 1)
        self.log_dict['transfer_count'] += 1
        return att_loss
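For reference, a minimal shape sketch (illustrative sizes, not from the original) of the at(x) attention map used above: a (B, C, H, W) activation is squared, averaged over channels, flattened, and L2-normalized per sample.

import torch
import torch.nn.functional as F

x = torch.randn(4, 64, 7, 7)                             # (B, C, H, W)
att = F.normalize(x.pow(2).mean(1).view(x.size(0), -1))  # (B, H*W), L2-normalized per sample
assert att.shape == (4, 49)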
Example #6
    def forward(ctx, tensor, name):
        ctx.dim = tensor.shape[0]
        # we try to put all sync ops in forward pass
        ctx.all_dims = hvd.allgather(
            torch.tensor([ctx.dim], device=tensor.device)).view(hvd.size())
        handle = hvd.allgather_async(tensor, name)
        return hvd.synchronize(handle)
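The stored ctx.dim and ctx.all_dims suggest a matching backward pass; the source does not show one, so the following is only a hedged sketch of how such a differentiable gather is commonly completed (the class name AllGatherFn and the reduce-then-slice backward are assumptions; the forward above is repeated for completeness):

import torch
import horovod.torch as hvd

class AllGatherFn(torch.autograd.Function):  # hypothetical wrapper class
    @staticmethod
    def forward(ctx, tensor, name):
        ctx.dim = tensor.shape[0]
        ctx.all_dims = hvd.allgather(
            torch.tensor([ctx.dim], device=tensor.device)).view(hvd.size())
        handle = hvd.allgather_async(tensor, name)
        return hvd.synchronize(handle)

    @staticmethod
    def backward(ctx, grad_output):
        # every rank's loss may touch every slice of the gathered output,
        # so sum the gradients across ranks, then keep this rank's slice
        grad_output = hvd.allreduce(grad_output, op=hvd.Sum)
        dims = ctx.all_dims.tolist()
        offset = sum(dims[:hvd.rank()])
        return grad_output[offset:offset + ctx.dim], None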
Example #7
    def forward(self, data):
        """
        Arguments:
            data:
                Tensor to be gathered across all processes.
        """
        return hvd.allgather_async(data, name=self.name)
    def async_send(self, tensors_compressed, ctx):
        if tensors_compressed is None:
            return

        handles = []
        for i, tensor_compressed in enumerate(tensors_compressed):
            handle = allgather_async(tensor_compressed, ctx[0] + str(i))
            handles.append(handle)

        return handles
Example #9
    def test_horovod_allgather_duplicate_name_error(self):
        """Test that the allgather raises an error if there are
        two concurrent operations with the same name."""
        hvd.init()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        dims = [17] * 3
        tensor = torch.FloatTensor(*dims)

        hvd.allgather_async(tensor, name='duplicate_name')
        try:
            for i in range(10):
                hvd.allgather_async(tensor, name='duplicate_name')
            assert False, 'hvd.allgather_async did not throw error'
        except (torch.FatalError, ValueError):
            pass
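As a usage note (not part of the test above), the duplicate-name error is normally avoided by giving each outstanding operation a distinct name, for example:

# assumes hvd.init() has been called and `tensor` is defined as in the test
handles = [hvd.allgather_async(tensor, name='gather_%d' % i) for i in range(10)]
results = [hvd.synchronize(h) for h in handles]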
Example #10
def test_allgather():
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    tensor = torch.rand(10).float().cuda()
    print('rank: ', rank, ', tensor: ', tensor)
    handle = hvd.allgather_async(tensor)
    #tensor = hvd.synchronize(handle)
    #handle = hvd.broadcast_async(tensor, 0)
    hvd.synchronize(handle)
    comm.Barrier()
    print('---------')
    print('rank: ', rank, ', tensor: ', tensor)
    def weights_transfer(self):
        # transfer model weights
        weights = copy.deepcopy(self.network.state_dict())
        handles = []
        for name in weights:
            # TODO: need to consider bias
            if 'weight' in name:
                # print(self.task_id, 'send', name)
                handle = hvd.allgather_async(weights[name], name)
                handles.append(handle)

        hidx = 0
        for name, param in self.network.named_parameters():
            if 'weight' in name:
                # print(self.task_id, 'rec', name)
                rec_weights = hvd.synchronize(handles[hidx])
                hidx += 1
                # print(rec_weights.shape)

                n_num = param.shape[0]
                rec_weights = list(torch.split(rec_weights, n_num, 0))

                del rec_weights[self.task_id]

                # TODO: weights are concatenated along the first dim, e.g. 2 * [64, 3] -> [128, 3]
                # logging.info(type(rec_weights), rec_weights.shape)
                # calculate IOM of each filter
                im_list = []
                for i in range(param.shape[0]):
                    im_list.append(
                        torch.sum(torch.abs(param[i])).data.cpu().numpy())
                im_list = np.array(im_list)
                # print('minimal weight sum is {} size {}'.format(im_list.min(), im_list.shape[0]))

                for i, im in enumerate(im_list):
                    prob = 1 - stats.norm(0, 2).cdf(im)
                    if np.random.rand() < prob:
                        random_sender = np.random.randint(0, len(rec_weights))
                        new_param = rec_weights[random_sender].clone()
                        # random pic
                        random_filter = np.random.randint(
                            0, new_param.shape[0])
                        # TODO give larger weights more chance
                        weights[name][i] = new_param[random_filter]
                        self.log_dict['transfer_count'] += 1
            # self.network.state_dict()[name].copy_(param.clone())
            # TODO: maybe modify the optimizer
        self.network.load_state_dict(weights)
        hvd.allreduce(torch.zeros(1), name='Barrier')
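    # A quick numeric illustration (not part of the original code) of the
    # replacement probability used above: prob = 1 - stats.norm(0, 2).cdf(im),
    # so filters whose absolute weight sum `im` is small are replaced with
    # high probability and large ones almost never. For example:
    #   im = 0.5  ->  prob ~= 0.401
    #   im = 2.0  ->  prob ~= 0.159
    #   im = 6.0  ->  prob ~= 0.001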
    def _allgather_factors(self):
        """Allgather the factors for all layers"""
        handles = []

        def _get_value_and_idx(sparse_tensor):
            tensor = sparse_tensor.data.view(-1)
            one_indexes = tensor != 0.0
            indexes = one_indexes.nonzero().data.squeeze().view(-1)
            values = tensor.data[indexes]
            return values, indexes.int()

        for i, m in enumerate(self.modules):
            module_name = self.module_names[i]

            A_values, A_indexes = self.m_sparseA[m]  #_get_value_and_idx(self.m_A[m].data)
            if A_values.numel() == 0:
                continue
            A_value_name = module_name + '_A_value'
            A_idx_name = module_name + '_A_idx'
            #h_value = hvd.allgather_async(A_values, A_value_name)
            #h_idx = hvd.allgather_async(A_indexes, A_idx_name)
            h_value = hvd.allgather_async(A_values)
            h_idx = hvd.allgather_async(A_indexes)

            G_values, G_indexes = self.m_sparseG[m]  #_get_value_and_idx(self.m_G[m].data)
            G_value_name = module_name + '_G_value'
            G_idx_name = module_name + '_G_idx'
            #h_value_G = hvd.allgather_async(G_values, G_value_name)
            #h_idx_G = hvd.allgather_async(G_indexes, G_idx_name)
            if G_values is not None and G_values.numel() > 0:
                h_value_G = hvd.allgather_async(G_values)
                h_idx_G = hvd.allgather_async(G_indexes)
                handles.append((h_value, h_idx, h_value_G, h_idx_G))

        num_of_workers = hvd.size()

        def _decompress(values, indices, output):
            numel = indices.numel()
            real_num_values = numel // num_of_workers
            for i in range(num_of_workers):
                tmp_values = values.data[i * real_num_values:(i + 1) * real_num_values]
                tmp_indices = indices.data[i * real_num_values:(i + 1) * real_num_values]
                output[tmp_indices] += tmp_values

        for i, handle in enumerate(handles):
            module_name = self.module_names[i]
            module = self.modules[i]
            m_A = self.m_A[module].view(-1)
            m_A.fill_(0.0)
            m_G = self.m_G[module].view(-1)
            m_G.fill_(0.0)

            h_value_A, h_idx_A, h_value_G, h_idx_G = handle
            A_values = hvd.synchronize(h_value_A)
            A_indexes = hvd.synchronize(h_idx_A).long()
            _decompress(A_values, A_indexes, m_A)
            #print(A_indexes[0])
            #print(A_values[0])
            #m_A.scatter_add_(0, A_indexes, A_values)
            m_A.div_(hvd.size())

            G_values = hvd.synchronize(h_value_G)
            G_indexes = hvd.synchronize(h_idx_G).long()
            #print('G_I: ', G_indexes[0])
            #print('G_V: ', G_values[0])
            #m_G.scatter_add_(0, G_indexes, G_values)
            _decompress(G_values, G_indexes, m_G)
            m_G.div_(hvd.size())
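Finally, a small standalone illustration (not from the original) of what _decompress does, under the same assumption the helper itself makes, namely that every worker contributes the same number of value/index pairs: the gathered values are accumulated into a dense buffer at the gathered indices.

import torch

num_of_workers = 2
values = torch.tensor([1.0, 2.0, 3.0, 4.0])   # worker 0 sent [1, 2], worker 1 sent [3, 4]
indices = torch.tensor([0, 3, 1, 3])          # worker 0 sent [0, 3], worker 1 sent [1, 3]
output = torch.zeros(5)

real_num_values = indices.numel() // num_of_workers
for i in range(num_of_workers):
    tmp_values = values[i * real_num_values:(i + 1) * real_num_values]
    tmp_indices = indices[i * real_num_values:(i + 1) * real_num_values]
    # note: duplicate indices within a single chunk would not accumulate with
    # this kind of indexing, matching the behaviour of the code above
    output[tmp_indices] += tmp_values
print(output)  # tensor([1., 3., 0., 6., 0.])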