    def load_state_dict(self, state):
        if self.method == 'none':
            return
        self.levels = state['levels']
        # Rebuild the gradient-distribution models on the fixed interval [-1, 1].
        self.grad_dist_nb = CondNormalTruncHist(
            state['means'], state['sigmas'], state['norms'], -1,
            1, nbins=100000, bin_type='linear')
        self.grad_dist_nl = TruncNorm(
            state['mean'], state['sigma'], -1,
            1, nbins=100000, bin_type='linear')
        self.qdq = QDQ(self.levels)

        self.error = state['error']
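
A minimal save/restore round trip for the pair above, using the full QuantizeMultiBucket class that appears later in this listing. Everything here is illustrative: the constructor arguments and statistics are placeholders, and state_dict only works once set_mean_variance has fit the distributions.

# Hypothetical round trip; assumes this listing's module and a CUDA device.
import torch

quantizer = QuantizeMultiBucket(
    'nuq', bits=4, bucket_size=8192, multiplier=0.5,
    interval=1, cd_epochs=30, path='/tmp', amq_lr=0.7,
    amq_epochs=100, symmetric=True, inv=False)
stats = {'nl': {'mean': 0.0, 'sigma': 0.1},      # placeholder statistics
         'nb': {'means': [0.0] * 4, 'sigmas': [0.1] * 4, 'norms': [1.0] * 4}}
quantizer.set_mean_variance(stats)               # fits grad_dist_nb / grad_dist_nl
torch.save(quantizer.state_dict(), 'quantizer_state.pt')
quantizer.load_state_dict(torch.load('quantizer_state.pt'))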
Example #2
class QuantizeMultiBucket(object):
    def __init__(self, method, bits, bucket_size, multiplier, **kwargs):
        """
        QSGD: qdqL2 + levels_uni
        NUQSGD: qdqL2 + levels_exp
        QSGD-inf: qdqLinf + levels_uni
        """
        self.method = method
        if method == 'q':
            self.levels = get_uniform_levels(bits)
            self.norm_type = 'fro'
        elif method == 'nuq':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'qinf':
            self.levels = get_uniform_levels(bits)
            self.norm_type = float('inf')
        elif method == 'none':
            return

        self.bucket_size = bucket_size
        self.bits = bits
        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)

    def quantize(self, x):
        if self.method == 'none':
            return x
        assert isinstance(x, torch.cuda.FloatTensor)
        bucket_size = self.bucket_size

        # Zero-pad so the flattened tensor splits into whole buckets.
        num_tail = math.ceil(x.numel() / bucket_size) * bucket_size - x.numel()
        xv = torch.cat(
            (x.view(-1), torch.zeros(num_tail, dtype=x.dtype,
                                     device=x.device)))
        xv = xv.view(-1, bucket_size)
        # Per-bucket norm, expanded to one scale per element.
        norm = xv.norm(p=self.norm_type, dim=1, keepdim=True).expand(
            xv.shape[0], xv.shape[1]).contiguous().view(-1).contiguous()
        q = torch.zeros_like(x)
        r = torch.randint_like(x, 1000001).long()  # random draws consumed by the kernel

        self.qdq.qdqGPU(x, norm, q, r)

        return q
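
A short, hypothetical driver for the bucketed quantizer above (assumes a CUDA device and the level helpers from this listing; 'q' selects QSGD-style uniform levels):

import torch

g = torch.randn(10_000, device='cuda')           # stand-in for a gradient
quantizer = QuantizeMultiBucket('q', bits=4, bucket_size=512, multiplier=0.5)
g_hat = quantizer.quantize(g)                    # quantize-dequantize estimate
print('relative error:', ((g - g_hat).norm() / g.norm()).item())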
Example #3
def qdq_gpu(a):
    assert isinstance(a, torch.cuda.FloatTensor)
    bucket_size = 16
    asize = a.size()
    num_tail = math.ceil(a.numel() / bucket_size) * bucket_size - a.numel()
    # Pad with fresh zeros; slicing torch.zeros_like(a) breaks for
    # multi-dimensional inputs.
    av = torch.cat((a.view(-1), torch.zeros(num_tail, dtype=a.dtype,
                                            device=a.device)))
    c = torch.zeros_like(a)
    av = av.view(-1, bucket_size)
    norm = av.norm(dim=1, keepdim=True).expand(
        av.shape[0], av.shape[1]).contiguous().view(-1).contiguous()
    print('norm', norm)
    r = torch.randint_like(a, 1000001).long()
    levels = get_uniform_levels(4).cuda()
    print('levels', levels)
    print('#levels', len(levels))
    qdq = QDQ(levels)

    qdq.qdqGPU(a, norm, c, r)
    return c.view(asize)
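
A hypothetical call of the standalone helper above (CUDA required; the prints inside qdq_gpu show the per-bucket norms and the 16 uniform levels):

import torch

a = torch.randn(64, device='cuda')
b = qdq_gpu(a)                                   # 4-bit quantize-dequantize
print('max abs error:', (a - b).abs().max().item())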
Example #4
    def __init__(self, method, bits, bucket_size, **kwargs):
        """
        QSGD: qdqL2 + levels_uni
        NUQSGD: qdqL2 + levels_exp
        QSGD-inf: qdqLinf + levels_uni
        """
        self.method = method
        if method == 'q':
            self.levels = get_uniform_levels(bits)
            self.qdq = qdqL2
        elif method == 'nuq':
            self.levels = get_exp_levels(bits)
            self.qdq = qdqL2
        elif method == 'qinf':
            self.levels = get_uniform_levels(bits)
            self.qdq = qdqLinf

        self.bucket_size = bucket_size
        self.bits = bits
        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        # Note: this replaces the qdqL2/qdqLinf handles assigned above.
        self.qdq = QDQ(self.levels)
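
The level-construction helpers (get_uniform_levels, get_exp_levels) are used throughout this listing but never shown. A minimal sketch of what they plausibly compute, based on the QSGD/NUQSGD mapping in the docstrings; the bodies in the original module may differ:

import numpy as np

def get_uniform_levels(bits):
    # QSGD-style: 2**bits evenly spaced levels on [-1, 1].
    return np.linspace(-1.0, 1.0, num=2 ** bits, dtype=np.float32)

def get_exp_levels(bits, multiplier=0.5):
    # NUQSGD-style: symmetric, exponentially spaced levels
    # +/- multiplier**k for k = 0 .. 2**(bits - 1) - 1, sorted ascending.
    half = 2 ** (bits - 1)
    pos = np.array(sorted(multiplier ** k for k in range(half)),
                   dtype=np.float32)
    return np.concatenate([-pos[::-1], pos])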
Example #5
class QuantizeSingleBucket(object):
    def __init__(self, method, bits, bucket_size, **kwargs):
        """
        QSGD: qdqL2 + levels_uni
        NUQSGD: qdqL2 + levels_exp
        QSGD-inf: qdqLinf + levels_uni
        """
        self.method = method
        if method == 'q':
            self.levels = get_uniform_levels(bits)
            self.qdq = qdqL2
        elif method == 'nuq':
            self.levels = get_exp_levels(bits)
            self.qdq = qdqL2
        elif method == 'qinf':
            self.levels = get_uniform_levels(bits)
            self.qdq = qdqLinf

        self.bucket_size = bucket_size
        self.bits = bits
        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)

    def quantize(self, x):
        # Quantize the flattened tensor one bucket at a time.
        q = x.clone()
        bucket_size = self.bucket_size
        num_bucket = int(np.ceil(len(x) / bucket_size))
        for bucket_i in range(num_bucket):
            start = bucket_i * bucket_size
            end = min((bucket_i + 1) * bucket_size, len(x))
            x_bucket = x[start:end].clone()
            q_bucket = q[start:end].clone()

            norm = x_bucket.norm()
            self.qdq.qdqGPU(x_bucket, float(norm), q_bucket)
            q[start:end] = q_bucket

        return q
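
For intuition, a self-contained NumPy sketch of the normalize-snap-rescale scheme that the loop above implements, using deterministic nearest-level rounding in place of the CUDA kernel's stochastic rounding (all names here are illustrative):

import numpy as np

def bucket_quantize(x, levels, bucket_size):
    # Scale each bucket by its L2 norm, snap to the nearest level, rescale.
    q = x.copy()
    for start in range(0, len(x), bucket_size):
        b = x[start:start + bucket_size]
        n = np.linalg.norm(b)
        if n == 0:
            continue
        idx = np.abs(b[:, None] / n - levels[None, :]).argmin(axis=1)
        q[start:start + bucket_size] = n * levels[idx]
    return q

levels = np.linspace(-1.0, 1.0, 16)              # 4-bit uniform levels
x = np.random.randn(1000).astype(np.float32)
print('L2 error:', np.linalg.norm(x - bucket_quantize(x, levels, 64)))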
Example #6
class QuantizeMultiBucket(object):
    def __init__(self, method, bits, bucket_size, multiplier, **kwargs):
        """
        QSGD: qdqL2 + levels_uni
        NUQSGD: qdqL2 + levels_exp
        QSGD-inf: qdqLinf + levels_uni
        """
        self.method = method
        self.multiplier = multiplier
        if kwargs['interval'] is not None:
            self.interval = kwargs['interval']
            # Standardized truncation bounds (computed but unused here).
            a, b = -self.interval / 0.1, self.interval / 0.1
        if method == 'q':
            self.levels = get_uniform_levels(bits)
            self.norm_type = 'fro'
        elif method == 'nuq':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'qinf':
            self.levels = get_uniform_levels(bits)
            self.norm_type = float('inf')
        elif method == 'nuq2':
            self.levels = get_quantile_levels(
                bits, 0, 0.1, -self.interval, self.interval)
            self.norm_type = 'fro'
        elif method == 'nuq2inf':
            self.levels = get_quantile_levels(
                bits, 0, 0.1, -self.interval, self.interval)
            self.norm_type = float('inf')
        elif method in ('amq', 'amq_nb', 'alq', 'alq_nb'):
            # Adaptive methods start from exponential levels and refit
            # them later in update_levels.
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'trn':
            self.levels = get_ternary_levels()
            self.norm_type = float('inf')
        elif method == 'none':
            return

        self.number_of_iterations = 0
        self.gradient_samples = []
        self.gradient_samples_overtime = []
        self.previous_best = None

        self.bucket_size = bucket_size
        self.bits = bits
        self.epochs = kwargs['cd_epochs']
        self.path = kwargs['path']
        self.amq_lr = kwargs['amq_lr']
        self.amq_epochs = kwargs['amq_epochs']
        self.symmetric = kwargs['symmetric']
        self.inv = kwargs['inv']
        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)
        self.mean_weights = 0
        self.variance_weights = 0.1
        self.error = None

    def set_mean_variance(self, stats):
        # Fit truncated-normal models of the gradient distribution from the
        # running statistics collected during training.
        self.mean = mean = stats['nl']['mean']
        self.variance = stats['nl']['sigma'] ** 2
        self.norms = norms = stats['nb']
        self.number_of_iterations += 1
        interval = self.interval
        sigma = math.sqrt(self.variance)
        self.grad_dist_nb = CondNormalTruncHist(
            norms['means'], norms['sigmas'], norms['norms'], -interval,
            interval, nbins=100000, bin_type='linear')
        self.grad_dist_nl = TruncNorm(
            mean, sigma, -interval, interval, nbins=100000, bin_type='linear')

        self.error = self.grad_dist_nb.estimate_variance(self.levels.cpu())
        if self.method == 'amq':
            np.savetxt(self.path + '/norms_mean' +
                       str(self.number_of_iterations), np.asarray(self.norms['means']))
            np.savetxt(self.path + '/norms_sigma' +
                       str(self.number_of_iterations), np.asarray(self.norms['sigmas']))
            np.savetxt(self.path + '/norms_norm' +
                       str(self.number_of_iterations), np.asarray(self.norms['norms']))

    def update_levels(self):
        # Re-fit the quantization levels to the current gradient model,
        # depending on the adaptive method.
        bits = self.bits
        grad_dist_nl = self.grad_dist_nl
        grad_dist_nb = self.grad_dist_nb
        half_point = len(self.levels) // 2
        quantile_levels = get_quantile_levels(bits, grad_dist_nb)
        uniform_levels = get_uniform_levels(bits)
        exp_levels = get_exp_levels(bits, 0.5)

        if self.method == 'alq':
            # Coordinate descent from three initializations; keep the
            # levels with the lowest final variance.
            inv = self.inv
            sym = self.symmetric
            epochs = self.epochs

            levels_qua, _, losses_qua = alq(
                quantile_levels, grad_dist_nl, epochs, inv, sym)
            levels_uniform, _, losses_uni = alq(
                uniform_levels, grad_dist_nl, epochs, inv, sym)
            levels_exp, _, losses_exp = alq(
                exp_levels, grad_dist_nl, epochs, inv, sym)
            candidate_levels = np.asarray(
                [levels_qua, levels_uniform, levels_exp])
            candidate_losses = np.asarray(
                [losses_qua[-1], losses_uni[-1], losses_exp[-1]])
            self.levels = candidate_levels[np.argmin(candidate_losses)]

        elif self.method == 'alq_nb':
            epochs = self.epochs
            inv = self.inv
            sym = self.symmetric
            levels_qua, _, losses_qua = alq(
                quantile_levels, grad_dist_nb, epochs, inv, sym)
            levels_uniform, _, losses_uni = alq(
                uniform_levels, grad_dist_nb, epochs, inv, sym)
            levels_exp, _, losses_exp = alq(
                exp_levels, grad_dist_nb, epochs, inv, sym)
            candidate_levels = np.asarray(
                [levels_qua, levels_uniform, levels_exp])
            candidate_losses = np.asarray(
                [losses_qua[-1], losses_uni[-1], losses_exp[-1]])
            self.levels = candidate_levels[np.argmin(candidate_losses)]

        elif self.method == 'amq':
            # Gradient-based search for the best exponential multiplier,
            # seeded from a grid (plus the previous best, if any).
            if self.previous_best is None:
                initial_points = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9]
            else:
                initial_points = [0.1, 0.2, 0.3, 0.4,
                                  self.previous_best, 0.5, 0.8, 0.9]
            optimal_points = []
            for point in initial_points:
                optimal_p, _ = amq_norm_less(
                    point, grad_dist_nl, bits, self.amq_lr, self.amq_epochs)
                optimal_points.append(optimal_p)
            optimal_points_costs = [
                grad_dist_nl.estimate_variance(get_exp_levels(bits, p)[
                    half_point:]) for p in optimal_points]
            index = np.argmin(optimal_points_costs)
            self.multiplier = optimal_points[index]
            self.previous_best = self.multiplier
            self.levels = get_exp_levels(bits, self.multiplier)

        elif self.method == 'amq_nb':
            # Same multiplier search against the norm-based model.
            if self.previous_best is None:
                initial_points = [0.1, 0.2, 0.3, 0.4, 0.5, 0.8, 0.9]
            else:
                initial_points = [0.1, 0.2, 0.3, 0.4,
                                  self.previous_best, 0.5, 0.8, 0.9]
            optimal_points = []
            for point in initial_points:
                optimal_p, _ = amq_norm_based(
                    point, grad_dist_nb, bits, self.amq_lr, self.amq_epochs)
                optimal_points.append(optimal_p)
            optimal_points_costs = [
                grad_dist_nb.estimate_variance(get_exp_levels(bits, p)[
                    half_point:]) for p in optimal_points]
            index = np.argmin(optimal_points_costs)
            self.multiplier = optimal_points[index]
            self.previous_best = self.multiplier
            self.levels = get_exp_levels(self.bits, self.multiplier)

        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)

    def quantize(self, x, ig_sm_bkts):
        if self.method == 'none':
            return x
        assert isinstance(x, torch.cuda.FloatTensor)
        bucket_size = self.bucket_size

        # Zero-pad so the flattened tensor splits into whole buckets.
        num_tail = math.ceil(x.numel() / bucket_size) * bucket_size - x.numel()
        xv = torch.cat((x.view(-1),
                        torch.zeros(num_tail, dtype=x.dtype, device=x.device)))
        xv = xv.view(-1, bucket_size)
        # Per-bucket norm, expanded to one scale per element.
        norm = xv.norm(p=self.norm_type, dim=1, keepdim=True).expand(
            xv.shape[0], xv.shape[1]).contiguous().view(-1).contiguous()

        if ig_sm_bkts:
            # Quantize only the full buckets; the final (padded) bucket is
            # passed through unquantized. Guard against num_tail == 0, where
            # xv[-1][:-num_tail] would wrongly yield an empty tensor.
            tail = xv[-1] if num_tail == 0 else xv[-1][:-num_tail]
            if xv.shape[0] > 1:
                q = torch.zeros_like(xv)
                r = torch.randint_like(xv, 1000001).long()
                self.qdq.qdqGPU(xv[:-1], norm[:-1], q[:-1], r[:-1])
                return torch.cat(
                    [q[:-1].view(-1), tail.view(-1)]).view(x.shape)
            else:
                return tail.view(x.shape)
        else:
            q = torch.zeros_like(x)
            r = torch.randint_like(x, 1000001).long()
            self.qdq.qdqGPU(x, norm, q, r)
            return q
    
    def state_dict(self):
        if self.method == 'none':
            return {}
        return {
            'levels': self.levels,
            'means': self.grad_dist_nb.means,
            'sigmas': self.grad_dist_nb.sigmas,
            'norms': self.grad_dist_nb.norms,
            'sigma': self.grad_dist_nl.sigma,
            'mean': self.grad_dist_nl.mean,
            'error': self.error
        }
    
    def load_state_dict(self, state):
        if self.method == 'none':
            return
        self.levels = state['levels']
        # Rebuild the gradient-distribution models on the fixed interval [-1, 1].
        self.grad_dist_nb = CondNormalTruncHist(
            state['means'], state['sigmas'], state['norms'], -1,
            1, nbins=100000, bin_type='linear')
        self.grad_dist_nl = TruncNorm(
            state['mean'], state['sigma'], -1,
            1, nbins=100000, bin_type='linear')
        self.qdq = QDQ(self.levels)

        self.error = state['error']
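
Putting the pieces together, a hypothetical training-loop integration of the adaptive class above. The stats layout follows set_mean_variance; the numeric values and the 4-bucket statistics are placeholders:

import torch

quantizer = QuantizeMultiBucket(
    'alq', bits=4, bucket_size=8192, multiplier=0.5,
    interval=1, cd_epochs=30, path='/tmp/nuq', amq_lr=0.7,
    amq_epochs=100, symmetric=True, inv=False)

grad = torch.randn(100_000, device='cuda')       # stand-in flattened gradient
stats = {'nl': {'mean': 0.0, 'sigma': 0.1},
         'nb': {'means': [0.0] * 4, 'sigmas': [0.1] * 4, 'norms': [1.0] * 4}}
quantizer.set_mean_variance(stats)               # fit the gradient models
quantizer.update_levels()                        # re-optimize the levels
grad_hat = quantizer.quantize(grad, ig_sm_bkts=False)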