class QuantizeMultiBucket(object):
    def __init__(self, method, bits, bucket_size, multiplier, **kwargs):
        """
        QSGD: qdqL2 + levels_uni
        NUQSGD: qdqL2 + levels_exp
        QSGD-inf: qdqLinf + levels_uni
        """
        self.method = method
        if method == 'q':
            self.levels = get_uniform_levels(bits)
            self.norm_type = 'fro'
        elif method == 'nuq':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'qinf':
            self.levels = get_uniform_levels(bits)
            self.norm_type = float('inf')
        elif method == 'none':
            return

        self.bucket_size = bucket_size
        self.bits = bits
        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)

    def quantize(self, x):
        if self.method == 'none':
            return x
        assert isinstance(x, torch.cuda.FloatTensor)
        bucket_size = self.bucket_size

        # Pad the flattened tensor with zeros so it splits evenly into buckets.
        num_tail = math.ceil(x.numel() / bucket_size) * bucket_size - x.numel()
        xv = torch.cat(
            (x.view(-1),
             torch.zeros(num_tail, dtype=x.dtype, device=x.device)))
        xv = xv.view(-1, bucket_size)

        # Per-bucket norm, broadcast back to one entry per element.
        norm = xv.norm(p=self.norm_type, dim=1, keepdim=True).expand(
            xv.shape[0], xv.shape[1]).contiguous().view(-1).contiguous()

        q = torch.zeros_like(x)
        r = torch.randint_like(x, 1000001).long()
        self.qdq.qdqGPU(x, norm, q, r)
        return q
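# get_uniform_levels and get_exp_levels are defined elsewhere in this module.
# Below is a minimal sketch of the grids they are assumed to produce
# (normalized levels on [-1, 1]; the _sketch names are hypothetical
# stand-ins, and the repo's exact endpoints/counts may differ):
def get_uniform_levels_sketch(bits):
    # 2**bits evenly spaced levels on [-1, 1] (the QSGD-style grid).
    return np.linspace(-1.0, 1.0, 2 ** bits)


def get_exp_levels_sketch(bits, multiplier):
    # Symmetric exponential grid (the NUQSGD-style grid): the positive half
    # is multiplier**(half-1), ..., multiplier, 1, mirrored to the negative
    # side, giving 2**bits levels in total.
    half = 2 ** (bits - 1)
    pos = np.array([multiplier ** (half - 1 - k) for k in range(half)])
    return np.concatenate([-pos[::-1], pos])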
def qdq_gpu(a):
    assert isinstance(a, torch.cuda.FloatTensor)
    bucket_size = 16
    asize = a.size()
    # Pad the flattened tensor with zeros so it splits evenly into buckets
    # (the padding tensor is built flat so this also works for multi-dim a).
    num_tail = math.ceil(a.numel() / bucket_size) * bucket_size - a.numel()
    av = torch.cat((a.view(-1),
                    torch.zeros(num_tail, dtype=a.dtype, device=a.device)))
    c = torch.zeros_like(a)
    av = av.view(-1, bucket_size)
    norm = av.norm(dim=1, keepdim=True).expand(
        av.shape[0], av.shape[1]).contiguous().view(-1).contiguous()
    print('norm', norm)
    r = torch.randint_like(a, 1000001).long()
    levels = get_uniform_levels(4).cuda()
    print('levels', levels)
    print('#levels', len(levels))
    qdq = QDQ(levels)
    qdq.qdqGPU(a, norm, c, r)
    return c.view(asize)
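# QDQ.qdqGPU is a custom CUDA kernel whose source is not shown in this file.
# The following pure-PyTorch sketch shows the per-element stochastic
# quantize-dequantize it is assumed to implement (qdq_reference is a
# hypothetical helper for CPU testing, not part of this repo's API):
def qdq_reference(x, norm, levels):
    # Normalize each element by its (already expanded) bucket norm, find the
    # two bracketing levels, and round up with probability equal to the
    # element's relative position inside the bracket, which keeps the
    # quantizer unbiased in expectation.
    flat = x.view(-1)
    scale = norm[:flat.numel()].clamp(min=1e-12)
    y = (flat / scale).clamp(levels[0].item(), levels[-1].item())
    idx = torch.bucketize(y, levels).clamp(1, len(levels) - 1)
    lo, hi = levels[idx - 1], levels[idx]
    p = (y - lo) / (hi - lo).clamp(min=1e-12)
    round_up = torch.rand_like(y) < p
    return (torch.where(round_up, hi, lo) * scale).view_as(x)
# e.g.: qdq_reference(torch.randn(8), torch.full((8,), 4.0),
#                     torch.linspace(-1, 1, 16))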
class QuantizeSingleBucket(object):
    def __init__(self, method, bits, bucket_size, **kwargs):
        """
        QSGD: qdqL2 + levels_uni
        NUQSGD: qdqL2 + levels_exp
        QSGD-inf: qdqLinf + levels_uni
        """
        self.method = method
        if method == 'q':
            self.levels = get_uniform_levels(bits)
            self.qdq = qdqL2
        elif method == 'nuq':
            self.levels = get_exp_levels(bits)
            self.qdq = qdqL2
        elif method == 'qinf':
            self.levels = get_uniform_levels(bits)
            self.qdq = qdqLinf

        self.bucket_size = bucket_size
        self.bits = bits
        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)  # overrides the qdqL2/qdqLinf choice above

    def quantize(self, x):
        q = x.clone()
        bucket_size = self.bucket_size
        num_bucket = int(np.ceil(len(x) / bucket_size))
        # Quantize one bucket at a time, each scaled by its own norm.
        for bucket_i in range(num_bucket):
            start = bucket_i * bucket_size
            end = min((bucket_i + 1) * bucket_size, len(x))
            x_bucket = x[start:end].clone()
            q_bucket = q[start:end].clone()
            norm = x_bucket.norm()
            self.qdq.qdqGPU(x_bucket, float(norm), q_bucket)
            q[start:end] = q_bucket
        return q
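# A minimal usage sketch for QuantizeSingleBucket (assumptions: a CUDA device
# is available and the QDQ extension is built; the values are illustrative).
# Each 512-element bucket is normalized by its own L2 norm before being
# snapped to the 4-bit grid:
def _example_single_bucket():
    quantizer = QuantizeSingleBucket(method='q', bits=4, bucket_size=512)
    grad = torch.randn(2048).cuda()
    q_grad = quantizer.quantize(grad)  # same shape, values on the level grid
    return q_grad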
class QuantizeMultiBucket(object):
    def __init__(self, method, bits, bucket_size, multiplier, **kwargs):
        """
        QSGD: qdqL2 + levels_uni
        NUQSGD: qdqL2 + levels_exp
        QSGD-inf: qdqLinf + levels_uni
        """
        self.method = method
        self.multiplier = multiplier
        if kwargs['interval'] is not None:
            self.interval = kwargs['interval']
            # Standardized truncation bounds (not used below).
            a, b = (-self.interval - 0) / 0.1, (self.interval - 0) / 0.1
        if method == 'q':
            self.levels = get_uniform_levels(bits)
            self.norm_type = 'fro'
        elif method == 'nuq':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'qinf':
            self.levels = get_uniform_levels(bits)
            self.norm_type = float('inf')
        elif method == 'nuq2':
            self.levels = get_quantile_levels(
                bits, 0, 0.1, -self.interval, self.interval)
            self.norm_type = 'fro'
        elif method == 'nuq2inf':
            self.levels = get_quantile_levels(
                bits, 0, 0.1, -self.interval, self.interval)
            self.norm_type = float('inf')
        elif method == 'amq':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'amq_nb':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'alq':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'alq_nb':
            self.levels = get_exp_levels(bits, multiplier)
            self.norm_type = 'fro'
        elif method == 'trn':
            self.levels = get_ternary_levels()
            self.norm_type = float('inf')
        elif method == 'none':
            return

        self.number_of_iterations = 0
        self.gradient_samples = []
        self.gradient_samples_overtime = []
        self.previous_best = None
        self.bucket_size = bucket_size
        self.bits = bits
        self.epochs = kwargs['cd_epochs']
        self.path = kwargs['path']
        self.amq_lr = kwargs['amq_lr']
        self.amq_epochs = kwargs['amq_epochs']
        self.symmetric = kwargs['symmetric']
        self.inv = kwargs['inv']
        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)
        self.mean_weights = 0
        self.variance_weights = 0.1
        self.error = None

    def set_mean_variance(self, stats):
        self.mean = mean = stats['nl']['mean']
        self.variance = variance = stats['nl']['sigma'] ** 2
        self.norms = norms = stats['nb']
        self.number_of_iterations += 1
        interval = self.interval
        sigma = torch.sqrt(torch.tensor(self.variance)).cpu().item()
        # Norm-based (nb) and norm-less (nl) gradient distribution estimates.
        self.grad_dist_nb = CondNormalTruncHist(
            norms['means'], norms['sigmas'], norms['norms'],
            -interval, interval, nbins=100000, bin_type='linear')
        self.grad_dist_nl = TruncNorm(
            mean, sigma, -interval, interval,
            nbins=100000, bin_type='linear')
        self.error = self.grad_dist_nb.estimate_variance(self.levels.cpu())
        if self.method == 'amq':
            np.savetxt(self.path + '/norms_mean'
                       + str(self.number_of_iterations),
                       np.asarray(self.norms['means']))
            np.savetxt(self.path + '/norms_sigma'
                       + str(self.number_of_iterations),
                       np.asarray(self.norms['sigmas']))
            np.savetxt(self.path + '/norms_norm'
                       + str(self.number_of_iterations),
                       np.asarray(self.norms['norms']))

    def update_levels(self):
        interval = self.interval
        mean = self.mean
        bits = self.bits
        variance = self.variance
        grad_dist_nl = self.grad_dist_nl
        grad_dist_nb = self.grad_dist_nb
        sigma = torch.sqrt(torch.tensor(self.variance)).cpu().item()
        half_point = int(len(self.levels) / 2)
        quantile_levels = get_quantile_levels(bits, grad_dist_nb)
        uniform_levels = get_uniform_levels(self.bits)
        exp_levels = get_exp_levels(self.bits, 0.5)
        bits = self.bits
        if self.method == 'alq':
            inv = self.inv
            sym = self.symmetric
            epochs = self.epochs
            initial_levels = self.levels
            # Run coordinate descent from three initializations and keep
            # the candidate grid with the lowest final loss.
            levels_qua, _, losses_qua = alq(
                quantile_levels, grad_dist_nl, epochs, inv, sym)
            levels_uniform, _, losses_uni = alq(
                uniform_levels, grad_dist_nl, epochs, inv, sym)
            levels_exp, _, losses_exp = alq(
                exp_levels, grad_dist_nl, epochs, inv, sym)
            candidate_levels = np.asarray(
                [levels_qua, levels_uniform, levels_exp])
            candidate_losses = np.asarray(
                [losses_qua[-1], losses_uni[-1], losses_exp[-1]])
            self.levels = candidate_levels[np.argsort(candidate_losses)][0]
        elif self.method == 'alq_nb':
            epochs = self.epochs
            inv = self.inv
            sym = self.symmetric
            quantile_levels = get_quantile_levels(bits, grad_dist_nb)
            levels_qua, _, losses_qua = alq(
                quantile_levels, grad_dist_nb, epochs, inv, sym)
            levels_uniform, _, losses_uni = alq(
                uniform_levels, grad_dist_nb, epochs, inv, sym)
            levels_exp, _, losses_exp = alq(
                exp_levels, grad_dist_nb, epochs, inv, sym)
            candidate_levels = np.asarray(
                [levels_qua, levels_uniform, levels_exp])
            candidate_losses = np.asarray(
                [losses_qua[-1], losses_uni[-1], losses_exp[-1]])
            self.levels = candidate_levels[np.argsort(candidate_losses)][0]
        elif self.method == 'amq':
            if self.previous_best is None:
                initial_points = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9]
            else:
                initial_points = [0.1, 0.2, 0.3, 0.4, self.previous_best,
                                  0.5, 0.8, 0.9]
            optimal_points = []
            for point in initial_points:
                optimal_p, _ = amq_norm_less(
                    point, grad_dist_nl, bits, self.amq_lr, self.amq_epochs)
                optimal_points.append(optimal_p)
            optimal_points_costs = [
                grad_dist_nl.estimate_variance(
                    get_exp_levels(bits, p)[half_point:])
                for p in optimal_points]
            index = np.argmin(optimal_points_costs)
            self.multiplier = optimal_points[index]
            self.previous_best = self.multiplier
            self.levels = get_exp_levels(bits, self.multiplier)
        elif self.method == 'amq_nb':
            if self.previous_best is None:
                initial_points = [0.1, 0.2, 0.3, 0.4, 0.5, 0.8, 0.9]
            else:
                initial_points = [0.1, 0.2, 0.3, 0.4, self.previous_best,
                                  0.5, 0.8, 0.9]
            optimal_points = []
            for point in initial_points:
                optimal_p, _ = amq_norm_based(
                    point, grad_dist_nb, bits, self.amq_lr, self.amq_epochs)
                optimal_points.append(optimal_p)
            optimal_points_costs = [
                grad_dist_nb.estimate_variance(
                    get_exp_levels(bits, p)[half_point:])
                for p in optimal_points]
            index = np.argmin(optimal_points_costs)
            self.multiplier = optimal_points[index]
            self.previous_best = self.multiplier
            self.levels = get_exp_levels(self.bits, self.multiplier)

        self.levels = torch.as_tensor(self.levels, dtype=torch.float32).cuda()
        self.qdq = QDQ(self.levels)

    def quantize(self, x, ig_sm_bkts):
        if self.method == 'none':
            return x
        assert isinstance(x, torch.cuda.FloatTensor)
        bucket_size = self.bucket_size

        # Pad the flattened tensor with zeros so it splits evenly into buckets.
        num_tail = math.ceil(x.numel() / bucket_size) * bucket_size - x.numel()
        xv = torch.cat(
            (x.view(-1),
             torch.zeros(num_tail, dtype=x.dtype, device=x.device)))
        xv = xv.view(-1, bucket_size)

        # Per-bucket norm, broadcast to one entry per element.
        norm = xv.norm(p=self.norm_type, dim=1, keepdim=True).expand(
            xv.shape[0], xv.shape[1]).contiguous().view(-1).contiguous()

        if ig_sm_bkts:
            # Ignore the small (zero-padded) last bucket: quantize the full
            # buckets and pass the tail elements through unquantized.
            if xv.shape[0] > 1:
                q = torch.zeros_like(xv)
                r = torch.randint_like(xv, 1000001).long()
                self.qdq.qdqGPU(xv[:-1], norm[:-1], q[:-1], r[:-1])
                return torch.cat([q[:-1].view(-1),
                                  xv[-1][:-num_tail].view(-1)]).view(x.shape)
            else:
                return xv[-1][:-num_tail].view(x.shape)
        else:
            q = torch.zeros_like(x)
            r = torch.randint_like(x, 1000001).long()
            self.qdq.qdqGPU(x, norm, q, r)
            return q

    def state_dict(self):
        if self.method == 'none':
            return {}
        return {
            'levels': self.levels,
            'means': self.grad_dist_nb.means,
            'sigmas': self.grad_dist_nb.sigmas,
            'norms': self.grad_dist_nb.norms,
            'sigma': self.grad_dist_nl.sigma,
            'mean': self.grad_dist_nl.mean,
            'error': self.error,
        }

    def load_state_dict(self, state):
        if self.method == 'none':
            return
        self.levels = state['levels']
        self.grad_dist_nb = CondNormalTruncHist(
            state['means'], state['sigmas'], state['norms'],
            -1, 1, nbins=100000, bin_type='linear')
        self.grad_dist_nl = TruncNorm(
            state['mean'], state['sigma'],
            -1, 1, nbins=100000, bin_type='linear')
        self.qdq = QDQ(self.levels)
        self.error = state['error']
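# A minimal end-to-end sketch for QuantizeMultiBucket (assumptions: CUDA is
# available, the QDQ extension is built, and the keyword arguments below are
# exactly the ones __init__ reads; the values are illustrative, not tuned):
def _example_multi_bucket():
    quantizer = QuantizeMultiBucket(
        method='q', bits=3, bucket_size=1024, multiplier=0.5,
        interval=1.0, cd_epochs=30, path='/tmp/nuq_stats',
        amq_lr=0.1, amq_epochs=50, symmetric=True, inv=False)
    grad = torch.randn(5000).cuda()
    # ig_sm_bkts=False quantizes every bucket, including the padded tail.
    q_grad = quantizer.quantize(grad, ig_sm_bkts=False)
    assert q_grad.shape == grad.shape
    return q_grad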