def check_right_bits(tensor_iterator, num_quant_points, bucket_size):
    """Check that no tensor holds more distinct values than its point budget.

    Every tensor is scaled down with a linear ``quantization.ScalingFunction``,
    rounded to 5 decimals, and its distinct values are counted. A tensor fails
    the check when that count exceeds its number of quantization points by
    more than 3.

    Args:
        tensor_iterator: iterable of tensors; an element exposing a ``data``
            attribute is replaced by that attribute before being inspected.
        num_quant_points: number of quantization points per tensor. An int is
            used for every tensor; anything else is indexed by the position of
            the tensor in ``tensor_iterator``.
        bucket_size: forwarded to ``quantization.ScalingFunction``.

    Returns:
        ``False`` as soon as one tensor fails the check, ``True`` otherwise.
    """
    same_budget_for_all = isinstance(num_quant_points, int)
    scaler = quantization.ScalingFunction('linear', False, False,
                                          bucket_size=bucket_size)

    for position, current in enumerate(tensor_iterator):
        if hasattr(current, 'data'):
            current = current.data

        # TODO: This does not work, as you're supposed to use the original
        # scaling factors, not the ones you find in the quantized tensor; for
        # example, in the latter there will always be 1, which is not always
        # correct.
        current = scaler.scale_down(current)

        rounded_values = current.view(-1).cpu().numpy().round(decimals=5)
        distinct_count = len(np.unique(rounded_values))

        budget = (num_quant_points if same_budget_for_all
                  else num_quant_points[position])
        if distinct_count > budget + 3:
            return False

    return True
def preprocess(self, tensor):
    """Scale ``tensor`` down and cache what later quantization calls need.

    The tensor is cloned first unless ``self.modifyInPlace`` is set, then
    scaled down by a linear ``quantization.ScalingFunction`` built from
    ``self.maxElementAllowed``, ``self.subtractMean`` and ``self.bucket_size``.

    Sets on ``self``:
        search_sorted_obj: ``SearchSorted`` built from a copy of the flattened,
            scaled tensor converted to a numpy array.
        tensors_info: tuple ``(tensor.type(), tensor.is_cuda)`` of the scaled
            tensor.
        scaling_function: the scaling function that was used.
    """
    working = tensor if self.modifyInPlace else tensor.clone()

    scaler = quantization.ScalingFunction(type_scaling='linear',
                                          max_element=self.maxElementAllowed,
                                          subtract_mean=self.subtractMean,
                                          bucket_size=self.bucket_size,
                                          modify_in_place=True)
    working = scaler.scale_down(working)

    type_name = working.type()
    on_gpu = working.is_cuda

    flattened = working.view(-1)
    if on_gpu:
        # numpy conversion is only requested on a CPU tensor
        flattened = flattened.cpu()
    flat_array = flattened.numpy()

    self.search_sorted_obj = SearchSorted(flat_array.copy())
    self.tensors_info = (type_name, on_gpu)
    self.scaling_function = scaler
def _quantized_parameters(model, num_tensors, skip_first_and_last):
    """Yield ``(points_index, parameter)`` for each parameter that gets quantized."""
    for idx, param in enumerate(model.parameters()):
        if skip_first_and_last:
            if idx == 0 or idx == num_tensors - 1:
                continue
            # the per-tensor lists do not contain the first tensor, hence the shift
            yield idx - 1, param
        else:
            yield idx, param


def _trainable_points(initial_points):
    """Wrap ``initial_points`` in a Variable whose ``grad`` attribute is initialized."""
    points = Variable(initial_points, requires_grad=True)
    # do a dummy backprop so that the grad attribute is initialized. We need this because we
    # assign the gradient manually later on (since pytorch can't assign variables to model
    # parameters)
    points.sum().backward()
    return points


def optimize_quantization_points(modelToQuantize, train_loader, test_loader, initial_learning_rate=1e-5,
                                 initial_momentum=0.9, epochs_to_train=30, print_every=500,
                                 use_nesterov=True, learning_rate_style='generic', numPointsPerTensor=16,
                                 assignBitsAutomatically=False, bucket_size=None,
                                 use_distillation_loss=True, initialize_method='quantiles',
                                 quantize_first_and_last_layer=True):
    """Learn the quantization points of a model with SGD.

    A deep copy of ``modelToQuantize`` has its parameters overwritten at every
    minibatch with the output of the per-tensor quantization functions
    evaluated at the current points. After the forward/backward pass, the
    second element returned by each quantization function's ``backward`` is
    stored as the gradient of the corresponding points, an SGD step is taken on
    the points, and the points are re-sorted.

    Args:
        modelToQuantize: model to quantize. It is put in eval mode, deep-copied,
            and passed as ``teacher_model`` to ``cnn_hf.forward_and_backward``.
        train_loader: iterable of minibatches; must support ``len()``.
        test_loader: passed to ``evaluateModel`` after every epoch.
        initial_learning_rate: learning rate of the SGD optimizer and initial
            value given to ``cnn_hf.LearningRateScheduler``.
        initial_momentum: SGD momentum. When it is 0, neither ``momentum`` nor
            ``nesterov`` is passed to the optimizer.
        epochs_to_train: maximum number of epochs.
        print_every: number of minibatches between two progress messages. If it
            is larger than the number of minibatches per epoch, half that number
            (but at least 1) is used instead.
        use_nesterov: passed as ``nesterov`` to SGD when momentum is non-zero.
        learning_rate_style: passed to ``cnn_hf.LearningRateScheduler``.
        numPointsPerTensor: int, or a list with one entry per entry of
            ``modelToQuantize.parameters()``.
        assignBitsAutomatically: if true, the points per tensor are recomputed
            by ``quantization.help_functions.assign_bits_automatically`` from
            the norms of gradients accumulated over up to 5 minibatches.
        bucket_size: forwarded to ``quantization.ScalingFunction`` and
            ``quantization.nonUniformQuantization_variable``.
        use_distillation_loss: forwarded to ``cnn_hf.forward_and_backward``
            during training.
        initialize_method: ``'quantiles'`` or ``'uniform'`` (case-insensitive).
        quantize_first_and_last_layer: when falsy, the first and the last entry
            of ``parameters()`` are left unquantized.

    Returns:
        Tuple ``(state_dict, pointsPerTensor, informationDict)``: the state dict
        of the quantized copy, the list of learned points, and a dict with keys
        ``'predictionAccuracy'``, ``'numEpochsTrained'`` and ``'lossSaved'``.

    Raises:
        ValueError: if ``initialize_method`` is not supported or the length of
            ``numPointsPerTensor`` differs from the number of parameter tensors.
    """
    print('Preparing training - pre processing tensors')

    numTensorsNetwork = sum(1 for _ in modelToQuantize.parameters())
    initialize_method = initialize_method.lower()
    if initialize_method not in ('quantiles', 'uniform'):
        raise ValueError(
            'The initialization method must be either quantiles or uniform')

    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork

    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError(
            'numPointsPerTensor must be equal to the number of tensor in the network')

    # A single flag drives every "skip first/last" decision below so that the
    # per-tensor lists and the parameter indices always stay aligned.
    skip_first_and_last = not quantize_first_and_last_layer
    if skip_first_and_last:
        numPointsPerTensor = numPointsPerTensor[1:-1]

    # same scaling function that is used inside nonUniformQUantization. It is important they are the same
    scalingFunction = quantization.ScalingFunction(
        'linear', False, False, bucket_size, False)

    # if assigning bits automatically, use the 2-norm of the gradient to determine weights importance
    if assignBitsAutomatically:
        num_to_estimate_grad = 5
        modelToQuantize.zero_grad()
        for idx_minibatch, batch in enumerate(train_loader, start=1):
            cnn_hf.forward_and_backward(modelToQuantize, batch, idx_batch=idx_minibatch, epoch=0,
                                        use_distillation_loss=False)
            if idx_minibatch >= num_to_estimate_grad:
                break

        # now we compute the 2-norm of the gradient for each parameter
        fisherInformation = [
            (p.grad.data / num_to_estimate_grad).norm()
            for _, p in _quantized_parameters(modelToQuantize, numTensorsNetwork, skip_first_and_last)
        ]

        # zero the grad we computed
        modelToQuantize.zero_grad()

        # now we use a simple linear proportion to assign bits
        # the minimum number of points is half what was given as input
        numPointsPerTensor = quantization.help_functions.assign_bits_automatically(
            fisherInformation, numPointsPerTensor, input_is_point=True)

    # initialize the points using the percentile function so as to make them all usable
    pointsPerTensor = []
    if initialize_method == 'quantiles':
        for currIdx, p in _quantized_parameters(modelToQuantize, numTensorsNetwork, skip_first_and_last):
            initial_points = quantization.help_functions.initialize_quantization_points(
                p.data, scalingFunction, numPointsPerTensor[currIdx])
            pointsPerTensor.append(_trainable_points(initial_points))
    else:
        # 'uniform' (validated above): points evenly spaced in [0, 1]
        for numPoint in numPointsPerTensor:
            # a single point would otherwise divide by zero
            denominator = max(numPoint - 1, 1)
            initial_points = torch.FloatTensor(
                [x / denominator for x in range(numPoint)])
            if USE_CUDA:
                initial_points = initial_points.cuda()
            pointsPerTensor.append(_trainable_points(initial_points))

    # dealing with 0 momentum: nesterov is only passed together with a non-zero momentum
    options_optimizer = {}
    if initial_momentum != 0:
        options_optimizer = {
            'momentum': initial_momentum, 'nesterov': use_nesterov}
    optimizer = optim.SGD(
        pointsPerTensor, lr=initial_learning_rate, **options_optimizer)

    lr_scheduler = cnn_hf.LearningRateScheduler(
        initial_learning_rate, learning_rate_style)
    startTime = time.time()

    pred_accuracy_epochs = []
    losses_epochs = []
    last_loss_saved = float('inf')

    number_minibatches_per_epoch = len(train_loader)
    if print_every > number_minibatches_per_epoch:
        # never let it reach 0: it is used as a modulus in the training loop
        print_every = max(1, number_minibatches_per_epoch // 2)

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)
    num_epochs_trained = 0

    # efficient version of nonUniformQuantization: one function per quantized tensor
    quantizationFunctions = [
        quantization.nonUniformQuantization_variable(max_element=False, subtract_mean=False,
                                                     modify_in_place=False, bucket_size=bucket_size,
                                                     pre_process_tensors=True, tensor=p.data)
        for _, p in _quantized_parameters(quantizedModel, numTensorsNetwork, skip_first_and_last)
    ]

    print('Pre processing done, training started')

    for epoch in range(epochs_to_train):
        num_epochs_trained = epoch + 1
        quantizedModel.train()
        print_loss_total = 0
        for idx_minibatch, data in enumerate(train_loader, start=1):
            # zero the gradient of the parameters model
            quantizedModel.zero_grad()
            optimizer.zero_grad()

            # quantize the model parameters
            for currIdx, p_quantized in _quantized_parameters(quantizedModel, numTensorsNetwork,
                                                              skip_first_and_last):
                # efficient quantization
                p_quantized.data = quantizationFunctions[currIdx].forward(
                    None, pointsPerTensor[currIdx].data)

            print_loss = cnn_hf.forward_and_backward(quantizedModel, data, idx_minibatch, epoch,
                                                     use_distillation_loss=use_distillation_loss,
                                                     teacher_model=modelToQuantize)

            # now get the gradient of the pointsPerTensor
            for currIdx, p in _quantized_parameters(quantizedModel, numTensorsNetwork,
                                                    skip_first_and_last):
                pointsPerTensor[currIdx].grad.data = quantizationFunctions[currIdx].backward(
                    p.grad.data)[1]

            optimizer.step()

            # after optimizer.step() we need to make sure that the points are still sorted.
            # Implementation detail
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

            # print statistics
            print_loss_total += print_loss
            if idx_minibatch % print_every == 0:
                last_loss_saved = print_loss_total / print_every
                str_to_print = 'Time Elapsed: {}, [Epoch: {}, Minibatch: {}], loss: {:3f}'.format(
                    mhf.timeSince(startTime), epoch + 1, idx_minibatch, last_loss_saved)
                if pred_accuracy_epochs:
                    str_to_print += '. Last prediction accuracy: {:2f}%'.format(
                        pred_accuracy_epochs[-1] * 100)
                print(str_to_print)
                print_loss_total = 0

        losses_epochs.append(last_loss_saved)
        curr_pred_accuracy = evaluateModel(
            quantizedModel, test_loader, fastEvaluation=False)
        pred_accuracy_epochs.append(curr_pred_accuracy)
        print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(epoch + 1,
                                                                          curr_pred_accuracy * 100))

        # updating the learning rate
        new_learning_rate, stop_training = lr_scheduler.update_learning_rate(
            epoch, 1 - curr_pred_accuracy)
        if stop_training is True:
            break
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_learning_rate

    print('Finished Training in {} epochs'.format(num_epochs_trained))
    informationDict = {'predictionAccuracy': pred_accuracy_epochs,
                       'numEpochsTrained': num_epochs_trained,
                       'lossSaved': losses_epochs}

    # IMPORTANT: When there are batch normalization layers, important information is contained
    # also in the running mean and running var values of the batch normalization layers. Since these
    # are not parameters, they don't show up in the model.parameters() list (and they don't have
    # quantization points associated with them). So if I return just the optimized quantization
    # points, and quantize the model weights with them, I will have inferior performance because the
    # running mean and var of the batch normalization layers won't be saved. To solve this issue I
    # also return the quantized model state dict, that contains not only the parameters of the model
    # but also these statistics for the batch normalization layers
    return quantizedModel.state_dict(), pointsPerTensor, informationDict
def optimize_quantization_points(modelToQuantize, train_loader, test_loader, options, optim=None,
                                 numPointsPerTensor=16, assignBitsAutomatically=False,
                                 use_distillation_loss=False, bucket_size=None):
    """Learn the quantization points of a translation model.

    A deep copy of ``modelToQuantize`` has its parameters overwritten at every
    batch with the output of the per-tensor quantization functions evaluated at
    the current points. After ``trainer.forward_and_backward``, the second
    element returned by each quantization function's ``backward`` is stored as
    the gradient of the corresponding points, the optimizer built on the points
    takes a step, and the points are re-sorted.

    Args:
        modelToQuantize: model to quantize. It is put in eval mode, deep-copied,
            and passed as last argument to ``trainer.forward_and_backward``.
        train_loader: iterable of batches; ``train_loader.dataset`` must expose
            ``fields``.
        test_loader: loader whose ``dataset`` is used for the validation loss.
        options: options object or dict; ``None`` selects
            ``onmt.standard_options.stdOptions``. It is normalised through
            ``handle_options`` and converted to a named tuple.
        optim: not used by this function.
        numPointsPerTensor: int, or a list with one entry per parameter tensor.
        assignBitsAutomatically: if true, the points per tensor are recomputed
            by ``qhf.assign_bits_automatically`` from the gradient norms of one
            batch.
        use_distillation_loss: forwarded to ``trainer.forward_and_backward``.
        bucket_size: forwarded to ``quantization.ScalingFunction`` and
            ``quantization.nonUniformQuantization_variable``.

    Returns:
        Tuple ``(pointsPerTensor, informationDict)``; the dict has the keys
        ``'perplexity'`` and ``'numEpochsTrained'``.

    Raises:
        ValueError: if the length of ``numPointsPerTensor`` differs from the
            number of parameter tensors.
    """
    print('Preparing training - pre processing tensors')

    if options is None:
        options = onmt.standard_options.stdOptions
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = mhf.convertToNamedTuple(handle_options(options))

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)

    dataset_fields = train_loader.dataset.fields
    train_loss = make_loss_compute(quantizedModel, dataset_fields["tgt"].vocab,
                                   train_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)
    valid_loss = make_loss_compute(quantizedModel, dataset_fields["tgt"].vocab,
                                   test_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)
    truncation_size = options.truncated_decoder  # Badly named...
    shard_size = options.max_generator_batches

    tensor_count = len(list(quantizedModel.parameters()))
    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * tensor_count
    if len(numPointsPerTensor) != tensor_count:
        raise ValueError(
            'numPointsPerTensor must be equal to the number of tensor in the network'
        )

    linear_scaling = quantization.ScalingFunction(type_scaling='linear',
                                                  max_element=False,
                                                  subtract_mean=False,
                                                  modify_in_place=False,
                                                  bucket_size=bucket_size)

    quantizedModel.zero_grad()
    # placeholder optimizer: only needed because the trainer requires one
    placeholder_optim = create_optimizer(quantizedModel, options)

    if assignBitsAutomatically:
        trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader,
                                train_loss, valid_loss, placeholder_optim,
                                truncation_size, shard_size)
        first_batch = next(iter(train_loader))
        quantizedModel.zero_grad()
        trainer.forward_and_backward(0, first_batch, 0, onmt.Statistics(), None)
        gradient_norms = [param.grad.data.norm()
                          for param in quantizedModel.parameters()]
        numPointsPerTensor = qhf.assign_bits_automatically(gradient_norms,
                                                           numPointsPerTensor,
                                                           input_is_point=True)
        quantizedModel.zero_grad()
        del trainer
        del optim

    # initialize the points using the percentile function so as to make them all usable
    learned_points = []
    for position, param in enumerate(quantizedModel.parameters()):
        start_values = qhf.initialize_quantization_points(
            param.data, linear_scaling, numPointsPerTensor[position])
        trainable = Variable(start_values, requires_grad=True)
        # A dummy backprop initializes the grad attribute, which is assigned
        # manually in the training loop (pytorch can't assign variables to
        # model parameters).
        trainable.sum().backward()
        learned_points.append(trainable)

    options_copy = copy.deepcopy(mhf.convertToDictionary(options))
    optimizer = create_optimizer(learned_points,
                                 mhf.convertToNamedTuple(options_copy))
    trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader,
                            train_loss, valid_loss, placeholder_optim,
                            truncation_size, shard_size)
    validation_perplexities = []

    # Efficient version of nonUniformQuantization: the tensors (that don't
    # change across iterations) are saved inside the quantization functions,
    # so only the quantization points need to be passed later on.
    quantizers = [
        quantization.nonUniformQuantization_variable(
            max_element=False,
            subtract_mean=False,
            modify_in_place=False,
            bucket_size=bucket_size,
            pre_process_tensors=True,
            tensor=param.data)
        for param in modelToQuantize.parameters()
    ]

    print('Pre processing done, training started')

    for epoch in range(options.start_epoch, options.epochs + 1):
        train_stats = onmt.Statistics()
        quantizedModel.train()

        for idx_batch, batch in enumerate(train_loader):
            # zero the gradient
            quantizedModel.zero_grad()

            # quantize the weights with the current points
            for position, quantized_param in enumerate(quantizedModel.parameters()):
                quantized_param.data = quantizers[position].forward(
                    None, learned_points[position].data)

            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats,
                                         report_func, use_distillation_loss,
                                         modelToQuantize)

            # now get the gradient of the points
            for position, quantized_param in enumerate(quantizedModel.parameters()):
                points_gradient = quantizers[position].backward(
                    quantized_param.grad.data)[1]
                learned_points[position].grad.data = points_gradient

            optimizer.step()

            # after optimizer.step() we need to make sure that the points are still sorted
            for point_set in learned_points:
                point_set.data = torch.sort(point_set.data)[0]

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())

        # 2. Validate on the validation set.
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        validation_perplexities.append(valid_stats.ppl())

        # 3. Update the learning rate
        optimizer.updateLearningRate(valid_stats.ppl(), epoch)

    informationDict = {
        'perplexity': validation_perplexities,
        'numEpochsTrained': options.epochs + 1 - options.start_epoch,
    }
    return learned_points, informationDict