def load_dataset(self):
    print('Loading dataset from {}'.format(self.dataFolder))
    startTime = time.time()

    # Load train and test data.
    self.trainSet = torch.load(self.processedFilesPath[1])
    self.testSet = torch.load(self.processedFilesPath[2])

    # Then load the fields.
    fields = onmt.IO.ONMTDataset.load_fields(
        torch.load(self.processedFilesPath[0]))
    self.fields = dict([(k, f) for (k, f) in fields.items()
                        if k in self.trainSet.examples[0].__dict__])
    self.trainSet.fields = self.fields
    self.testSet.fields = self.fields

    print(' * number of training sentences: %d' % len(self.trainSet))
    print('Dataset loaded in {}'.format(mhf.timeSince(startTime)))
def optimize_quantization_points(modelToQuantize, train_loader, test_loader, initial_learning_rate=1e-5,
                                 initial_momentum=0.9, epochs_to_train=30, print_every=500, use_nesterov=True,
                                 learning_rate_style='generic', numPointsPerTensor=16,
                                 assignBitsAutomatically=False, bucket_size=None,
                                 use_distillation_loss=True, initialize_method='quantiles',
                                 quantize_first_and_last_layer=True):

    print('Preparing training - pre processing tensors')

    numTensorsNetwork = sum(1 for _ in modelToQuantize.parameters())
    initialize_method = initialize_method.lower()
    if initialize_method not in ('quantiles', 'uniform'):
        raise ValueError('The initialization method must be either quantiles or uniform')

    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork

    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError('numPointsPerTensor must be equal to the number of tensors in the network')

    if quantize_first_and_last_layer is False:
        numPointsPerTensor = numPointsPerTensor[1:-1]

    # Same scaling function that is used inside nonUniformQuantization. It is important they are the same.
    scalingFunction = quantization.ScalingFunction('linear', False, False, bucket_size, False)

    # If assigning bits automatically, use the 2-norm of the gradient to determine the importance of each weight.
    if assignBitsAutomatically:
        num_to_estimate_grad = 5
        modelToQuantize.zero_grad()
        for idx_minibatch, batch in enumerate(train_loader, start=1):
            cnn_hf.forward_and_backward(modelToQuantize, batch, idx_batch=idx_minibatch, epoch=0,
                                        use_distillation_loss=False)
            if idx_minibatch >= num_to_estimate_grad:
                break

        # Now we compute the 2-norm of the gradient for each parameter.
        fisherInformation = []
        for idx, p in enumerate(modelToQuantize.parameters()):
            if quantize_first_and_last_layer is False:
                if idx == 0 or idx == numTensorsNetwork - 1:
                    continue
            fisherInformation.append((p.grad.data / num_to_estimate_grad).norm())

        # Zero the grad we computed.
        modelToQuantize.zero_grad()

        # Now we use a simple linear proportion to assign bits;
        # the minimum number of points is half what was given as input.
        numPointsPerTensor = quantization.help_functions.assign_bits_automatically(fisherInformation,
                                                                                   numPointsPerTensor,
                                                                                   input_is_point=True)

    # Initialize the points using the percentile function so as to make them all usable.
    pointsPerTensor = []
    if initialize_method == 'quantiles':
        for idx, p in enumerate(modelToQuantize.parameters()):
            if quantize_first_and_last_layer is True:
                currPointsPerTensor = numPointsPerTensor[idx]
            else:
                if idx == 0 or idx == numTensorsNetwork - 1:
                    continue
                currPointsPerTensor = numPointsPerTensor[idx - 1]
            initial_points = quantization.help_functions.initialize_quantization_points(p.data,
                                                                                        scalingFunction,
                                                                                        currPointsPerTensor)
            initial_points = Variable(initial_points, requires_grad=True)
            # Do a dummy backprop so that the grad attribute is initialized. We need this because we call
            # the .backward() function manually later on (since pytorch can't assign variables to model
            # parameters).
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)
    elif initialize_method == 'uniform':
        for numPoint in numPointsPerTensor:
            initial_points = torch.FloatTensor([x / (numPoint - 1) for x in range(numPoint)])
            if USE_CUDA:
                initial_points = initial_points.cuda()
            initial_points = Variable(initial_points, requires_grad=True)
            # Do a dummy backprop so that the grad attribute is initialized. We need this because we call
            # the .backward() function manually later on (since pytorch can't assign variables to model
            # parameters).
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)
    else:
        raise ValueError

    # Deal with 0 momentum.
    options_optimizer = {}
    if initial_momentum != 0:
        options_optimizer = {'momentum': initial_momentum, 'nesterov': use_nesterov}
    optimizer = optim.SGD(pointsPerTensor, lr=initial_learning_rate, **options_optimizer)

    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate, learning_rate_style)
    startTime = time.time()

    pred_accuracy_epochs = []
    losses_epochs = []
    last_loss_saved = float('inf')
    number_minibatches_per_epoch = len(train_loader)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)
    epoch = 0

    quantizationFunctions = []
    for idx, p in enumerate(quantizedModel.parameters()):
        if quantize_first_and_last_layer is False:
            if idx == 0 or idx == numTensorsNetwork - 1:
                continue
        # Efficient version of nonUniformQuantization.
        quant_fun = quantization.nonUniformQuantization_variable(max_element=False, subtract_mean=False,
                                                                 modify_in_place=False, bucket_size=bucket_size,
                                                                 pre_process_tensors=True, tensor=p.data)
        quantizationFunctions.append(quant_fun)

    print('Pre processing done, training started')

    for epoch in range(epochs_to_train):
        quantizedModel.train()
        print_loss_total = 0
        for idx_minibatch, data in enumerate(train_loader, start=1):

            # Zero the gradients of the model parameters and of the quantization points.
            quantizedModel.zero_grad()
            optimizer.zero_grad()

            # Quantize the model parameters.
            for idx, p_quantized in enumerate(quantizedModel.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == numTensorsNetwork - 1:
                        continue
                    currIdx = idx - 1
                else:
                    currIdx = idx
                # Efficient quantization.
                p_quantized.data = quantizationFunctions[currIdx].forward(None, pointsPerTensor[currIdx].data)

            print_loss = cnn_hf.forward_and_backward(quantizedModel, data, idx_minibatch, epoch,
                                                     use_distillation_loss=use_distillation_loss,
                                                     teacher_model=modelToQuantize)

            # Now get the gradient of the pointsPerTensor.
            for idx, p in enumerate(quantizedModel.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == numTensorsNetwork - 1:
                        continue
                    currIdx = idx - 1
                else:
                    currIdx = idx
                pointsPerTensor[currIdx].grad.data = quantizationFunctions[currIdx].backward(p.grad.data)[1]

            optimizer.step()

            # After optimizer.step() we need to make sure that the points are still sorted (implementation detail).
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

            # Print statistics.
            print_loss_total += print_loss
            if idx_minibatch % print_every == 0:
                last_loss_saved = print_loss_total / print_every
                str_to_print = 'Time Elapsed: {}, [Epoch: {}, Minibatch: {}], loss: {:3f}'.format(
                    mhf.timeSince(startTime), epoch + 1, idx_minibatch, last_loss_saved)
                if pred_accuracy_epochs:
                    str_to_print += '. Last prediction accuracy: {:2f}%'.format(pred_accuracy_epochs[-1] * 100)
                print(str_to_print)
                print_loss_total = 0

        losses_epochs.append(last_loss_saved)
        curr_pred_accuracy = evaluateModel(quantizedModel, test_loader, fastEvaluation=False)
        pred_accuracy_epochs.append(curr_pred_accuracy)
        print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(epoch + 1, curr_pred_accuracy * 100))

        # Update the learning rate.
        new_learning_rate, stop_training = lr_scheduler.update_learning_rate(epoch, 1 - curr_pred_accuracy)
        if stop_training is True:
            break
        for p in optimizer.param_groups:
            try:
                p['lr'] = new_learning_rate
            except:
                pass

    print('Finished Training in {} epochs'.format(epoch + 1))
    informationDict = {'predictionAccuracy': pred_accuracy_epochs,
                       'numEpochsTrained': epoch + 1,
                       'lossSaved': losses_epochs}

    # IMPORTANT: when there are batch normalization layers, important information is also contained
    # in the running mean and running var values of those layers. Since these are not parameters,
    # they don't show up in the model.parameters() list (and they don't have quantization points
    # associated with them). So if I returned just the optimized quantization points and quantized the
    # model weights with them, I would get inferior performance because the running mean and var of the
    # batch normalization layers would not be saved. To solve this issue I also return the quantized
    # model state dict, which contains not only the parameters of the model but also these statistics
    # for the batch normalization layers.
    return quantizedModel.state_dict(), pointsPerTensor, informationDict
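
# The sketch below is illustrative only and not part of the original code: it shows one plausible way to
# consume the values returned by optimize_quantization_points, assuming `model`, `train_loader` and
# `test_loader` are already defined elsewhere. The helper name `example_differentiable_quantization`
# and the hyperparameter values used here (16 points per tensor, bucket size 256) are hypothetical.
def example_differentiable_quantization(model, train_loader, test_loader):
    # Optimize the per-tensor quantization points with distillation loss enabled.
    quantized_state_dict, points_per_tensor, info = optimize_quantization_points(
        model, train_loader, test_loader,
        numPointsPerTensor=2 ** 4,        # 16 points per tensor, i.e. roughly 4 bits
        assignBitsAutomatically=True,     # redistribute points according to gradient norms
        bucket_size=256,                  # hypothetical bucket size
        use_distillation_loss=True,
        initialize_method='quantiles')

    # Loading the returned state dict (rather than re-quantizing the weights from the points alone)
    # also restores the batch-norm running statistics, as explained in the comment above.
    model.load_state_dict(quantized_state_dict)
    return model, points_per_tensor, info
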
def train_model(model, train_loader, test_loader, initial_learning_rate=0.001, use_nesterov=True,
                initial_momentum=0.9, weight_decayL2=0.00022, epochs_to_train=100, print_every=500,
                learning_rate_style='generic', use_distillation_loss=False, teacher_model=None,
                quantizeWeights=False, numBits=8, grad_clipping_threshold=False, start_epoch=0,
                bucket_size=None, quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none', estimate_quant_grad_every=1, add_gradient_noise=False,
                ask_teacher_strategy=('always', None), quantize_first_and_last_layer=True,
                mix_with_differentiable_quantization=False):

    # backprop_quantization_style determines how to modify the gradients to take into account the
    # quantization function. Specifically, one can use 'none', where gradients are not modified,
    # 'truncated', where gradient values outside -1 and 1 are truncated to 0 (as per the paper
    # referenced in the comments below), and 'complicated', which is the temporary name for my idea,
    # which is slow and complicated to compute.

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError('To compute distillation loss you have to pass the teacher model')

    if teacher_model is not None:
        teacher_model.eval()

    learning_rate_style = learning_rate_style.lower()
    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate, learning_rate_style)
    new_learning_rate = initial_learning_rate
    optimizer = optim.SGD(model.parameters(), lr=initial_learning_rate, nesterov=use_nesterov,
                          momentum=initial_momentum, weight_decay=weight_decayL2)
    startTime = time.time()

    pred_accuracy_epochs = []
    percentages_asked_teacher = []
    losses_epochs = []
    informationDict = {}
    last_loss_saved = float('inf')
    step_since_last_grad_quant_estimation = 1
    number_minibatches_per_epoch = len(train_loader)

    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if backprop_quantization_style is None:
            backprop_quantization_style = 'none'
        backprop_quantization_style = backprop_quantization_style.lower()
        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2 ** (numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2 ** numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError('The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in ('none', 'truncated'):
            def quantizeFunctions(x):
                return quantization.uniformQuantization(x, s, type_of_scaling=type_of_scaling,
                                                        stochastic_rounding=False, max_element=False,
                                                        subtract_mean=False, modify_in_place=False,
                                                        bucket_size=bucket_size)[0]
        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [quantization.uniformQuantization_variable(s, type_of_scaling=type_of_scaling,
                                                                           stochastic_rounding=False,
                                                                           max_element=False, subtract_mean=False,
                                                                           modify_in_place=False,
                                                                           bucket_size=bucket_size)
                                 for _ in model.parameters()]
        else:
            raise ValueError('The specified backprop_quantization_style is not recognized')

        num_parameters = sum(1 for _ in model.parameters())

        def quantize_weights_model(model):
            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  # don't quantize first and last layer
                if backprop_quantization_style == 'truncated':
                    p.data.clamp_(-1, 1)
                if backprop_quantization_style in ('none', 'truncated'):
                    p.data = quantizeFunctions(p.data)
                elif backprop_quantization_style == 'complicated':
                    p.data = quantizeFunctions[idx].forward(p.data)
                else:
                    raise ValueError

        def backward_quant_weights_model(model):
            if backprop_quantization_style == 'none':
                return

            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  # don't quantize first and last layer

                # Now some sort of backward. For the 'none' style we don't do anything.
                # For the 'truncated' style we just need to truncate the grad weights,
                # as per the paper here: https://arxiv.org/pdf/1609.07061.pdf
                # If we are quantizing, I set to 0 the gradients of weights whose absolute value is above 1.
                # Their case is not immediately applicable to ours, but let's try this out.
                if backprop_quantization_style == 'truncated':
                    p.grad.data[p.data.abs() > 1] = 0
                elif backprop_quantization_style == 'complicated':
                    p.grad.data = quantizeFunctions[idx].backward(p.grad.data)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2

    try:
        epoch = start_epoch
        for epoch in range(start_epoch, epochs_to_train + start_epoch):
            print("begin training")
            if USE_CUDA:
                print("USE_CUDA")

            if mix_with_differentiable_quantization:
                print('=== Starting Quantized Distillation epoch === ')

            model.train()
            print_loss_total = 0
            count_asked_teacher = 0
            count_asked_total = 0
            for idx_minibatch, data in enumerate(train_loader, start=1):

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        # We save the weights because we only want to quantize them to compute gradients,
                        # but keep using non-quantized weights during the algorithm.
                        model_state_dict = model.state_dict()
                        quantize_weights_model(model)

                model.zero_grad()
                print_loss, curr_c_teach, curr_c_total = forward_and_backward(
                    model, data, idx_minibatch, epoch,
                    use_distillation_loss=use_distillation_loss,
                    teacher_model=teacher_model,
                    ask_teacher_strategy=ask_teacher_strategy,
                    return_more_info=True)
                count_asked_teacher += curr_c_teach
                count_asked_total += curr_c_total

                # Load the non-quantized weights and use them for the update. The quantized
                # weights are used only to get the quantized gradient.
                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        model.load_state_dict(model_state_dict)
                        del model_state_dict  # free memory

                if add_gradient_noise and not quantizeWeights:
                    cnn_hf.add_gradient_noise(model, idx_minibatch, epoch, number_minibatches_per_epoch)

                if grad_clipping_threshold is not False:
                    # Gradient clipping.
                    for p in model.parameters():
                        p.grad.data.clamp_(-grad_clipping_threshold, grad_clipping_threshold)

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        backward_quant_weights_model(model)

                optimizer.step()

                if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                    step_since_last_grad_quant_estimation = 0
                step_since_last_grad_quant_estimation += 1

                # Print statistics.
                print_loss_total += print_loss
                if idx_minibatch % print_every == 0:
                    last_loss_saved = print_loss_total / print_every
                    str_to_print = 'Time Elapsed: {}, [Start Epoch: {}, Epoch: {}, Minibatch: {}], loss: {:3f}'.format(
                        mhf.timeSince(startTime), start_epoch + 1, epoch + 1, idx_minibatch, last_loss_saved)
                    if pred_accuracy_epochs:
                        str_to_print += ' Last prediction accuracy: {:2f}%'.format(pred_accuracy_epochs[-1] * 100)
                    print(str_to_print)
                    print_loss_total = 0

            curr_percentages_asked_teacher = count_asked_teacher / count_asked_total if count_asked_total != 0 else 0
            percentages_asked_teacher.append(curr_percentages_asked_teacher)
            losses_epochs.append(last_loss_saved)
            curr_pred_accuracy = evaluateModel(model, test_loader, fastEvaluation=False)
            pred_accuracy_epochs.append(curr_pred_accuracy)
            print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(epoch + 1, curr_pred_accuracy * 100))

            if mix_with_differentiable_quantization and epoch != start_epoch + epochs_to_train - 1:
                print('=== Starting Differentiable Quantization epoch === ')
                # The differentiable quantization step is not done at the last epoch,
                # so we end on a quantized distillation epoch.
                model_state_dict = optimize_quantization_points(
                    model, train_loader, test_loader, new_learning_rate,
                    initial_momentum=initial_momentum, epochs_to_train=1, print_every=print_every,
                    use_nesterov=use_nesterov, learning_rate_style=learning_rate_style,
                    numPointsPerTensor=2 ** numBits, assignBitsAutomatically=True,
                    bucket_size=bucket_size, use_distillation_loss=True, initialize_method='quantiles',
                    quantize_first_and_last_layer=quantize_first_and_last_layer)[0]
                model.load_state_dict(model_state_dict)
                del model_state_dict  # free memory
                losses_epochs.append(last_loss_saved)
                curr_pred_accuracy = evaluateModel(model, test_loader, fastEvaluation=False)
                pred_accuracy_epochs.append(curr_pred_accuracy)
                print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(epoch + 1,
                                                                                curr_pred_accuracy * 100))

            # Update the learning rate.
            new_learning_rate, stop_training = lr_scheduler.update_learning_rate(epoch, 1 - curr_pred_accuracy)
            if stop_training is True:
                break
            for p in optimizer.param_groups:
                try:
                    p['lr'] = new_learning_rate
                except:
                    pass

    except Exception as e:
        print('An exception occurred: {}\nTraining has been stopped after {} epochs.'.format(e, epoch))
        informationDict['errorFlag'] = True
        informationDict['numEpochsTrained'] = epoch - start_epoch
        return model, informationDict
    except KeyboardInterrupt:
        print('User stopped training after {} epochs'.format(epoch))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch - start_epoch
    else:
        print('Finished Training in {} epochs'.format(epoch + 1))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch + 1 - start_epoch

    if quantizeWeights:
        quantize_weights_model(model)

    if mix_with_differentiable_quantization:
        informationDict['numEpochsTrained'] *= 2

    informationDict['percentages_asked_teacher'] = percentages_asked_teacher
    informationDict['predictionAccuracy'] = pred_accuracy_epochs
    informationDict['lossSaved'] = losses_epochs
    return model, informationDict
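
# Illustrative sketch only, not part of the original code: a typical quantized-distillation call,
# assuming `student`, `teacher`, `train_loader` and `test_loader` are already defined elsewhere.
# The helper name `example_quantized_distillation` and the hyperparameter values are hypothetical.
def example_quantized_distillation(student, teacher, train_loader, test_loader):
    # Train the student with the distillation loss while quantizing its weights to 4 bits; the
    # non-quantized weights are kept for the optimizer update, as done inside train_model.
    trained_student, info = train_model(
        student, train_loader, test_loader,
        epochs_to_train=50,
        use_distillation_loss=True,
        teacher_model=teacher,
        quantizeWeights=True,
        numBits=4,
        bucket_size=256,                  # hypothetical bucket size
        quantizationFunctionToUse='uniformLinearScaling',
        backprop_quantization_style='truncated')
    print('Final accuracy: {:2f}%'.format(info['predictionAccuracy'][-1] * 100))
    return trained_student, info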