Example #1
def check_right_bits(tensor_iterator, num_quant_points, bucket_size):
    '''num_quant_points is the number of quantization points per tensor. If it is an int, the same
    value is assumed for all tensors.'''

    if isinstance(num_quant_points, int):
        is_int_quant_points = True
    else:
        is_int_quant_points = False

    scaling_function = quantization.ScalingFunction('linear',
                                                    False,
                                                    False,
                                                    bucket_size=bucket_size)
    for idx_tensor, tensor in enumerate(tensor_iterator):
        if hasattr(tensor, 'data'):
            tensor = tensor.data
        #TODO: this does not work as intended: we should use the original scaling factors, not the
        #ones recovered from the already-quantized tensor; for example, the latter will always
        #contain 1, which is not always correct
        tensor = scaling_function.scale_down(tensor)
        distinct_elements = np.unique(
            tensor.view(-1).cpu().numpy().round(decimals=5))
        num_distinct_elements = len(distinct_elements)
        if is_int_quant_points:
            curr_num_quant_points = num_quant_points
        else:
            curr_num_quant_points = num_quant_points[idx_tensor]

        if num_distinct_elements > curr_num_quant_points + 3:
            return False

    return True
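For context, a minimal usage sketch of check_right_bits (the model name, point budget and bucket size below are hypothetical, not taken from the original project):

# Hypothetical usage: verify that every parameter of an already-quantized model
# uses at most 16 distinct quantization points per tensor, with a bucket size of 256.
ok = check_right_bits(quantized_model.parameters(),
                      num_quant_points=16,
                      bucket_size=256)
print('quantization looks consistent' if ok else 'too many distinct values found')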
    def preprocess(self, tensor):
        if not self.modifyInPlace:
            tensor = tensor.clone()
        scaling_function = quantization.ScalingFunction(type_scaling='linear', max_element=self.maxElementAllowed,
                subtract_mean=self.subtractMean, bucket_size=self.bucket_size, modify_in_place=True)
        tensor = scaling_function.scale_down(tensor)
        tensor_type = tensor.type()
        is_tensor_cuda = tensor.is_cuda
        if is_tensor_cuda:
            numpyTensor = tensor.view(-1).cpu().numpy()
        else:
            numpyTensor = tensor.view(-1).numpy()

        self.search_sorted_obj = SearchSorted(numpyTensor.copy())
        self.tensors_info = (tensor_type, is_tensor_cuda)
        self.scaling_function = scaling_function
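The SearchSorted object built in preprocess is what makes repeated quantization cheap: the flattened, scaled tensor is prepared once, and each new set of quantization points can then be matched against it with binary search instead of a full pairwise distance computation. A rough NumPy sketch of such a nearest-point lookup, under the assumption that this is what the helper does internally (the function below is illustrative, not the project's SearchSorted API):

import numpy as np

def nearest_point_indices(values, points):
    # 'points' must be sorted; returns, for every value, the index of its closest point
    right = np.clip(np.searchsorted(points, values), 1, len(points) - 1)
    left = right - 1
    pick_left = (values - points[left]) <= (points[right] - values)
    return np.where(pick_left, left, right)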
def optimize_quantization_points(modelToQuantize, train_loader, test_loader, initial_learning_rate=1e-5,
                                 initial_momentum=0.9, epochs_to_train=30, print_every=500, use_nesterov=True,
                                 learning_rate_style='generic', numPointsPerTensor=16,
                                 assignBitsAutomatically=False, bucket_size=None,
                                 use_distillation_loss=True, initialize_method='quantiles',
                                 quantize_first_and_last_layer=True):

    print('Preparing training - pre processing tensors')

    numTensorsNetwork = sum(1 for _ in modelToQuantize.parameters())
    initialize_method = initialize_method.lower()
    if initialize_method not in ('quantiles', 'uniform'):
        raise ValueError(
            'The initialization method must be either quantiles or uniform')

    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork

    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError(
            'numPointsPerTensor must be equal to the number of tensors in the network')

    if quantize_first_and_last_layer is False:
        numPointsPerTensor = numPointsPerTensor[1:-1]

    # same scaling function that is used inside nonUniformQuantization; it is important that they match
    scalingFunction = quantization.ScalingFunction(
        'linear', False, False, bucket_size, False)

    # if assigning bits automatically, use the 2-norm of the gradient to determine weights importance
    if assignBitsAutomatically:
        num_to_estimate_grad = 5
        modelToQuantize.zero_grad()
        for idx_minibatch, batch in enumerate(train_loader, start=1):
            cnn_hf.forward_and_backward(modelToQuantize, batch, idx_batch=idx_minibatch, epoch=0,
                                        use_distillation_loss=False)
            if idx_minibatch >= num_to_estimate_grad:
                break

        # now we compute the 2-norm of the gradient for each parameter
        fisherInformation = []
        for idx, p in enumerate(modelToQuantize.parameters()):
            if quantize_first_and_last_layer is False:
                if idx == 0 or idx == numTensorsNetwork - 1:
                    continue
            fisherInformation.append((p.grad.data/num_to_estimate_grad).norm())

        # zero the grad we computed
        modelToQuantize.zero_grad()

        # now we use a simple linear proportion to assign bits
        # the minimum number of points is half what was given as input
        numPointsPerTensor = quantization.help_functions.assign_bits_automatically(fisherInformation,
                                                                                   numPointsPerTensor,
                                                                                   input_is_point=True)
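        # Illustrative example (numbers are hypothetical): with a budget of 16 points per tensor
        # and average-gradient norms of [0.1, 0.4, 0.5], the assigned budgets grow roughly in
        # proportion to each norm, but no tensor drops below 8 points (half the input budget).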

    # initialize the points using the percentile function so as to make them all usable
    pointsPerTensor = []
    if initialize_method == 'quantiles':
        for idx, p in enumerate(modelToQuantize.parameters()):
            if quantize_first_and_last_layer is True:
                currPointsPerTensor = numPointsPerTensor[idx]
            else:
                if idx == 0 or idx == numTensorsNetwork - 1:
                    continue
                currPointsPerTensor = numPointsPerTensor[idx-1]
            initial_points = quantization.help_functions.initialize_quantization_points(p.data,
                                                                                        scalingFunction,
                                                                                        currPointsPerTensor)
            initial_points = Variable(initial_points, requires_grad=True)
            # do a dummy backprop so that the grad attribute is initialized. We need this because we call
            # the .backward() function manually later on (since pytorch can't assign variables to model
            # parameters)
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)
    elif initialize_method == 'uniform':
        for numPoint in numPointsPerTensor:
            initial_points = torch.FloatTensor(
                [x/(numPoint-1) for x in range(numPoint)])
            if USE_CUDA:
                initial_points = initial_points.cuda()
            initial_points = Variable(initial_points, requires_grad=True)
            # do a dummy backprop so that the grad attribute is initialized. We need this because we call
            # the .backward() function manually later on (since pytorch can't assign variables to model
            # parameters)
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)
    else:
        raise ValueError
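    # In short: 'quantiles' places the initial points at empirical percentiles of the (scaled)
    # weights, so every point starts close to some weights and is used from the first iteration;
    # 'uniform' simply spreads the points evenly over [0, 1].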

    # dealing with 0 momentum: SGD rejects nesterov=True when momentum is 0, so only pass these options when momentum is non-zero
    options_optimizer = {}
    if initial_momentum != 0:
        options_optimizer = {
            'momentum': initial_momentum, 'nesterov': use_nesterov}
    optimizer = optim.SGD(
        pointsPerTensor, lr=initial_learning_rate, **options_optimizer)

    lr_scheduler = cnn_hf.LearningRateScheduler(
        initial_learning_rate, learning_rate_style)
    startTime = time.time()

    pred_accuracy_epochs = []
    losses_epochs = []
    last_loss_saved = float('inf')
    number_minibatches_per_epoch = len(train_loader)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)
    epoch = 0

    quantizationFunctions = []
    for idx, p in enumerate(quantizedModel.parameters()):
        if quantize_first_and_last_layer is False:
            if idx == 0 or idx == numTensorsNetwork - 1:
                continue
        # efficient version of nonUniformQuantization
        quant_fun = quantization.nonUniformQuantization_variable(max_element=False, subtract_mean=False,
                                                                 modify_in_place=False, bucket_size=bucket_size,
                                                                 pre_process_tensors=True, tensor=p.data)

        quantizationFunctions.append(quant_fun)

    print('Pre processing done, training started')

    for epoch in range(epochs_to_train):
        quantizedModel.train()
        print_loss_total = 0
        for idx_minibatch, data in enumerate(train_loader, start=1):

            # zero the gradient of the parameters model
            quantizedModel.zero_grad()
            optimizer.zero_grad()

            # quantize the model parameters
            for idx, p_quantized in enumerate(quantizedModel.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == numTensorsNetwork - 1:
                        continue
                    currIdx = idx - 1
                else:
                    currIdx = idx
                # efficient quantization
                p_quantized.data = quantizationFunctions[currIdx].forward(
                    None, pointsPerTensor[currIdx].data)

            print_loss = cnn_hf.forward_and_backward(quantizedModel, data, idx_minibatch, epoch,
                                                     use_distillation_loss=use_distillation_loss,
                                                     teacher_model=modelToQuantize)

            # now get the gradient of the pointsPerTensor
            for idx, p in enumerate(quantizedModel.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == numTensorsNetwork - 1:
                        continue
                    currIdx = idx - 1
                else:
                    currIdx = idx
                pointsPerTensor[currIdx].grad.data = quantizationFunctions[currIdx].backward(p.grad.data)[
                    1]

            optimizer.step()

            # after optimizer.step() we need to make sure that the points are still sorted. Implementation detail
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

            # print statistics
            print_loss_total += print_loss
            if (idx_minibatch) % print_every == 0:
                last_loss_saved = print_loss_total / print_every
                str_to_print = 'Time Elapsed: {}, [Epoch: {}, Minibatch: {}], loss: {:3f}'.format(
                    mhf.timeSince(startTime), epoch + 1, idx_minibatch, last_loss_saved)
                if pred_accuracy_epochs:
                    str_to_print += '. Last prediction accuracy: {:2f}%'.format(
                        pred_accuracy_epochs[-1] * 100)
                print(str_to_print)
                print_loss_total = 0

        losses_epochs.append(last_loss_saved)
        curr_pred_accuracy = evaluateModel(
            quantizedModel, test_loader, fastEvaluation=False)
        pred_accuracy_epochs.append(curr_pred_accuracy)
        print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(
            epoch + 1, curr_pred_accuracy * 100))

        # updating the learning rate
        new_learning_rate, stop_training = lr_scheduler.update_learning_rate(
            epoch, 1 - curr_pred_accuracy)
        if stop_training is True:
            break
        # update the learning rate of every parameter group
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_learning_rate

    print('Finished Training in {} epochs'.format(epoch + 1))
    informationDict = {'predictionAccuracy': pred_accuracy_epochs,
                       'numEpochsTrained': epoch+1,
                       'lossSaved': losses_epochs}

    # IMPORTANT: when there are batch normalization layers, important information is also
    # contained in the running mean and running var buffers of the batch normalization layers.
    # Since these are not parameters, they don't show up in the model.parameters() list (and they
    # don't have quantization points associated with them). So if we returned just the optimized
    # quantization points and quantized the model weights with them, performance would be inferior
    # because the running mean and var of the batch normalization layers would not be saved.
    # To solve this, we also return the quantized model's state dict, which contains not only the
    # parameters of the model but also these statistics for the batch normalization layers.

    return quantizedModel.state_dict(), pointsPerTensor, informationDict
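Stripping away the bookkeeping, the loop above follows a simple pattern: the full-precision weights stay fixed, the quantization points are the trainable variables, every step snaps the weights to the current points, and the gradient of the (distillation) loss is routed back to the points. The sketch below shows the same pattern for a single tensor, written with current PyTorch tensors instead of the deprecated Variable API; it is an illustration of the idea, not the project's nonUniformQuantization_variable function, and it uses a plain reconstruction loss as a stand-in for the task/distillation loss:

import torch

def quantize_to_points(w, points):
    # snap every weight to its nearest quantization point; gradients flow to `points`
    # through the indexing operation (the argmin itself is treated as a constant)
    idx = torch.argmin((w.unsqueeze(1) - points.unsqueeze(0)).abs(), dim=1)
    return points[idx]

w = torch.randn(1000)                                        # frozen full-precision weights
points = torch.linspace(-1.0, 1.0, 16).requires_grad_(True)  # 16 learnable quantization points
optimizer = torch.optim.SGD([points], lr=1e-3, momentum=0.9, nesterov=True)

for step in range(100):
    optimizer.zero_grad()
    w_q = quantize_to_points(w, points)
    loss = ((w_q - w) ** 2).mean()              # stand-in for the real loss
    loss.backward()
    optimizer.step()
    points.data, _ = torch.sort(points.data)    # keep the points sorted, as in the code above

As the comment before the return stresses, callers should restore the returned state dict rather than only re-quantizing the weights with the returned points, so that the batch-normalization running statistics are preserved.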
Example #4
def optimize_quantization_points(modelToQuantize,
                                 train_loader,
                                 test_loader,
                                 options,
                                 optim=None,
                                 numPointsPerTensor=16,
                                 assignBitsAutomatically=False,
                                 use_distillation_loss=False,
                                 bucket_size=None):

    print('Preparing training - pre processing tensors')

    if options is None: options = onmt.standard_options.stdOptions
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)

    fields = train_loader.dataset.fields
    train_loss = make_loss_compute(quantizedModel, fields["tgt"].vocab,
                                   train_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)
    valid_loss = make_loss_compute(quantizedModel, fields["tgt"].vocab,
                                   test_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)
    trunc_size = options.truncated_decoder  # Badly named...
    shard_size = options.max_generator_batches
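    # (trunc_size enables truncated back-propagation through time in the decoder;
    #  shard_size bounds how much of the generator/loss computation is done at once,
    #  trading speed for memory, as in standard OpenNMT-py training.)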

    numTensorsNetwork = sum(1 for _ in quantizedModel.parameters())
    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork
    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError(
            'numPointsPerTensor must be equal to the number of tensors in the network'
        )

    scalingFunction = quantization.ScalingFunction(type_scaling='linear',
                                                   max_element=False,
                                                   subtract_mean=False,
                                                   modify_in_place=False,
                                                   bucket_size=bucket_size)

    quantizedModel.zero_grad()
    dummy_optim = create_optimizer(
        quantizedModel, options)  # dummy optimizer, just to pass to the trainer
    if assignBitsAutomatically:
        trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader,
                                train_loss, valid_loss, dummy_optim,
                                trunc_size, shard_size)
        batch = next(iter(train_loader))
        quantizedModel.zero_grad()
        trainer.forward_and_backward(0, batch, 0, onmt.Statistics(), None)
        fisherInformation = []
        for p in quantizedModel.parameters():
            fisherInformation.append(p.grad.data.norm())
        numPointsPerTensor = qhf.assign_bits_automatically(fisherInformation,
                                                           numPointsPerTensor,
                                                           input_is_point=True)
        quantizedModel.zero_grad()
        del trainer
        del optim

    # initialize the points using the percentile function so as to make them all usable
    pointsPerTensor = []
    for idx, p in enumerate(quantizedModel.parameters()):
        initial_points = qhf.initialize_quantization_points(
            p.data, scalingFunction, numPointsPerTensor[idx])
        initial_points = Variable(initial_points, requires_grad=True)
        # do a dummy backprop so that the grad attribute is initialized. We need this because we call
        # the .backward() function manually later on (since pytorch can't assign variables to model
        # parameters)
        initial_points.sum().backward()
        pointsPerTensor.append(initial_points)

    optionsOpt = copy.deepcopy(mhf.convertToDictionary(options))
    optimizer = create_optimizer(pointsPerTensor,
                                 mhf.convertToNamedTuple(optionsOpt))
    trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader,
                            train_loss, valid_loss, dummy_optim, trunc_size,
                            shard_size)
    perplexity_epochs = []

    quantizationFunctions = []
    for idx, p in enumerate(modelToQuantize.parameters()):
        #efficient version of nonUniformQuantization
        quant_fun = quantization.nonUniformQuantization_variable(
            max_element=False,
            subtract_mean=False,
            modify_in_place=False,
            bucket_size=bucket_size,
            pre_process_tensors=True,
            tensor=p.data)

        quantizationFunctions.append(quant_fun)

    print('Pre processing done, training started')

    for epoch in range(options.start_epoch, options.epochs + 1):
        train_stats = onmt.Statistics()
        quantizedModel.train()
        for idx_batch, batch in enumerate(train_loader):

            #zero the gradient
            quantizedModel.zero_grad()

            # quantize the weights
            for idx, p_quantized in enumerate(quantizedModel.parameters()):
                #I am using the efficient version of nonUniformQuantization. The tensors (that don't change across
                #iterations) are saved inside the quantization function, and we only need to pass the quantization
                #points
                p_quantized.data = quantizationFunctions[idx].forward(
                    None, pointsPerTensor[idx].data)

            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats,
                                         report_func, use_distillation_loss,
                                         modelToQuantize)

            # now get the gradient of the pointsPerTensor
            for idx, p in enumerate(quantizedModel.parameters()):
                pointsPerTensor[idx].grad.data = quantizationFunctions[
                    idx].backward(p.grad.data)[1]

            optimizer.step()

            # after optimizer.step() we need to make sure that the points are still sorted
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())

        # 2. Validate on the validation set.
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        perplexity_epochs.append(valid_stats.ppl())

        # 3. Update the learning rate
        optimizer.updateLearningRate(valid_stats.ppl(), epoch)

    informationDict = {}
    informationDict['perplexity'] = perplexity_epochs
    informationDict['numEpochsTrained'] = options.epochs + 1 - options.start_epoch
    return pointsPerTensor, informationDict
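A hedged sketch of how the returned values might be consumed afterwards, reusing the same calls the training loop makes (the model, loader and option names are placeholders, and bucket_size must match the value used during optimization):

pointsPerTensor, info = optimize_quantization_points(model, train_loader, valid_loader,
                                                     options, numPointsPerTensor=16,
                                                     bucket_size=256)
# re-quantize the weights of `model` with the optimized points
for idx, p in enumerate(model.parameters()):
    quant_fun = quantization.nonUniformQuantization_variable(
        max_element=False, subtract_mean=False, modify_in_place=False,
        bucket_size=256, pre_process_tensors=True, tensor=p.data)
    p.data = quant_fun.forward(None, pointsPerTensor[idx].data)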