def optimize_quantization_points(modelToQuantize,
                                 train_loader,
                                 test_loader,
                                 initial_learning_rate=1e-5,
                                 initial_momentum=0.9,
                                 epochs_to_train=30,
                                 print_every=500,
                                 use_nesterov=True,
                                 learning_rate_style='generic',
                                 numPointsPerTensor=16,
                                 assignBitsAutomatically=False,
                                 bucket_size=None,
                                 use_distillation_loss=True,
                                 initialize_method='quantiles',
                                 quantize_first_and_last_layer=True):

    print('Preparing training - pre processing tensors')

    numTensorsNetwork = sum(1 for _ in modelToQuantize.parameters())
    initialize_method = initialize_method.lower()
    if initialize_method not in ('quantiles', 'uniform'):
        raise ValueError(
            'The initialization method must be either quantiles or uniform')

    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork

    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError(
            'numPointsPerTensor must be equal to the number of tensor in the network'
        )

    if quantize_first_and_last_layer is False:
        numPointsPerTensor = numPointsPerTensor[1:-1]

    #same scaling function that is used inside nonUniformQuantization; it is important that they match
    scalingFunction = quantization.ScalingFunction('linear', False, False,
                                                   bucket_size, False)

    #if assigning bits automatically, use the 2-norm of the gradient to estimate the importance of each weight tensor
    if assignBitsAutomatically:
        num_to_estimate_grad = 5
        modelToQuantize.zero_grad()
        for idx_minibatch, batch in enumerate(train_loader, start=1):
            cnn_hf.forward_and_backward(modelToQuantize,
                                        batch,
                                        idx_batch=idx_minibatch,
                                        epoch=0,
                                        use_distillation_loss=False)
            if idx_minibatch >= num_to_estimate_grad:
                break

        #now we compute the 2-norm of the gradient for each parameter
        fisherInformation = []
        for idx, p in enumerate(modelToQuantize.parameters()):
            if quantize_first_and_last_layer is False:
                if idx == 0 or idx == numTensorsNetwork - 1:
                    continue
            fisherInformation.append(
                (p.grad.data / num_to_estimate_grad).norm())

        #zero the grad we computed
        modelToQuantize.zero_grad()

        #now we use a simple linear proportion to assign bits
        #the minimum number of points is half what was given as input
        numPointsPerTensor = quantization.help_functions.assign_bits_automatically(
            fisherInformation, numPointsPerTensor, input_is_point=True)

    #initialize the points using the percentile function so as to make them all usable
    pointsPerTensor = []
    if initialize_method == 'quantiles':
        for idx, p in enumerate(modelToQuantize.parameters()):
            if quantize_first_and_last_layer is True:
                currPointsPerTensor = numPointsPerTensor[idx]
            else:
                if idx == 0 or idx == numTensorsNetwork - 1:
                    continue
                currPointsPerTensor = numPointsPerTensor[idx - 1]
            initial_points = quantization.help_functions.initialize_quantization_points(
                p.data, scalingFunction, currPointsPerTensor)
            initial_points = Variable(initial_points, requires_grad=True)
            # do a dummy backprop so that the grad attribute is initialized. We need this because we call
            # the .backward() function manually later on (since pytorch can't assign variables to model
            # parameters)
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)
    elif initialize_method == 'uniform':
        for numPoint in numPointsPerTensor:
            initial_points = torch.FloatTensor(
                [x / (numPoint - 1) for x in range(numPoint)])
            if USE_CUDA: initial_points = initial_points.cuda()
            initial_points = Variable(initial_points, requires_grad=True)
            # do a dummy backprop so that the grad attribute is initialized. We need this because we call
            # the .backward() function manually later on (since pytorch can't assign variables to model
            # parameters)
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)
    else:
        raise ValueError('initialize_method must be either "quantiles" or "uniform"')

    #PyTorch's SGD does not accept nesterov=True with zero momentum, so only pass momentum options when momentum != 0
    options_optimizer = {}
    if initial_momentum != 0:
        options_optimizer = {
            'momentum': initial_momentum,
            'nesterov': use_nesterov
        }
    optimizer = optim.SGD(pointsPerTensor,
                          lr=initial_learning_rate,
                          **options_optimizer)

    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate,
                                                learning_rate_style)
    startTime = time.time()

    pred_accuracy_epochs = []
    losses_epochs = []
    last_loss_saved = float('inf')
    number_minibatches_per_epoch = len(train_loader)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)
    epoch = 0

    quantizationFunctions = []
    for idx, p in enumerate(quantizedModel.parameters()):
        if quantize_first_and_last_layer is False:
            if idx == 0 or idx == numTensorsNetwork - 1:
                continue
        #efficient version of nonUniformQuantization
        quant_fun = quantization.nonUniformQuantization_variable(
            max_element=False,
            subtract_mean=False,
            modify_in_place=False,
            bucket_size=bucket_size,
            pre_process_tensors=True,
            tensor=p.data)

        quantizationFunctions.append(quant_fun)

    print('Pre processing done, training started')

    for epoch in range(epochs_to_train):
        quantizedModel.train()
        print_loss_total = 0
        for idx_minibatch, data in enumerate(train_loader, start=1):

            #zero the gradient of the parameters model
            quantizedModel.zero_grad()
            optimizer.zero_grad()

            #quantize the model parameters
            for idx, p_quantized in enumerate(quantizedModel.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == numTensorsNetwork - 1:
                        continue
                    currIdx = idx - 1
                else:
                    currIdx = idx
                #efficient quantization
                p_quantized.data = quantizationFunctions[currIdx].forward(
                    None, pointsPerTensor[currIdx].data)

            print_loss = cnn_hf.forward_and_backward(
                quantizedModel,
                data,
                idx_minibatch,
                epoch,
                use_distillation_loss=use_distillation_loss,
                teacher_model=modelToQuantize)

            #now get the gradient of the pointsPerTensor
            for idx, p in enumerate(quantizedModel.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == numTensorsNetwork - 1:
                        continue
                    currIdx = idx - 1
                else:
                    currIdx = idx
                pointsPerTensor[currIdx].grad.data = quantizationFunctions[
                    currIdx].backward(p.grad.data)[1]

            optimizer.step()

            #after optimizer.step() we need to make sure that the points are still sorted (implementation detail)
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

            # print statistics
            print_loss_total += print_loss
            if (idx_minibatch) % print_every == 0:
                last_loss_saved = print_loss_total / print_every
                str_to_print = 'Time Elapsed: {}, [Epoch: {}, Minibatch: {}], loss: {:.3f}'.format(
                    mhf.timeSince(startTime), epoch + 1, idx_minibatch,
                    last_loss_saved)
                if pred_accuracy_epochs:
                    str_to_print += '. Last prediction accuracy: {:.2f}%'.format(
                        pred_accuracy_epochs[-1] * 100)
                print(str_to_print)
                print_loss_total = 0

        losses_epochs.append(last_loss_saved)
        curr_pred_accuracy = cnn_hf.evaluateModel(quantizedModel,
                                                  test_loader,
                                                  fastEvaluation=False)
        pred_accuracy_epochs.append(curr_pred_accuracy)
        print(' === Epoch: {} - prediction accuracy {:.2f}% === '.format(
            epoch + 1, curr_pred_accuracy * 100))

        # updating the learning rate
        new_learning_rate, stop_training = lr_scheduler.update_learning_rate(
            epoch, 1 - curr_pred_accuracy)
        if stop_training is True:
            break
        for p in optimizer.param_groups:
            try:
                p['lr'] = new_learning_rate
            except:
                pass

    print('Finished Training in {} epochs'.format(epoch + 1))
    informationDict = {
        'predictionAccuracy': pred_accuracy_epochs,
        'numEpochsTrained': epoch + 1,
        'lossSaved': losses_epochs
    }

    #IMPORTANT: when there are batch normalization layers, important information is also contained
    #in the running mean and running var values of the batch normalization layers. Since these are not
    #parameters, they don't show up in the model.parameters() list (and they don't have quantization points
    #associated with them). So if we returned just the optimized quantization points and quantized the model
    #weights with them, performance would be inferior because the running mean and var of the batch
    #normalization layers would not be saved. To solve this, we also return the quantized model state dict,
    #which contains not only the parameters of the model but also these statistics (an illustrative usage
    #sketch follows the function)

    return quantizedModel.state_dict(), pointsPerTensor, informationDict
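
# Illustrative usage sketch, not part of the original script: the helper below is hypothetical and
# only shows how the three return values of optimize_quantization_points might be consumed. As noted
# above, the returned state dict already contains the quantized weights *and* the batch norm running
# statistics, so loading it into a copy of the full-precision model is enough to evaluate it.
def example_apply_optimized_points(full_precision_model, train_loader, test_loader):
    quantized_state_dict, points_per_tensor, info = optimize_quantization_points(
        full_precision_model, train_loader, test_loader,
        numPointsPerTensor=2**4, bucket_size=256, use_distillation_loss=True)
    quantized_model = copy.deepcopy(full_precision_model)   # fresh copy of the architecture
    quantized_model.load_state_dict(quantized_state_dict)   # quantized weights + BN statistics
    accuracy = cnn_hf.evaluateModel(quantized_model, test_loader, fastEvaluation=False)
    return quantized_model, accuracy, points_per_tensor, info
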
if USE_CUDA:
    quant_distilled_model = quant_distilled_model.cuda()
if NUM_GPUS > 1:
    quant_distilled_model = torch.nn.parallel.DataParallel(quant_distilled_model)

if quant_distilled_model_name not in imagenet_manager.saved_models:
    imagenet_manager.add_new_model(quant_distilled_model_name, quantDistilledModelPath,
                                   arguments_creator_function=quantDistilledOptions)

if TRAIN_QUANTIZED_DISTILLED:
    imagenet_manager.train_model(quant_distilled_model, model_name=quant_distilled_model_name,
                                 train_function=convForwModel.train_model,
                                 arguments_train_function={'epochs_to_train': epochsToTrainImageNet,
                                                           'learning_rate_style': 'imagenet',
                                                           'initial_learning_rate': 0.1,
                                                           'use_nesterov':True,
                                                           'initial_momentum':0.9,
                                                           'weight_decayL2':1e-4,
                                                           'start_epoch': 0,
                                                           'print_every':30,
                                                           'use_distillation_loss':True,
                                                           'teacher_model': alexnet_unquantized,
                                                           'quantizeWeights':True,
                                                           'numBits':NUM_BITS,
                                                           'bucket_size':256,
                                                           'quantize_first_and_last_layer': False},
                                 train_loader=train_loader, test_loader=test_loader)
quant_distilled_model.load_state_dict(imagenet_manager.load_model_state_dict(quant_distilled_model_name))

print(cnn_hf.evaluateModel(quant_distilled_model, test_loader, fastEvaluation=False))
def train_model(model,
                train_loader,
                test_loader,
                initial_learning_rate=0.001,
                use_nesterov=True,
                initial_momentum=0.9,
                weight_decayL2=0.00022,
                epochs_to_train=100,
                print_every=500,
                learning_rate_style='generic',
                use_distillation_loss=False,
                teacher_model=None,
                quantizeWeights=False,
                numBits=8,
                grad_clipping_threshold=False,
                start_epoch=0,
                bucket_size=None,
                quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none',
                estimate_quant_grad_every=1,
                add_gradient_noise=False,
                ask_teacher_strategy=('always', None),
                quantize_first_and_last_layer=True,
                mix_with_differentiable_quantization=False):

    # backprop_quantization_style determines how the gradients are modified to take the
    # quantization function into account. Options: 'none', where gradients are left unchanged;
    # 'truncated', where gradients of weights outside [-1, 1] are set to 0 (following the paper
    # referenced in the comments below); and 'complicated', a provisional name for a slower, more
    # involved variant. (An illustrative sketch of the quantize/restore weight handling appears
    # after this function.)

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError(
            'To compute distillation loss you have to pass the teacher model')

    if teacher_model is not None:
        teacher_model.eval()

    learning_rate_style = learning_rate_style.lower()
    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate,
                                                learning_rate_style)
    new_learning_rate = initial_learning_rate
    optimizer = optim.SGD(model.parameters(),
                          lr=initial_learning_rate,
                          nesterov=use_nesterov,
                          momentum=initial_momentum,
                          weight_decay=weight_decayL2)
    startTime = time.time()

    pred_accuracy_epochs = []
    percentages_asked_teacher = []
    losses_epochs = []
    informationDict = {}
    last_loss_saved = float('inf')
    step_since_last_grad_quant_estimation = 1
    number_minibatches_per_epoch = len(train_loader)

    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if backprop_quantization_style is None:
            backprop_quantization_style = 'none'
        backprop_quantization_style = backprop_quantization_style.lower()
        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2**(numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2**numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError(
                'The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in (
                'none', 'truncated'):
            quantizeFunctions = lambda x: quantization.uniformQuantization(
                x,
                s,
                type_of_scaling=type_of_scaling,
                stochastic_rounding=False,
                max_element=False,
                subtract_mean=False,
                modify_in_place=False,
                bucket_size=bucket_size)[0]

        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [quantization.uniformQuantization_variable(s, type_of_scaling=type_of_scaling,
                                                    stochastic_rounding=False,
                                                    max_element=False,
                                                    subtract_mean=False,
                                                    modify_in_place=False, bucket_size=bucket_size) \
                                 for _ in model.parameters()]
        else:
            raise ValueError(
                'The specified backprop_quantization_style not recognized')

        num_parameters = sum(1 for _ in model.parameters())

        def quantize_weights_model(model):
            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  #don't quantize first and last layer
                if backprop_quantization_style == 'truncated':
                    p.data.clamp_(-1, 1)
                if backprop_quantization_style in ('none', 'truncated'):
                    p.data = quantizeFunctions(p.data)
                elif backprop_quantization_style == 'complicated':
                    p.data = quantizeFunctions[idx].forward(p.data)
                else:
                    raise ValueError

        def backward_quant_weights_model(model):
            if backprop_quantization_style == 'none':
                return

            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  #don't quantize first and last layer

                # Now some sort of backward step. For the 'none' style, we don't do anything.
                # For the 'truncated' style, we zero the gradients of weights that fall
                # outside [-1, 1], as per the paper https://arxiv.org/pdf/1609.07061.pdf.
                # Their setting is not immediately applicable to ours, but we try this heuristic.
                if backprop_quantization_style == 'truncated':
                    p.grad.data[p.data.abs() > 1] = 0
                elif backprop_quantization_style == 'complicated':
                    p.grad.data = quantizeFunctions[idx].backward(p.grad.data)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2

    try:
        epoch = start_epoch
        for epoch in range(start_epoch, epochs_to_train + start_epoch):
            print("begin training")
            if USE_CUDA:
                print("USE_CUDA")
            if mix_with_differentiable_quantization:
                print('=== Starting Quantized Distillation epoch === ')
            model.train()
            print_loss_total = 0
            count_asked_teacher = 0
            count_asked_total = 0
            for idx_minibatch, data in enumerate(train_loader, start=1):

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        # save the full-precision weights: we quantize the weights only to compute
                        # the gradients, but keep updating the non-quantized weights during training
                        model_state_dict = model.state_dict()
                        quantize_weights_model(model)

                model.zero_grad()
                print_loss, curr_c_teach, curr_c_total = cnn_hf.forward_and_backward(
                    model,
                    data,
                    idx_minibatch,
                    epoch,
                    use_distillation_loss=use_distillation_loss,
                    teacher_model=teacher_model,
                    ask_teacher_strategy=ask_teacher_strategy,
                    return_more_info=True)
                count_asked_teacher += curr_c_teach
                count_asked_total += curr_c_total

                #load the non-quantized weights and use them for the update. The quantized
                #weights are used only to compute the quantized gradient
                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        model.load_state_dict(model_state_dict)
                        del model_state_dict  #free memory

                if add_gradient_noise and not quantizeWeights:
                    cnn_hf.add_gradient_noise(model, idx_minibatch, epoch,
                                              number_minibatches_per_epoch)

                if grad_clipping_threshold is not False:
                    # gradient clipping
                    for p in model.parameters():
                        p.grad.data.clamp_(-grad_clipping_threshold,
                                           grad_clipping_threshold)

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        backward_quant_weights_model(model)

                optimizer.step()

                if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                    step_since_last_grad_quant_estimation = 0

                step_since_last_grad_quant_estimation += 1

                # print statistics
                print_loss_total += print_loss
                if (idx_minibatch) % print_every == 0:
                    last_loss_saved = print_loss_total / print_every
                    str_to_print = 'Time Elapsed: {}, [Start Epoch: {}, Epoch: {}, Minibatch: {}], loss: {:.3f}'.format(
                        mhf.timeSince(startTime), start_epoch + 1, epoch + 1,
                        idx_minibatch, last_loss_saved)
                    if pred_accuracy_epochs:
                        str_to_print += ' Last prediction accuracy: {:.2f}%'.format(
                            pred_accuracy_epochs[-1] * 100)
                    print(str_to_print)
                    print_loss_total = 0

            curr_percentages_asked_teacher = count_asked_teacher / count_asked_total if count_asked_total != 0 else 0
            percentages_asked_teacher.append(curr_percentages_asked_teacher)
            losses_epochs.append(last_loss_saved)
            curr_pred_accuracy = cnn_hf.evaluateModel(model,
                                                      test_loader,
                                                      fastEvaluation=False)
            pred_accuracy_epochs.append(curr_pred_accuracy)
            print(' === Epoch: {} - prediction accuracy {:.2f}% === '.format(
                epoch + 1, curr_pred_accuracy * 100))

            if mix_with_differentiable_quantization and epoch != start_epoch + epochs_to_train - 1:
                print('=== Starting Differentiable Quantization epoch === ')
                #the diff quant step is not done at the last epoch, so we end on a quantized distillation epoch
                model_state_dict = optimize_quantization_points(
                    model,
                    train_loader,
                    test_loader,
                    new_learning_rate,
                    initial_momentum=initial_momentum,
                    epochs_to_train=1,
                    print_every=print_every,
                    use_nesterov=use_nesterov,
                    learning_rate_style=learning_rate_style,
                    numPointsPerTensor=2**numBits,
                    assignBitsAutomatically=True,
                    bucket_size=bucket_size,
                    use_distillation_loss=True,
                    initialize_method='quantiles',
                    quantize_first_and_last_layer=quantize_first_and_last_layer
                )[0]
                model.load_state_dict(model_state_dict)
                del model_state_dict  # free memory
                losses_epochs.append(last_loss_saved)
                curr_pred_accuracy = cnn_hf.evaluateModel(model,
                                                          test_loader,
                                                          fastEvaluation=False)
                pred_accuracy_epochs.append(curr_pred_accuracy)
                print(
                    ' === Epoch: {} - prediction accuracy {:.2f}% === '.format(
                        epoch + 1, curr_pred_accuracy * 100))

            #updating the learning rate
            new_learning_rate, stop_training = lr_scheduler.update_learning_rate(
                epoch, 1 - curr_pred_accuracy)
            if stop_training is True:
                break
            for p in optimizer.param_groups:
                try:
                    p['lr'] = new_learning_rate
                except:
                    pass

    except Exception as e:
        print(
            'An exception occurred: {}.\nTraining has been stopped after {} epochs.'
            .format(e, epoch))
        informationDict['errorFlag'] = True
        informationDict['numEpochsTrained'] = epoch - start_epoch

        return model, informationDict
    except KeyboardInterrupt:
        print('User stopped training after {} epochs'.format(epoch))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch - start_epoch
    else:
        print('Finished Training in {} epochs'.format(epoch + 1))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch + 1 - start_epoch

    if quantizeWeights:
        quantize_weights_model(model)

    if mix_with_differentiable_quantization:
        informationDict['numEpochsTrained'] *= 2

    informationDict['percentages_asked_teacher'] = percentages_asked_teacher
    informationDict['predictionAccuracy'] = pred_accuracy_epochs
    informationDict['lossSaved'] = losses_epochs
    return model, informationDict
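
# Illustrative sketch, not part of the original script: a minimal, self-contained version of the
# weight handling used in train_model when quantizeWeights is True, written with plain torch and a
# stand-in quantizer (NOT the project's quantization.uniformQuantization). Weights are quantized
# only for the forward/backward pass, gradients of weights outside [-1, 1] are zeroed ('truncated'
# style), and the update is applied to the restored full-precision weights.
def _toy_uniform_quantize(t, num_bits=4):
    # map to [0, 1], round to 2**num_bits - 1 levels, map back (no bucketing)
    t_min, t_max = t.min(), t.max()
    scale = (t_max - t_min).clamp(min=1e-8)
    levels = 2**num_bits - 1
    return torch.round((t - t_min) / scale * levels) / levels * scale + t_min

def _toy_quantized_sgd_step(model, loss_fn, inputs, targets, lr=0.01):
    saved_state = copy.deepcopy(model.state_dict())        # keep the full-precision weights
    for p in model.parameters():
        p.data.clamp_(-1, 1)                               # 'truncated' style clamps before quantizing
        p.data = _toy_uniform_quantize(p.data)             # forward/backward uses quantized weights
    model.zero_grad()
    loss_fn(model(inputs), targets).backward()
    model.load_state_dict(saved_state)                     # restore the full-precision weights
    for p in model.parameters():
        p.grad.data[p.data.abs() > 1] = 0                  # 'truncated' gradient rule
        p.data -= lr * p.grad.data                         # plain SGD update with the quantized gradient
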
Example #4
teacherModel = convForwModel.ConvolForwardNet(**convForwModel.teacherModelSpec,
                                              useBatchNorm=USE_BATCH_NORM,
                                              useAffineTransformInBatchNorm=AFFINE_BATCH_NORM)
if USE_CUDA: teacherModel = teacherModel.cuda()
if model_name not in cifar10Manager.saved_models:
    cifar10Manager.add_new_model(model_name, teacherModelPath,
            arguments_creator_function={**convForwModel.teacherModelSpec,
                                        'useBatchNorm':USE_BATCH_NORM,
                                        'useAffineTransformInBatchNorm':AFFINE_BATCH_NORM})
if TRAIN_TEACHER_MODEL:
    cifar10Manager.train_model(teacherModel, model_name=model_name,
                               train_function=convForwModel.train_model,
                               arguments_train_function={'epochs_to_train': epochsToTrainCIFAR},
                               train_loader=train_loader, test_loader=test_loader)
teacherModel.load_state_dict(cifar10Manager.load_model_state_dict(model_name))
cnn_hf.evaluateModel(teacherModel, test_loader, k=5)


#Define the architectures we want to try
smallerModelSpec0 = {'spec_conv_layers': [(75, 5, 5), (50, 5, 5), (50, 5, 5), (25, 5, 5)],
                    'spec_max_pooling': [(1, 2, 2), (3, 2, 2)],
                    'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)],
                    'spec_linear': [500], 'width': 32, 'height': 32}
smallerModelSpec1 = {'spec_conv_layers': [(50, 5, 5), (25, 5, 5), (25, 5, 5), (10, 5, 5)],
                    'spec_max_pooling': [(1, 2, 2), (3, 2, 2)],
                    'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)],
                    'spec_linear': [400], 'width': 32, 'height': 32}
smallerModelSpec2 = {'spec_conv_layers': [(25, 5, 5), (10, 5, 5), (10, 5, 5), (5, 5, 5)],
                    'spec_max_pooling': [(1, 2, 2), (3, 2, 2)],
                    'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)],
                    'spec_linear': [300], 'width': 32, 'height': 32}
Example #5
        if compute_initial_points is True:
            compute_initial_points = 'quantiles'
        else:
            compute_initial_points = 'uniform'
        str_identifier = 'quantpoints{}bits_auto{}_distill{}_initial"{}"'.format(
            numBit, assign_bits_auto, use_distillation_loss,
            compute_initial_points)
        distilled_quantized_model_name = distilled_model_name + str_identifier
        save_path = cifar10Manager.get_model_base_path(
            distilled_model_name) + str_identifier
        with open(save_path, 'rb') as p:
            quantization_points, infoDict = pickle.load(p)

        distilled_quantized_model = convForwModel.ConvolForwardNet(
            **distilledModelSpec,
            useBatchNorm=USE_BATCH_NORM,
            useAffineTransformInBatchNorm=AFFINE_BATCH_NORM)
        if USE_CUDA:
            distilled_quantized_model = distilled_quantized_model.cuda()
        distilled_quantized_model.load_state_dict(
            torch.load(save_path + '_model_state_dict'))
        reported_accuracy = max(infoDict['predictionAccuracy'])
        actual_accuracy = cnn_hf.evaluateModel(
            distilled_quantized_model,
            test_loader)  #this is the accuracy of the model at the last epoch
        #caveat: the state dict saved is the one from the last epoch, not the one with the maximum accuracy
        print(
            'Model "{}" => reported accuracy: {} - actual accuracy: {}'.format(
                distilled_quantized_model_name, reported_accuracy,
                actual_accuracy))
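
# Illustrative sketch, not part of the original script: one hypothetical way to address the caveat
# above is to track the best-accuracy state dict whenever a checkpoint is evaluated, assuming
# cnn_hf.evaluateModel and a test_loader as used elsewhere in this file.
def keep_best_state_dict(model, test_loader, best_so_far=(-1.0, None)):
    best_accuracy, best_state = best_so_far
    accuracy = cnn_hf.evaluateModel(model, test_loader, fastEvaluation=False)
    if accuracy > best_accuracy:
        best_accuracy, best_state = accuracy, copy.deepcopy(model.state_dict())
    return best_accuracy, best_state
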
Example #6
    imagenet_manager.add_new_model(quant_distilled_model_name, quantDistilledModelPath,
                                   arguments_creator_function=quantDistilledOptions)

if TRAIN_QUANTIZED_DISTILLED:
    imagenet_manager.train_model(quant_distilled_model, model_name=quant_distilled_model_name,
                                 train_function=convForwModel.train_model,
                                 arguments_train_function={'epochs_to_train': epochsToTrainImageNet,
                                                           'learning_rate_style': 'imagenet',
                                                           'initial_learning_rate': 0.1,
                                                           'use_nesterov':True,
                                                           'initial_momentum':0.9,
                                                           'weight_decayL2':1e-4,
                                                           'start_epoch': 0,
                                                           'print_every':30,
                                                           'use_distillation_loss':True,
                                                           'teacher_model': alexnet_unquantized,
                                                           'quantizeWeights':True,
                                                           'numBits':NUM_BITS,
                                                           'bucket_size':256,
                                                           'quantize_first_and_last_layer': False},
                                 train_loader=train_loader, test_loader=test_loader)
quant_distilled_model.load_state_dict(imagenet_manager.load_model_state_dict(quant_distilled_model_name))
print(cnn_hf.evaluateModel(quant_distilled_model, test_loader, fastEvaluation=False))
print(cnn_hf.evaluateModel(quant_distilled_model, test_loader, fastEvaluation=False, k=5))
print(cnn_hf.evaluateModel(alexnet_unquantized, test_loader, fastEvaluation=False))
print(cnn_hf.evaluateModel(alexnet_unquantized, test_loader, fastEvaluation=False, k=5))
quant_fun = functools.partial(quantization.uniformQuantization, s=2**4, bucket_size=256)
size_mb = mhf.get_size_quantized_model(quant_distilled_model, 4, quant_fun, 256,
                                       quantizeFirstLastLayer=False)
print(size_mb)
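
# Rough back-of-the-envelope check, not the exact formula used by mhf.get_size_quantized_model:
# with b-bit uniform quantization and linear (min/max) scaling over buckets of size k, each bucket
# stores two full-precision scaling values, so the cost is roughly b + 2*32/k bits per weight
# (assuming 32-bit scaling parameters, ignoring any further encoding the helper may apply and the
# unquantized first and last layers).
def rough_quantized_size_mb(model, num_bits=4, bucket_size=256):
    num_weights = sum(p.numel() for p in model.parameters())
    bits_per_weight = num_bits + 2 * 32 / bucket_size
    return num_weights * bits_per_weight / 8 / 2**20

# e.g. print(rough_quantized_size_mb(quant_distilled_model))
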
                            train_function=convForwModel.train_model,
                            arguments_train_function={'epochs_to_train': epochsToTrainCIFAR},
                            train_loader=train_loader, test_loader=test_loader)
    # else:
    #     cifar10Manager.train_model(teacherModel, model_name=model_name,
    #                         train_function=convForwModel.train_model,
    #                         continue_training_from =1,
    #                         arguments_train_function={'epochs_to_train': epochsToTrainCIFAR},
    #                         train_loader=train_loader, test_loader=test_loader)

    print("Teacher Model Training complete")

print("Eval Teacher model")
print(model_name)
teacherModel.load_state_dict(cifar10Manager.load_model_state_dict(model_name))
acc = cnn_hf.evaluateModel(teacherModel, test_loader, k=1)
print("Top-1 eval acc is {}".format(acc))

smallerModelSpec2 = {'spec_conv_layers': [(25, 5, 5), (10, 5, 5), (10, 5, 5), (5, 5, 5)],
                    'spec_max_pooling': [(1, 2, 2), (3, 2, 2)],
                    'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)],
                    'spec_linear': [300], 'width': 32, 'height': 32}

small_model_name = 'cifar10_smaller_spec2'
model_small_spec = copy.deepcopy(smallerModelSpec2)

smallerModelPath = os.path.join(model_save_path, small_model_name)
smallerModel = convForwModel.ConvolForwardNet(**model_small_spec,
                                                useBatchNorm=True,
                                                useAffineTransformInBatchNorm=True)
if small_model_name not in cifar10Manager.saved_models:
Example #8
File: main.py Project: Flamexmt/LMA
                **smallerModelSpecs[args.stModel],
                activation=args.stud_act,
                numBins=args.num_bins,
                useBatchNorm=USE_BATCH_NORM,
                useAffineTransformInBatchNorm=AFFINE_BATCH_NORM)
        else:
            model = convForwModel.ConvolForwardNet(
                **convForwModel.teacherModelSpec,
                useBatchNorm=USE_BATCH_NORM,
                useAffineTransformInBatchNorm=AFFINE_BATCH_NORM)
        if USE_CUDA: model = model.cuda()
        test_loader = data.getTestLoader(1)
        import time

        start = time.time()
        cnn_hf.evaluateModel(model, test_loader)
        mem = torch.cuda.max_memory_allocated()
        end = time.time()
        avg_time = (end - start) * 1000 / len(test_loader)
        if args.train_teacher:
            str2save = 'teacher_cifar10: time: {} ms, memory: {} M'.format(
                avg_time, mem / (1024**2))
            print(str2save)
        else:
            str2save = 's_{}_{}_nb_{}cifar10: time: {} ms, memory: {} M'.format(
                args.stModel, args.stud_act, args.num_bins, avg_time,
                mem / (1024**2))
            print(str2save)
        with open('memory.txt', 'a') as fr:
            fr.write(str2save + '\n')
        torch.cuda.empty_cache()