def main():
    """Rebuild a quantized ResNet from a compressed state dict and evaluate it.

    Options come from the module-level ``parser`` (``state_dict_compressed``,
    ``model``, ``device``, ``batch_size``, ``n_workers``). An untrained network
    (``pretrained=False``) is instantiated, its layers are filled from the
    compressed checkpoint, and the top-1 accuracy on the test loader is printed.

    Side effect: the parsed options are published as the module-level ``args``.
    """
    global args
    args = parser.parse_args()
    device = args.device

    # map_location remaps the stored tensors onto the requested device, so a
    # checkpoint written from a GPU can still be read on a CPU-only machine
    state_dict_compressed = torch.load(args.state_dict_compressed, map_location=device)

    # instantiating model ('resnet50_semisup' shares the plain resnet50 architecture)
    model = 'resnet50' if args.model == 'resnet50_semisup' else args.model
    model = resnet_models.__dict__[model](pretrained=False).to(device)
    criterion = nn.CrossEntropyLoss()
    _, test_loader = load_data(batch_size=args.batch_size, nb_workers=args.n_workers)
    watcher = ActivationWatcherResNet(model)

    # conv1 layer (non-compressed): stored as a regular state dict
    layer = 'conv1'
    state_dict_layer = to_device(state_dict_compressed[layer], device)
    attrgetter(layer)(model).load_state_dict(state_dict_layer)
    attrgetter(layer)(model).float()

    # compressed layers: everything the watcher reports except the first layer
    # (the slice is a copy, so the removals below do not touch watcher.layers)
    compressed_layers = watcher.layers[1:]

    # 2 more layers non-compressed for semi-supervised ResNet50
    if args.model == 'resnet50_semisup':
        non_compressed_layers = ['layer1.0.conv3', 'layer1.0.downsample.0']
        for layer in non_compressed_layers:
            compressed_layers.remove(layer)
            state_dict_layer = to_device(state_dict_compressed[layer], device)
            attrgetter(layer)(model).load_state_dict(state_dict_layer)
            attrgetter(layer)(model).float()

    for layer in compressed_layers:
        # recover centroids and assignments (cast back to float / long for computation)
        state_dict_layer = state_dict_compressed[layer]
        centroids = state_dict_layer['centroids'].float().to(device)
        assignments = state_dict_layer['assignments'].long().to(device)
        n_blocks = state_dict_layer['n_blocks']
        is_conv = state_dict_layer['is_conv']
        k = state_dict_layer['k']

        # instantiate matrix and overwrite the layer weight with it
        M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
        attrgetter(layer + '.weight')(model).data = M_hat

    # batch norms: only the affine weight and bias are copied from the checkpoint
    bn_layers = watcher._get_bn_layers()

    for layer in bn_layers:
        state_dict_layer = to_device(state_dict_compressed[layer], device)
        attrgetter(layer)(model).weight.data = state_dict_layer['weight'].float().to(device)
        attrgetter(layer)(model).bias.data = state_dict_layer['bias'].float().to(device)

    # classifier bias (stored under its own 'fc_bias' entry)
    layer = 'fc'
    state_dict_layer = to_device(state_dict_compressed['fc_bias'], device)
    attrgetter(layer + '.bias')(model).data = state_dict_layer['bias']

    # evaluate the model
    top_1 = evaluate(test_loader, model, criterion, device=device).item()
    print('Top-1 accuracy of quantized model: {:.2f}'.format(top_1))
Beispiel #2
0
def main():
    """Rebuild a quantized ResNet from a compressed state dict and score it on CIFAR-10.

    Options come from the module-level ``parser`` (``state_dict_compressed``,
    ``model``). The device is auto-detected (CUDA when available, otherwise
    CPU) rather than taken from the command line. The CIFAR-10 test split is
    downloaded into ``data`` if needed, and the top-1 accuracy is printed.

    Side effect: the parsed options are published as the module-level ``args``.
    """
    # parse the command line first so that --help or an invalid option exits
    # before any dataset download is attempted
    global args
    args = parser.parse_args()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # CIFAR-10 test split, converted to tensors only (no normalization applied here)
    transform_test = transforms.Compose([transforms.ToTensor()])
    testset = torchvision.datasets.CIFAR10(root='data',
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    test_loader = data.DataLoader(testset,
                                  batch_size=32,
                                  shuffle=False,
                                  num_workers=10)

    # load onto the CPU; every tensor is moved to `device` below before use
    state_dict_compressed = torch.load(args.state_dict_compressed,
                                       map_location='cpu')

    # instantiating model ('resnet50_semisup' shares the plain resnet50 architecture)
    model = 'resnet50' if args.model == 'resnet50_semisup' else args.model
    model = resnet_models.__dict__[model](pretrained=False).to(device)
    criterion = nn.CrossEntropyLoss()

    watcher = ActivationWatcherResNet(model)

    # conv1 layer (non-compressed): stored as a regular state dict
    layer = 'conv1'
    state_dict_layer = to_device(state_dict_compressed[layer], device)
    attrgetter(layer)(model).load_state_dict(state_dict_layer)
    attrgetter(layer)(model).float()

    # compressed layers: everything the watcher reports except the first layer
    # (the slice is a copy, so the removals below do not touch watcher.layers)
    compressed_layers = watcher.layers[1:]

    # 2 more layers non-compressed for semi-supervised ResNet50
    if args.model == 'resnet50_semisup':
        non_compressed_layers = ['layer1.0.conv3', 'layer1.0.downsample.0']
        for layer in non_compressed_layers:
            compressed_layers.remove(layer)
            state_dict_layer = to_device(state_dict_compressed[layer], device)
            attrgetter(layer)(model).load_state_dict(state_dict_layer)
            attrgetter(layer)(model).float()

    for layer in compressed_layers:
        # recover centroids and assignments (cast back to float / long for computation)
        state_dict_layer = state_dict_compressed[layer]
        centroids = state_dict_layer['centroids'].float().to(device)
        assignments = state_dict_layer['assignments'].long().to(device)
        n_blocks = state_dict_layer['n_blocks']
        is_conv = state_dict_layer['is_conv']
        k = state_dict_layer['k']

        # instantiate matrix and overwrite the layer weight with it
        M_hat = weight_from_centroids(centroids, assignments, n_blocks, k,
                                      is_conv)
        attrgetter(layer + '.weight')(model).data = M_hat

    # batch norms: only the affine weight and bias are copied from the checkpoint
    bn_layers = watcher._get_bn_layers()

    for layer in bn_layers:
        state_dict_layer = to_device(state_dict_compressed[layer], device)
        attrgetter(layer)(
            model).weight.data = state_dict_layer['weight'].float().to(device)
        attrgetter(layer)(
            model).bias.data = state_dict_layer['bias'].float().to(device)

    # classifier bias (stored under its own 'fc_bias' entry)
    layer = 'fc'
    state_dict_layer = to_device(state_dict_compressed['fc_bias'], device)
    attrgetter(layer + '.bias')(model).data = state_dict_layer['bias']

    # evaluate the model
    top_1 = evaluate(test_loader, model, criterion, device=device).item()
    print('Top-1 accuracy of quantized model: {:.2f}'.format(top_1))
Beispiel #3
0
def main():
    """Quantize a pretrained network layer by layer, finetune it and save it.

    Pipeline, driven by the options parsed from the module-level ``parser``:

    1. For every selected layer: gather input activations, quantize the weight
       matrix with ``PQ`` and finetune that layer's centroids with
       ``finetune_centroids``. If ``args.restart`` is set and a stored
       quantization exists for the layer, it is reloaded instead and the
       per-layer finetuning is skipped.
    2. Finetune the centroids of all quantized layers together.
    3. Collect the non-quantized parts (``conv1``, the classifier bias, the
       batch norms) and the centroids/assignments of every quantized layer
       into one dict written to ``<args.save>/state_dict_compressed.pth``.

    Side effect: the parsed options are published as the module-level ``args``.
    Student, teacher and criterion are moved with ``.cuda()``, so a CUDA device
    is required.
    """
    # get arguments
    global args
    args = parser.parse_args()
    # 'all' becomes the empty string, which is a substring of every layer name
    args.block = '' if args.block == 'all' else args.block

    # student model to quantize
    student = models.__dict__[args.model](pretrained=True).cuda()
    student.eval()
    criterion = nn.CrossEntropyLoss().cuda()
    cudnn.benchmark = True

    # layers to quantize (we do not quantize the first 7x7 convolution layer)
    watcher = ActivationWatcher(student)
    layers = [layer for layer in watcher.layers[1:] if args.block in layer]

    # data loading code
    train_loader, test_loader = load_data(data_path=args.data_path, batch_size=args.batch_size, nb_workers=args.n_workers)

    # parameters for the centroids optimizer
    opt_centroids_params_all = []

    # book-keeping for compression statistics (in MB)
    size_uncompressed = compute_size(student)
    size_index = 0
    size_centroids = 0
    size_other = size_uncompressed

    # teacher model
    teacher = models.__dict__[args.model](pretrained=True).cuda()
    teacher.eval()

    # Step 1: iteratively quantize the network layers (quantization + layer-wise centroids distillation)
    print('Step 1: Quantize network')
    t = time.time()
    top_1 = 0

    for layer in layers:
        #  gather input activations
        n_iter_activations = math.ceil(args.n_activations / args.batch_size)
        watcher = ActivationWatcher(student, layer=layer)
        in_activations_current = watcher.watch(train_loader, criterion, n_iter_activations)
        in_activations_current = in_activations_current[layer]

        # get weight matrix and detach it from the computation graph (.data should be enough, adding .detach() as a safeguard)
        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        # a 4-dimensional weight is treated as a convolution, anything else as fully connected
        is_conv = len(sizes) == 4

        # get padding and stride attributes
        padding = attrgetter(layer)(student).padding if is_conv else 0
        stride = attrgetter(layer)(student).stride if is_conv else 1
        groups = attrgetter(layer)(student).groups if is_conv else 1

        # block size, distinguish between fully connected and convolutional case
        if is_conv:
            out_features, in_features, k, _ = sizes
            # k > 1: spatial convolution, k == 1: pointwise convolution
            block_size = args.block_size_cv if k > 1 else args.block_size_pw
            n_centroids = args.n_centroids_cv if k > 1 else args.n_centroids_pw
            n_blocks = in_features * k * k // block_size
        else:
            k = 1
            out_features, in_features = sizes
            block_size = args.block_size_fc
            n_centroids = args.n_centroids_fc
            n_blocks = in_features // block_size

        # clamp number of centroids for stability
        powers = 2 ** np.arange(0, 16, 1)
        n_vectors = np.prod(sizes) / block_size
        idx_power = bisect_left(powers, n_vectors / args.n_centroids_threshold)
        n_centroids = min(n_centroids, powers[idx_power - 1])

        # compression ratios
        bits_per_weight = np.log2(n_centroids) / block_size

        # size of the assignments (index) for this layer, in MB
        size_index_layer = bits_per_weight * M.numel() / 8 / 1024 / 1024
        size_index += size_index_layer

        # centroids stored in float16
        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
        size_centroids += size_centroids_layer

        # float32 size of this layer, removed from the "other" (non-compressed) budget
        size_uncompressed_layer = M.numel() * 4 / 1024 / 1024
        size_other -= size_uncompressed_layer

        # number of samples
        n_samples = dynamic_sampling(layer)

        # print layer size
        print('Quantizing layer: {}, size: {}, n_blocks: {}, block size: {}, ' \
              'centroids: {}, bits/weight: {:.2f}, compressed size: {:.2f} MB'.format(
               layer, list(sizes), n_blocks, block_size, n_centroids,
               bits_per_weight, size_index_layer + size_centroids_layer))

        # quantizer
        quantizer = PQ(in_activations_current, M, n_activations=args.n_activations,
                       n_samples=n_samples, eps=args.eps, n_centroids=n_centroids,
                       n_iter=args.n_iter, n_blocks=n_blocks, k=k,
                       stride=stride, padding=padding, groups=groups)

        if len(args.restart) > 0:
            # do not quantize already quantized layers
            try:
                # load centroids and assignments if already stored
                quantizer.load(args.restart, layer)
                centroids = quantizer.centroids
                assignments = quantizer.assignments

                # quantize weight matrix
                M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
                attrgetter(layer + '.weight')(student).data = M_hat
                quantizer.save(args.save, layer)

                # optimizer for global finetuning
                parameters = [p for (n, p) in student.named_parameters() if layer in n and 'bias' not in n]
                centroids_params = {'params': parameters,
                                    'assignments': assignments,
                                    'kernel_size': k,
                                    'n_centroids': n_centroids,
                                    'n_blocks': n_blocks}
                opt_centroids_params_all.append(centroids_params)

                # proceed to next layer
                print('Layer already quantized, proceeding to next layer\n')
                continue

            # otherwise, quantize layer
            except FileNotFoundError:
                print('Quantizing layer')

        # quantize layer
        quantizer.encode()

        # assign quantized weight matrix
        M_hat = quantizer.decode()
        attrgetter(layer + '.weight')(student).data = M_hat

        # top1
        top_1 = evaluate(test_loader, student, criterion).item()

        # book-keeping
        print('Quantizing time: {:.0f}min, Top1 after quantization: {:.2f}\n'.format((time.time() - t) / 60, top_1))
        t = time.time()

        # Step 2: finetune centroids
        print('Finetuning centroids')

        # optimizer for centroids
        parameters = [p for (n, p) in student.named_parameters() if layer in n and 'bias' not in n]
        assignments = quantizer.assignments
        centroids_params = {'params': parameters,
                            'assignments': assignments,
                            'kernel_size': k,
                            'n_centroids': n_centroids,
                            'n_blocks': n_blocks}

        # remember centroids parameters to finetuning at the end
        opt_centroids_params = [centroids_params]
        opt_centroids_params_all.append(centroids_params)

        # custom optimizer
        optimizer_centroids = CentroidSGD(opt_centroids_params, lr=args.lr_centroids,
                                          momentum=args.momentum_centroids,
                                          weight_decay=args.weight_decay_centroids)

        # standard training loop
        n_iter = args.finetune_centroids
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer_centroids, step_size=1, gamma=0.1)

        for epoch in range(1):
            finetune_centroids(train_loader, student, teacher, criterion, optimizer_centroids, n_iter=n_iter)
            top_1 = evaluate(test_loader, student, criterion)
            scheduler.step()
            print('Epoch: {}, Top1: {:.2f}'.format(epoch, top_1))

        print('After {} iterations with learning rate {}, Top1: {:.2f}'.format(n_iter, args.lr_centroids, top_1))

        # book-keeping
        print('Finetuning centroids time: {:.0f}min, Top1 after finetuning centroids: {:.2f}\n'.format((time.time() - t) / 60, top_1))
        t = time.time()

        # saving: recompute the centroids from the finetuned weights before storing them
        M_hat = attrgetter(layer + '.weight')(student).data
        centroids = centroids_from_weights(M_hat, assignments, n_centroids, n_blocks)
        quantizer.centroids = centroids
        quantizer.save(args.save, layer)

    # End of compression + finetuning of centroids
    size_compressed = size_index + size_centroids + size_other
    print('End of compression, non-compressed teacher model: {:.2f}MB, compressed student model ' \
          '(indexing + centroids + other): {:.2f}MB + {:.2f}MB + {:.2f}MB = {:.2f}MB, compression ratio: {:.2f}x\n'.format(
          size_uncompressed, size_index, size_centroids, size_other, size_compressed, size_uncompressed / size_compressed))

    # Step 3: finetune whole network
    print('Step 3: Finetune whole network')
    t = time.time()

    # custom optimizer
    optimizer_centroids_all = CentroidSGD(opt_centroids_params_all, lr=args.lr_whole,
                                      momentum=args.momentum_whole,
                                      weight_decay=args.weight_decay_whole)

    # standard training loop
    n_iter = args.finetune_whole
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer_centroids_all, step_size=args.finetune_whole_step_size, gamma=0.1)

    for epoch in range(args.finetune_whole_epochs):
        student.train()
        finetune_centroids(train_loader, student, teacher, criterion, optimizer_centroids_all, n_iter=n_iter)
        top_1 = evaluate(test_loader, student, criterion)
        scheduler.step()
        print('Epoch: {}, Top1: {:.2f}'.format(epoch, top_1))

    # state dict of compressed model
    state_dict_compressed = {}

    # save conv1 (not quantized)
    state_dict_compressed['conv1'] = student.conv1.state_dict()

    # save biases of the classifier
    state_dict_compressed['fc_bias'] = {'bias': student.fc.bias}

    # save batch norms
    bn_layers = watcher._get_bn_layers()

    for bn_layer in bn_layers:
        state_dict_compressed[bn_layer] = attrgetter(bn_layer)(student).state_dict()

    # save quantized layers
    for layer in layers:

        # stats
        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        is_conv = len(sizes) == 4

        # block size, distinguish between fully connected and convolutional case
        if is_conv:
            out_features, in_features, k, _ = sizes
            block_size = args.block_size_cv if k > 1 else args.block_size_pw
            # same rule as in Step 1: pointwise (k == 1) convolutions have their own
            # centroid count, so the centroids are recomputed with the count the
            # assignments were produced with
            n_centroids = args.n_centroids_cv if k > 1 else args.n_centroids_pw
            n_blocks = in_features * k * k // block_size
        else:
            k = 1
            out_features, in_features = sizes
            block_size = args.block_size_fc
            n_centroids = args.n_centroids_fc
            n_blocks = in_features // block_size

        # clamp number of centroids for stability
        powers = 2 ** np.arange(0, 16, 1)
        n_vectors = np.prod(sizes) / block_size
        idx_power = bisect_left(powers, n_vectors / args.n_centroids_threshold)
        n_centroids = min(n_centroids, powers[idx_power - 1])

        # save
        # NOTE(review): this reuses the quantizer object left over from the last
        # Step 1 iteration and relies on load() replacing its stored state — confirm
        quantizer.load(args.save, layer)
        assignments = quantizer.assignments
        M_hat = attrgetter(layer + '.weight')(student).data
        centroids = centroids_from_weights(M_hat, assignments, n_centroids, n_blocks)
        quantizer.centroids = centroids
        quantizer.save(args.save, layer)
        # centroids are stored in half precision; assignments as int16 for layers
        # whose name contains 'fc' and as uint8 otherwise
        state_dict_layer = {
            'centroids': centroids.half(),
            'assignments': assignments.short() if 'fc' in layer else assignments.byte(),
            'n_blocks': n_blocks,
            'is_conv': is_conv,
            'k': k
        }
        state_dict_compressed[layer] = state_dict_layer

    # save model
    torch.save(state_dict_compressed, os.path.join(args.save, 'state_dict_compressed.pth'))

    # book-keeping
    print('Finetuning whole network time: {:.0f}min, Top1 after finetuning centroids: {:.2f}\n'.format((time.time() - t) / 60, top_1))
Beispiel #4
0
def main():
    """Reload a stored quantization into a trained network and validate it.

    Loads the same pickled model from ``./models/trained/resnet18_2.pth`` as
    both student and teacher, reports the accuracy of the unquantized student,
    then for every selected layer reloads the stored centroids/assignments
    from ``args.restart`` into the student. Finally it runs one round of
    ``finetune_centroids`` over all reloaded layers and prints the resulting
    top-1 accuracy, compression statistics and parameter count.

    Layers for which no stored quantization is found (or all layers when
    ``args.restart`` is empty) are left unquantized.

    Side effect: the parsed options are published as the module-level ``args``.
    """
    # get arguments
    global args
    args = parser.parse_args()
    # 'all' becomes the empty string, which is a substring of every layer name
    args.block = '' if args.block == 'all' else args.block

    PATH = "./models/trained"
    student = torch.load(os.path.join(PATH, "resnet18_2.pth"))
    teacher = torch.load(os.path.join(PATH, "resnet18_2.pth"))

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    student.to(device)
    teacher.to(device)
    # the teacher only provides targets, so keep it in eval mode
    teacher.eval()

    # follow the selected device instead of unconditionally calling .cuda(),
    # which would fail on the CPU fallback chosen above
    criterion = nn.CrossEntropyLoss().to(device)

    cudnn.benchmark = True

    # layers to quantize (we do not quantize the first 7x7 convolution layer)
    watcher = ActivationWatcher(student)
    layers = [layer for layer in watcher.layers[1:] if args.block in layer]

    # data loading code
    train_loader, test_loader = load_data(batch_size=args.batch_size, nb_workers=args.n_workers)

    # parameters for the centroids optimizer
    opt_centroids_params_all = []

    # book-keeping for compression statistics (in MB)
    size_uncompressed = compute_size(student)
    size_index = 0
    size_centroids = 0
    size_other = size_uncompressed

    # baseline: accuracy of the model before any quantized weights are loaded
    t1 = time.time()

    top_1 = evaluate(test_loader, student, criterion)
    print('Time taken validate 10,000 samples : {}s'.format(time.time() - t1))
    print('Top1 acc of teacher : {:.2f}'.format(top_1))

    # Step 1: reload the stored quantization of each layer
    print('Loading Quantized network')

    for layer in layers:
        #  gather input activations
        n_iter_activations = math.ceil(args.n_activations / args.batch_size)
        watcher = ActivationWatcher(student, layer=layer)
        in_activations_current = watcher.watch(train_loader, criterion, n_iter_activations)
        in_activations_current = in_activations_current[layer]

        # get weight matrix and detach it from the computation graph (.data should be enough, adding .detach() as a safeguard)
        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        # a 4-dimensional weight is treated as a convolution, anything else as fully connected
        is_conv = len(sizes) == 4

        # get padding and stride attributes
        padding = attrgetter(layer)(student).padding if is_conv else 0
        stride = attrgetter(layer)(student).stride if is_conv else 1
        groups = attrgetter(layer)(student).groups if is_conv else 1

        # block size, distinguish between fully connected and convolutional case
        if is_conv:
            out_features, in_features, k, _ = sizes
            # k > 1: spatial convolution, k == 1: pointwise convolution
            block_size = args.block_size_cv if k > 1 else args.block_size_pw
            n_centroids = args.n_centroids_cv if k > 1 else args.n_centroids_pw
            n_blocks = in_features * k * k // block_size
        else:
            k = 1
            out_features, in_features = sizes
            block_size = args.block_size_fc
            n_centroids = args.n_centroids_fc
            n_blocks = in_features // block_size

        # clamp number of centroids for stability
        powers = 2 ** np.arange(0, 16, 1)
        n_vectors = np.prod(sizes) / block_size
        idx_power = bisect_left(powers, n_vectors / args.n_centroids_threshold)
        n_centroids = min(n_centroids, powers[idx_power - 1])

        # compression ratios
        bits_per_weight = np.log2(n_centroids) / block_size

        # size of the assignments (index) for this layer, in MB
        size_index_layer = bits_per_weight * M.numel() / 8 / 1024 / 1024
        size_index += size_index_layer

        # centroids stored in float16
        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
        size_centroids += size_centroids_layer

        # float32 size of this layer, removed from the "other" (non-compressed) budget
        size_uncompressed_layer = M.numel() * 4 / 1024 / 1024
        size_other -= size_uncompressed_layer

        # number of samples
        n_samples = dynamic_sampling(layer)

        # print layer size
        print('Quantized layer: {}, size: {}, n_blocks: {}, block size: {}, ' \
              'centroids: {}, bits/weight: {:.2f}, compressed size: {:.2f} MB'.format(
               layer, list(sizes), n_blocks, block_size, n_centroids,
               bits_per_weight, size_index_layer + size_centroids_layer))

        # quantizer
        quantizer = PQ(in_activations_current, M, n_activations=args.n_activations,
                       n_samples=n_samples, eps=args.eps, n_centroids=n_centroids,
                       n_iter=args.n_iter, n_blocks=n_blocks, k=k,
                       stride=stride, padding=padding, groups=groups)

        if len(args.restart) > 0:
            # do not quantize already quantized layers
            try:
                # load centroids and assignments if already stored
                quantizer.load(args.restart, layer)
                centroids = quantizer.centroids
                assignments = quantizer.assignments

                # quantize weight matrix
                M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
                attrgetter(layer + '.weight')(student).data = M_hat
                quantizer.save(args.save, layer)

                # optimizer for global finetuning
                parameters = [p for (n, p) in student.named_parameters() if layer in n and 'bias' not in n]
                centroids_params = {'params': parameters,
                                    'assignments': assignments,
                                    'kernel_size': k,
                                    'n_centroids': n_centroids,
                                    'n_blocks': n_blocks}
                opt_centroids_params_all.append(centroids_params)

                # proceed to next layer
                print('codebook loaded, proceeding to next layer\n')
                continue

            # no stored codebook: this script does not quantize, it only reports it
            except FileNotFoundError:
                print('Quantize layer first')

    # End of compression + finetuning of centroids
    size_compressed = size_index + size_centroids + size_other
    print('Non-compressed teacher model: {:.2f}MB, compressed student model ' \
          '(indexing + centroids + other): {:.2f}MB + {:.2f}MB + {:.2f}MB = {:.2f}MB, compression ratio: {:.2f}x\n'.format(
          size_uncompressed, size_index, size_centroids, size_other, size_compressed, size_uncompressed / size_compressed))

    # Step 3: finetune whole network once, then validate it
    print('Validating whole network')

    # custom optimizer
    optimizer_centroids_all = CentroidSGD(opt_centroids_params_all, lr=args.lr_whole,
                                      momentum=args.momentum_whole,
                                      weight_decay=args.weight_decay_whole)

    # standard training loop (a single pass, no epoch loop)
    n_iter = args.finetune_whole
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer_centroids_all, step_size=args.finetune_whole_step_size, gamma=0.1)

    student.train()
    finetune_centroids(train_loader, student, teacher, criterion, optimizer_centroids_all, n_iter=n_iter)
    t1 = time.time()

    top_1 = evaluate(test_loader, student, criterion)
    print('Time taken validate 10,000 samples : {}s'.format(time.time() - t1))
    scheduler.step()
    print('Top1 acc: {:.2f}'.format(top_1))

    print('Total parameters: {}'.format(sum(p.numel() for p in student.parameters() if p.requires_grad)))
Beispiel #5
0
def main():
    """Rebuild a quantized ResNet with a multi-layer classifier head and evaluate it.

    Options come from the module-level ``parser`` (``state_dict_compressed``,
    ``model``, ``device``, ``block``, ``data_path``, ``batch_size``,
    ``n_workers``). The network is first initialised from
    ``./running_batchnorm.pth``, then its layers are overwritten from the
    compressed checkpoint, and the top-1 accuracy on the test loader is printed.

    Side effect: the parsed options are published as the module-level ``args``.
    """
    global args
    args = parser.parse_args()
    device = args.device

    # map_location remaps the stored tensors onto the requested device, so a
    # checkpoint written from a GPU can still be read on a CPU-only machine
    state_dict_compressed = torch.load(args.state_dict_compressed, map_location=device)
    # 'all' becomes the empty string, which is a substring of every layer name
    args.block = '' if args.block == 'all' else args.block

    # instantiating model ('resnet50_semisup' shares the plain resnet50 architecture)
    model = 'resnet50' if args.model == 'resnet50_semisup' else args.model
    model = resnet_models.__dict__[model](pretrained=False).to(device)
    # start from a full state dict stored next to the script; the quantized
    # layers, batch-norm affine parameters and classifier biases are overwritten below
    model.load_state_dict(torch.load('./running_batchnorm.pth', map_location=device))
    criterion = nn.CrossEntropyLoss()

    # random-erasing augmentation, only referenced by the 'train' transform
    transform_raner = Augmentor.Pipeline()
    transform_raner.random_erasing(probability=0.5, rectangle_area=0.15)
    transform_raner = transform_raner.torch_transform()
    _, test_loader = load_any_data(
        data_path=args.data_path,
        batch_size=args.batch_size,
        nb_workers=args.n_workers,
        transforms_dict={
            'train':
            transforms.Compose([
                transform_raner,
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.RandomRotation(180, resample=False, expand=False),
                transforms.Resize(224),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ]),
            'val':
            transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406],
                                     [0.229, 0.224, 0.225])
            ])
        })
    watcher = ActivationWatcherResNet(model)

    # conv1 layer (non-compressed): stored as a regular state dict
    layer = 'conv1'
    state_dict_layer = to_device(state_dict_compressed[layer], device)
    attrgetter(layer)(model).load_state_dict(state_dict_layer)
    attrgetter(layer)(model).float()

    # compressed layers: every watched layer but the first whose name contains args.block
    compressed_layers = [
        layer for layer in watcher.layers[1:] if args.block in layer
    ]

    # 2 more layers non-compressed for semi-supervised ResNet50
    if args.model == 'resnet50_semisup':
        non_compressed_layers = ['layer1.0.conv3', 'layer1.0.downsample.0']
        for layer in non_compressed_layers:
            compressed_layers.remove(layer)
            state_dict_layer = to_device(state_dict_compressed[layer], device)
            attrgetter(layer)(model).load_state_dict(state_dict_layer)
            attrgetter(layer)(model).float()

    for layer in compressed_layers:
        # recover centroids and assignments (cast back to float / long for computation)
        state_dict_layer = state_dict_compressed[layer]
        centroids = state_dict_layer['centroids'].float().to(device)
        assignments = state_dict_layer['assignments'].long().to(device)
        n_blocks = state_dict_layer['n_blocks']
        is_conv = state_dict_layer['is_conv']
        k = state_dict_layer['k']

        # instantiate matrix and overwrite the layer weight with it
        M_hat = weight_from_centroids(centroids, assignments, n_blocks, k,
                                      is_conv)
        attrgetter(layer + '.weight')(model).data = M_hat

    # batch norms: only the affine weight and bias are copied from the checkpoint
    bn_layers = watcher._get_bn_layers()
    print(bn_layers)
    for layer in bn_layers:
        state_dict_layer = to_device(state_dict_compressed[layer], device)
        attrgetter(layer)(
            model).weight.data = state_dict_layer['weight'].float().to(device)
        attrgetter(layer)(
            model).bias.data = state_dict_layer['bias'].float().to(device)

    # classifier biases: the head has three biased layers, stored together under
    # 'fc_bias' as 'bias_1' .. 'bias_3'; move that entry to the device once
    fc_bias_state = to_device(state_dict_compressed['fc_bias'], device)
    layers = ['fc.0', 'fc.3', 'fc.6']
    for i, layer in enumerate(layers):
        attrgetter(layer + '.bias')(model).data = fc_bias_state['bias_' +
                                                                str(i + 1)]

    # evaluate the model
    top_1 = evaluate(test_loader,
                     model,
                     criterion,
                     device=device,
                     verbose=True).item()
    print('Top-1 accuracy of quantized model: {:.2f}'.format(top_1))
def _centroid_param_group(model, layer, assignments, k, n_centroids,
                          n_blocks):
    """Build the parameter-group dict handed to CentroidSGD for one layer.

    The 'params' entry holds the layer's current (already quantized) weight
    tensor, read from `model` at call time, so this must be called after the
    quantized weights have been written into the layer.
    """
    return {
        'params': [attrgetter(layer + '.weight.data')(model).detach()],
        'assignments': assignments,
        'kernel_size': k,
        'n_centroids': n_centroids,
        'n_blocks': n_blocks
    }


def main():
    """Quantize the model from `real_nvp_model` layer by layer with PQ.

    Two copies of the pretrained model are created: a `student`, whose
    weights are progressively replaced by their product-quantized
    reconstruction, and a `teacher`, which is passed unchanged to
    `finetune_centroids`. For every selected layer the function:

    1. collects the layer's input activations on the CIFAR10 train loader,
    2. either reloads a previously saved quantization (when `restart` is
       set and `quantizer.load` finds it) or runs `PQ.encode`/`decode`,
    3. fine-tunes the centroids with `CentroidSGD`, prints the result of
       `evaluate` on the test loader, and saves the quantizer.

    Takes no arguments and returns nothing; all hyper-parameters are
    hard-coded below.
    """
    torch.cuda.empty_cache()

    student = real_nvp_model(
        pretrained=True)  # resnet.resnet18_1(pretrained=True).cuda()
    student.eval()
    cudnn.benchmark = True

    criterion = real_nvp_loss.RealNVPLoss().cuda()

    # CIFAR10 loaders: random horizontal flips for training, plain tensors
    # for testing.
    transform_train = transforms.Compose(
        [transforms.RandomHorizontalFlip(),
         transforms.ToTensor()])
    trainset = torchvision.datasets.CIFAR10(root='data',
                                            train=True,
                                            download=True,
                                            transform=transform_train)
    trainloader = data.DataLoader(trainset,
                                  batch_size=16,
                                  shuffle=True,
                                  num_workers=0)

    transform_test = transforms.Compose([transforms.ToTensor()])
    testset = torchvision.datasets.CIFAR10(root='data',
                                           train=False,
                                           download=True,
                                           transform=transform_test)
    testloader = data.DataLoader(testset,
                                 batch_size=16,
                                 shuffle=False,
                                 num_workers=0)

    # parameter groups of every quantized layer, accumulated across layers
    opt_centroids_params_all = []

    # book-keeping for compression statistics (in MB)
    # NOTE(review): these totals are accumulated below but never reported
    # inside this function.
    size_uncompressed = compute_size(student)
    size_index = 0
    size_centroids = 0
    size_other = size_uncompressed

    teacher = real_nvp_model(pretrained=True)
    teacher.eval()

    # Skip the first watched layer, then keep every second one of the rest
    # (positions 2, 4, ... of watcher.layers), capped at 50 layers.
    watcher = ActivationWatcher(student)
    layers = watcher.layers[1:][1::2][:50]

    # Loop-invariant settings.
    # NOTE(review): the divisor 32 does not match the loaders' batch size of
    # 16 -- confirm how many activations `watch` is expected to gather.
    n_iter_activations = math.ceil(1024 / 32)
    powers = 2**np.arange(0, 16, 1)
    n_samples = 1000
    restart = True

    for layer in layers:
        print(layer)
        torch.cuda.empty_cache()

        # input activations of the current layer
        watcher = ActivationWatcher(student, layer=layer)
        in_activations_current = watcher.watch(trainloader, criterion,
                                               n_iter_activations)
        in_activations_current = in_activations_current[layer]

        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        # a 4-D weight is treated as a convolution, anything else as linear
        is_conv = len(sizes) == 4

        if is_conv:
            module = attrgetter(layer)(student)
            padding = module.padding
            stride = module.stride
            groups = module.groups

            _, in_features, k, _ = sizes
            block_size = 9 if k > 1 else 4
            n_centroids = 128
            n_blocks = in_features * k * k // block_size
        else:
            padding, stride, groups = 0, 1, 1

            k = 1
            _, in_features = sizes
            block_size = 4
            n_centroids = 256
            n_blocks = in_features // block_size

        # Cap the codebook size by a power of two derived from a quarter of
        # the number of block vectors in this layer.
        # NOTE(review): when n_vectors / 4 <= 1, idx_power is 0 and the -1
        # index wraps to the largest power, so no cap is applied -- confirm
        # this cannot happen for the layers being quantized.
        n_vectors = np.prod(sizes) / block_size
        idx_power = bisect_left(powers, n_vectors / 4)
        n_centroids = min(n_centroids, powers[idx_power - 1])

        # compression ratios
        bits_per_weight = np.log2(n_centroids) / block_size
        # size of the assignment indices
        size_index_layer = bits_per_weight * M.numel() / 8 / 1024 / 1024
        size_index += size_index_layer
        # centroids stored in float16
        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
        size_centroids += size_centroids_layer
        # size of non-compressed layers, e.g. BatchNorms or first 7x7 convolution
        size_uncompressed_layer = M.numel() * 4 / 1024 / 1024
        size_other -= size_uncompressed_layer

        # quantizer
        quantizer = PQ(in_activations_current,
                       M,
                       n_activations=1024,
                       n_samples=n_samples,
                       eps=1e-8,
                       n_centroids=n_centroids,
                       n_iter=100,
                       n_blocks=n_blocks,
                       k=k,
                       stride=stride,
                       padding=padding,
                       groups=groups)

        if restart:
            # Reuse a previously saved quantization of this layer if one
            # exists; a FileNotFoundError anywhere in this attempt falls
            # through to quantizing the layer from scratch.
            try:
                quantizer.load('', layer)
                centroids = quantizer.centroids
                assignments = quantizer.assignments
                # quantize weight matrix
                M_hat = weight_from_centroids(centroids, assignments, n_blocks,
                                              k, is_conv)
                attrgetter(layer + '.weight')(student).data = M_hat
                quantizer.save('', layer)
                # parameter group kept for global finetuning
                opt_centroids_params_all.append(
                    _centroid_param_group(student, layer, assignments, k,
                                          n_centroids, n_blocks))
                # proceed to next layer
                print('Layer already quantized, proceeding to next layer\n')
                continue
            except FileNotFoundError:
                print('Quantizing layer')

        # quantize layer
        quantizer.encode()
        M_hat = quantizer.decode()
        attrgetter(layer + '.weight')(student).data = M_hat

        assignments = quantizer.assignments
        centroids_params = _centroid_param_group(student, layer, assignments,
                                                 k, n_centroids, n_blocks)
        opt_centroids_params_all.append(centroids_params)

        # finetune the centroids of this layer only
        optimizer_centroids = CentroidSGD([centroids_params],
                                          lr=0.01,
                                          momentum=0.9,
                                          weight_decay=0.0001)
        finetune_centroids(trainloader,
                           student.eval(),
                           teacher,
                           criterion,
                           optimizer_centroids,
                           n_iter=100)

        bpd = evaluate(testloader, student, criterion)
        print('bits per dim:{:.4f} '.format(bpd))

        # saving: recover the finetuned centroids from the layer weights
        M_hat = attrgetter(layer + '.weight')(student).data
        centroids = centroids_from_weights(M_hat, assignments, n_centroids,
                                           n_blocks)
        quantizer.centroids = centroids
        quantizer.save('', layer)
Beispiel #7
0
        output_dir=args.output_dir,
        overwrite_output_dir=False,
        disable_tqdm=True,
        evaluation_strategy="epoch",
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        logging_dir=args.output_dir,
        logging_steps=10,
        load_best_model_at_end=False,
        metric_for_best_model='overall',
        greater_is_better=True)

    args.eval_batch_size = 1
    # unpickle results
    results, preds_list, probs_list, out_label_ids = evaluate(args,
                                                              test_dataset,
                                                              model,
                                                              test_mode=True)
    # print("Results w/o ensemble: ", results)
    # logging.info(results)

    print("probs_list = ", probs_list)
    print("out_label_ids = ", out_label_ids)
    print("evaluate results = ", results)
    results = evaluate_all(out_label_ids, probs_list)
    print("evaluate_all results = ", results)

    print("shape(probs_list) = ", probs_list.shape)
    print("shape(out_label_ids) = ", out_label_ids.shape)

    score_paths = {
        'charbert_xs_open_ua':
Beispiel #8
0
def main(args):
    """Load data, build a BERT/CharacterBERT model, then train and/or test it.

    Reads the following attributes of `args`: `embedding`, `do_lower_case`,
    `task` ('classification' or 'sequence_labelling'), `validation_ratio`,
    `device`, `do_train`, `do_predict` and `output_dir`. `args` is also
    forwarded as-is to `build_features`, `train` and `evaluate`.

    When `args.do_predict` is set, the model saved in `args.output_dir` is
    reloaded, evaluated on the test split, and the metrics are written to
    `performance_on_test_set.txt` in that directory.

    Raises:
        NotImplementedError: if `args.task` is not one of the two supported
            tasks.
    """

    # --------------------------------- DATA ---------------------------------

    # Tokenizer
    logging.disable(logging.INFO)
    try:
        tokenizer = BertTokenizer.from_pretrained(
            os.path.join('pretrained-models', args.embedding),
            do_lower_case=args.do_lower_case)
    except OSError:
        # For CharacterBert models use BertTokenizer.basic_tokenizer for tokenization
        # and CharacterIndexer for indexing
        tokenizer = BertTokenizer.from_pretrained(
            os.path.join('pretrained-models', 'bert-base-uncased'),
            do_lower_case=args.do_lower_case)
        tokenizer = tokenizer.basic_tokenizer
        characters_indexer = CharacterIndexer()
    logging.disable(logging.NOTSET)

    tokenization_function = tokenizer.tokenize

    # The loader only depends on the task, so pick it once up front.
    if args.task == 'classification':
        load_dataset = load_classification_dataset
    elif args.task == 'sequence_labelling':
        load_dataset = load_sequence_labelling_dataset
    else:
        raise NotImplementedError

    # Pre-processing: apply basic tokenization (both) then split into wordpieces (BERT only)
    data = {}
    for split in ['train', 'test']:
        data[split] = load_dataset(step=split,
                                   do_lower_case=args.do_lower_case)
        retokenize(data[split], tokenization_function)

    logging.info('Splitting training data into train / validation sets...')
    # The first `validation_ratio` share of the training data becomes the
    # validation set; the remainder stays as training data.
    n_validation = int(args.validation_ratio * len(data['train']))
    data['validation'] = data['train'][:n_validation]
    data['train'] = data['train'][n_validation:]
    logging.info('New number of training sequences: %d', len(data['train']))
    logging.info('New number of validation sequences: %d',
                 len(data['validation']))

    # Count target labels or classes
    all_examples = data['train'] + data['validation'] + data['test']
    if args.task == 'classification':
        counter_all = Counter(example.label for example in all_examples)
        counter = Counter(example.label for example in data['train'])

        # Maximum sequence length is either 512 or maximum token sequence length + 3
        longest = max(
            len(e.tokens_a if e.tokens_b is None else e.tokens_a + e.tokens_b)
            for e in all_examples)
        max_seq_length = min(512, 3 + longest)
    elif args.task == 'sequence_labelling':
        counter_all = Counter(label for example in all_examples
                              for label in example.label_sequence)
        counter = Counter(label for example in data['train']
                          for label in example.label_sequence)

        # Maximum sequence length is either 512 or maximum token sequence length + 5
        longest = max(len(e.token_sequence) for e in all_examples)
        max_seq_length = min(512, 5 + longest)
    else:
        raise NotImplementedError
    labels = sorted(counter_all.keys())
    num_labels = len(labels)

    logging.info("Goal: predict the following labels")
    for i, label in enumerate(labels):
        # the count shown is the label's frequency in the training split only
        logging.info("* %s: %s (count: %s)", label, i, counter[label])

    uses_characters = 'character' in args.embedding

    # Input features: list[token indices] (BERT) or list[list[character indices]] (CharacterBERT)
    pad_token_id = None
    if not uses_characters:
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                        ])[0]

    pad_token_label_id = None
    if args.task == 'sequence_labelling':
        pad_token_label_id = CrossEntropyLoss().ignore_index

    dataset = {}
    logging.info("Maximum sequence lenght: %s", max_seq_length)
    feature_tokenizer = characters_indexer if uses_characters else tokenizer
    for split in data:
        dataset[split] = build_features(
            args,
            split=split,
            tokenizer=feature_tokenizer,
            examples=data[split],
            labels=labels,
            pad_token_id=pad_token_id,
            pad_token_label_id=pad_token_label_id,
            max_seq_length=max_seq_length)

    del data  # Not used anymore

    # --------------------------------- MODEL ---------------------------------

    # Model class for the task; reused when reloading the best model below.
    if args.task == 'classification':
        model_class = BertForSequenceClassification
    elif args.task == 'sequence_labelling':
        model_class = BertForTokenClassification
    else:
        raise NotImplementedError

    logging.info('Loading `%s` model...', args.embedding)
    logging.disable(logging.INFO)
    pretrained_path = os.path.join('pretrained-models', args.embedding)
    config = BertConfig.from_pretrained(pretrained_path,
                                        num_labels=num_labels)
    if not uses_characters:
        model = model_class.from_pretrained(pretrained_path, config=config)
    else:
        # Build the task head from the config, then swap in the pretrained
        # CharacterBERT encoder.
        model = model_class(config=config)
        model.bert = CharacterBertModel.from_pretrained(pretrained_path,
                                                        config=config)
    logging.disable(logging.NOTSET)

    model.to(args.device)
    logging.info('Model:\n%s', model)

    # ------------------------------ TRAIN / EVAL ------------------------------

    # Log args
    logging.info('Using the following arguments for training:')
    for k, v in vars(args).items():
        logging.info("* %s: %s", k, v)

    # Defaults so that a predict-only run (do_predict without do_train) can
    # still write the metrics file instead of failing on unbound names.
    best_val_metric = None
    best_val_epoch = None

    # Training
    if args.do_train:
        global_step, train_loss, best_val_metric, best_val_epoch = train(
            args=args,
            dataset=dataset,
            model=model,
            tokenizer=tokenizer,
            labels=labels,
            pad_token_label_id=pad_token_label_id)
        logging.info("global_step = %s, average training loss = %s",
                     global_step, train_loss)
        logging.info("Best performance: Epoch=%d, Value=%s", best_val_epoch,
                     best_val_metric)

    # Evaluation on test data
    if args.do_predict:

        # Load best model
        logging.disable(logging.INFO)
        if not uses_characters:
            model = model_class.from_pretrained(args.output_dir)
        else:
            state_dict = torch.load(os.path.join(args.output_dir,
                                                 'pytorch_model.bin'),
                                    map_location='cpu')
            model = model_class(config=config)
            model.bert = CharacterBertModel(config=config)
            model.load_state_dict(state_dict, strict=True)
        logging.disable(logging.NOTSET)
        model.to(args.device)

        # Compute predictions and metrics
        results, _ = evaluate(args=args,
                              eval_dataset=dataset["test"],
                              model=model,
                              labels=labels,
                              pad_token_label_id=pad_token_label_id)

        # Save metrics (validation values read "None" when no training ran)
        with open(os.path.join(args.output_dir, 'performance_on_test_set.txt'),
                  'w') as f:
            f.write(f'best validation score: {best_val_metric}\n')
            f.write(f'best validation epoch: {best_val_epoch}\n')
            f.write('--- Performance on test set ---\n')
            for k, v in results.items():
                f.write(f'{k}: {v}\n')