def main():
    """Rebuild a quantized ResNet from a compressed state dict and report top-1 accuracy.

    Reads CLI options from the module-level `parser`, loads the compressed
    checkpoint (per-layer centroids + assignments), re-materializes every
    quantized weight matrix via `weight_from_centroids`, restores the
    non-compressed layers verbatim, then runs one evaluation pass.
    """
    global args
    args = parser.parse_args()
    device = args.device
    # NOTE(review): no map_location — assumes the checkpoint's save device is
    # visible at load time; confirm before running CPU-only.
    state_dict_compressed = torch.load(args.state_dict_compressed)

    # instantiating model ('resnet50_semisup' shares the plain resnet50 architecture)
    model = 'resnet50' if args.model == 'resnet50_semisup' else args.model
    model = resnet_models.__dict__[model](pretrained=False).to(device)
    criterion = nn.CrossEntropyLoss()
    _, test_loader = load_data(batch_size=args.batch_size, nb_workers=args.n_workers)
    watcher = ActivationWatcherResNet(model)

    # conv1 layer (non-compressed): stored as a plain module state dict
    layer = 'conv1'
    state_dict_layer = to_device(state_dict_compressed[layer], device)
    attrgetter(layer)(model).load_state_dict(state_dict_layer)
    attrgetter(layer)(model).float()  # checkpoint tensors may be half precision

    # compressed layers: everything the watcher tracks except conv1
    compressed_layers = watcher.layers[1:]

    # 2 more layers non-compressed for semi-supervised ResNet50
    if args.model == 'resnet50_semisup':
        non_compressed_layers = ['layer1.0.conv3', 'layer1.0.downsample.0']
        for layer in non_compressed_layers:
            compressed_layers.remove(layer)
            state_dict_layer = to_device(state_dict_compressed[layer], device)
            attrgetter(layer)(model).load_state_dict(state_dict_layer)
            attrgetter(layer)(model).float()

    for layer in compressed_layers:
        # recover centroids and assignments (stored in reduced precision; widen for compute)
        state_dict_layer = state_dict_compressed[layer]
        centroids = state_dict_layer['centroids'].float().to(device)
        assignments = state_dict_layer['assignments'].long().to(device)
        n_blocks = state_dict_layer['n_blocks']
        is_conv = state_dict_layer['is_conv']
        k = state_dict_layer['k']
        # instantiate matrix: decode product-quantized weights back to a dense tensor
        M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
        attrgetter(layer + '.weight')(model).data = M_hat

    # batch norms: only weight/bias are restored here
    # NOTE(review): running_mean/var are not loaded from the checkpoint — confirm
    # they are expected to stay at the freshly-instantiated model's values.
    bn_layers = watcher._get_bn_layers()
    for layer in bn_layers:
        state_dict_layer = to_device(state_dict_compressed[layer], device)
        attrgetter(layer)(model).weight.data = state_dict_layer['weight'].float().to(device)
        attrgetter(layer)(model).bias.data = state_dict_layer['bias'].float().to(device)

    # classifier bias (the fc weight itself is one of the compressed layers)
    layer = 'fc'
    state_dict_layer = to_device(state_dict_compressed['fc_bias'], device)
    attrgetter(layer + '.bias')(model).data = state_dict_layer['bias']

    # evaluate the model
    top_1 = evaluate(test_loader, model, criterion, device=device).item()
    print('Top-1 accuracy of quantized model: {:.2f}'.format(top_1))
def main():
    """Rebuild a quantized ResNet from a compressed state dict and evaluate it on CIFAR-10.

    Same reconstruction procedure as the ImageNet variant, but the test loader
    is built directly from torchvision's CIFAR-10 and the checkpoint is loaded
    onto CPU first (`map_location='cpu'`) so it works without a GPU.
    """
    # Improvement: removed dead commented-out code (stale `#device = args.device`,
    # a duplicated CIFAR-10 loader block and an unused `load_data` call) — the
    # live statements below are the single source of truth.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # CIFAR-10 test split, raw tensors only (no normalization)
    transform_test = transforms.Compose([transforms.ToTensor()])
    testset = torchvision.datasets.CIFAR10(root='data', train=False,
                                           download=True, transform=transform_test)
    test_loader = data.DataLoader(testset, batch_size=32, shuffle=False, num_workers=10)

    global args
    args = parser.parse_args()
    state_dict_compressed = torch.load(args.state_dict_compressed, map_location='cpu')

    # instantiating model ('resnet50_semisup' shares the plain resnet50 architecture)
    model = 'resnet50' if args.model == 'resnet50_semisup' else args.model
    model = resnet_models.__dict__[model](pretrained=False).to(device)
    criterion = nn.CrossEntropyLoss()
    watcher = ActivationWatcherResNet(model)

    # conv1 layer (non-compressed)
    layer = 'conv1'
    state_dict_layer = to_device(state_dict_compressed[layer], device)
    attrgetter(layer)(model).load_state_dict(state_dict_layer)
    attrgetter(layer)(model).float()  # checkpoint tensors may be half precision

    # compressed layers
    compressed_layers = watcher.layers[1:]

    # 2 more layers non-compressed for semi-supervised ResNet50
    if args.model == 'resnet50_semisup':
        non_compressed_layers = ['layer1.0.conv3', 'layer1.0.downsample.0']
        for layer in non_compressed_layers:
            compressed_layers.remove(layer)
            state_dict_layer = to_device(state_dict_compressed[layer], device)
            attrgetter(layer)(model).load_state_dict(state_dict_layer)
            attrgetter(layer)(model).float()

    for layer in compressed_layers:
        # recover centroids and assignments (widen reduced-precision storage)
        state_dict_layer = state_dict_compressed[layer]
        centroids = state_dict_layer['centroids'].float().to(device)
        assignments = state_dict_layer['assignments'].long().to(device)
        n_blocks = state_dict_layer['n_blocks']
        is_conv = state_dict_layer['is_conv']
        k = state_dict_layer['k']
        # instantiate matrix: decode product-quantized weights to a dense tensor
        M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
        attrgetter(layer + '.weight')(model).data = M_hat

    # batch norms: weight/bias only are restored from the checkpoint
    bn_layers = watcher._get_bn_layers()
    for layer in bn_layers:
        state_dict_layer = to_device(state_dict_compressed[layer], device)
        attrgetter(layer)(model).weight.data = state_dict_layer['weight'].float().to(device)
        attrgetter(layer)(model).bias.data = state_dict_layer['bias'].float().to(device)

    # classifier bias (the fc weight itself is one of the compressed layers)
    layer = 'fc'
    state_dict_layer = to_device(state_dict_compressed['fc_bias'], device)
    attrgetter(layer + '.bias')(model).data = state_dict_layer['bias']

    # evaluate the model
    top_1 = evaluate(test_loader, model, criterion, device=device).item()
    print('Top-1 accuracy of quantized model: {:.2f}'.format(top_1))
def main():
    """Quantize a pretrained network layer-by-layer, finetune centroids, then the whole net.

    Pipeline (student/teacher distillation setup):
      Step 1 — for each layer: gather input activations, product-quantize the
               weight matrix with `PQ`, then finetune that layer's centroids.
      Step 3 — finetune all centroids jointly, then serialize the compressed
               model (centroids in fp16, assignments in byte/short) to disk.

    Bug fix: in the final save loop, `n_centroids` for conv layers was taken
    unconditionally from `args.n_centroids_cv`, while the quantization loop
    uses `args.n_centroids_pw` for pointwise (1x1) convs — so pointwise layers
    could be saved with a wrong centroid count passed to
    `centroids_from_weights`. The save loop now mirrors the quantization loop.
    """
    # get arguments
    global args
    args = parser.parse_args()
    args.block = '' if args.block == 'all' else args.block

    # student model to quantize
    student = models.__dict__[args.model](pretrained=True).cuda()
    student.eval()
    criterion = nn.CrossEntropyLoss().cuda()
    cudnn.benchmark = True

    # layers to quantize (we do not quantize the first 7x7 convolution layer)
    watcher = ActivationWatcher(student)
    layers = [layer for layer in watcher.layers[1:] if args.block in layer]

    # data loading code
    train_loader, test_loader = load_data(data_path=args.data_path,
                                          batch_size=args.batch_size,
                                          nb_workers=args.n_workers)

    # parameters for the centroids optimizer
    opt_centroids_params_all = []

    # book-keeping for compression statistics (in MB)
    size_uncompressed = compute_size(student)
    size_index = 0
    size_centroids = 0
    size_other = size_uncompressed

    # teacher model (frozen copy used as distillation target)
    teacher = models.__dict__[args.model](pretrained=True).cuda()
    teacher.eval()

    # Step 1: iteratively quantize the network layers
    # (quantization + layer-wise centroids distillation)
    print('Step 1: Quantize network')
    t = time.time()
    top_1 = 0

    for layer in layers:
        # gather input activations
        n_iter_activations = math.ceil(args.n_activations / args.batch_size)
        watcher = ActivationWatcher(student, layer=layer)
        in_activations_current = watcher.watch(train_loader, criterion, n_iter_activations)
        in_activations_current = in_activations_current[layer]

        # get weight matrix and detach it from the computation graph
        # (.data should be enough, adding .detach() as a safeguard)
        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        is_conv = len(sizes) == 4

        # get padding and stride attributes
        padding = attrgetter(layer)(student).padding if is_conv else 0
        stride = attrgetter(layer)(student).stride if is_conv else 1
        groups = attrgetter(layer)(student).groups if is_conv else 1

        # block size, distinguish between fully connected and convolutional case
        if is_conv:
            out_features, in_features, k, _ = sizes
            block_size = args.block_size_cv if k > 1 else args.block_size_pw
            n_centroids = args.n_centroids_cv if k > 1 else args.n_centroids_pw
            n_blocks = in_features * k * k // block_size
        else:
            k = 1
            out_features, in_features = sizes
            block_size = args.block_size_fc
            n_centroids = args.n_centroids_fc
            n_blocks = in_features // block_size

        # clamp number of centroids for stability (at most n_vectors / threshold)
        powers = 2 ** np.arange(0, 16, 1)
        n_vectors = np.prod(sizes) / block_size
        idx_power = bisect_left(powers, n_vectors / args.n_centroids_threshold)
        n_centroids = min(n_centroids, powers[idx_power - 1])

        # compression ratios: number of bits per weight
        bits_per_weight = np.log2(n_centroids) / block_size
        size_index_layer = bits_per_weight * M.numel() / 8 / 1024 / 1024
        size_index += size_index_layer

        # centroids stored in float16
        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
        size_centroids += size_centroids_layer

        # size of non-compressed layers, e.g. BatchNorms or first 7x7 convolution
        size_uncompressed_layer = M.numel() * 4 / 1024 / 1024
        size_other -= size_uncompressed_layer

        # number of samples
        n_samples = dynamic_sampling(layer)

        # print layer size
        print('Quantizing layer: {}, size: {}, n_blocks: {}, block size: {}, ' \
              'centroids: {}, bits/weight: {:.2f}, compressed size: {:.2f} MB'.format(
              layer, list(sizes), n_blocks, block_size, n_centroids, bits_per_weight,
              size_index_layer + size_centroids_layer))

        # quantizer
        quantizer = PQ(in_activations_current, M, n_activations=args.n_activations,
                       n_samples=n_samples, eps=args.eps, n_centroids=n_centroids,
                       n_iter=args.n_iter, n_blocks=n_blocks, k=k,
                       stride=stride, padding=padding, groups=groups)

        if len(args.restart) > 0:
            # do not quantize already quantized layers
            try:
                # load centroids and assignments if already stored
                quantizer.load(args.restart, layer)
                centroids = quantizer.centroids
                assignments = quantizer.assignments

                # quantize weight matrix
                M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
                attrgetter(layer + '.weight')(student).data = M_hat
                quantizer.save(args.save, layer)

                # optimizer for global finetuning
                parameters = [p for (n, p) in student.named_parameters()
                              if layer in n and 'bias' not in n]
                centroids_params = {'params': parameters,
                                    'assignments': assignments,
                                    'kernel_size': k,
                                    'n_centroids': n_centroids,
                                    'n_blocks': n_blocks}
                opt_centroids_params_all.append(centroids_params)

                # proceed to next layer
                print('Layer already quantized, proceeding to next layer\n')
                continue

            # otherwise, quantize layer
            except FileNotFoundError:
                print('Quantizing layer')

        # quantize layer
        quantizer.encode()

        # assign quantized weight matrix
        M_hat = quantizer.decode()
        attrgetter(layer + '.weight')(student).data = M_hat

        # top1
        top_1 = evaluate(test_loader, student, criterion).item()

        # book-keeping
        print('Quantizing time: {:.0f}min, Top1 after quantization: {:.2f}\n'.format(
              (time.time() - t) / 60, top_1))
        t = time.time()

        # Step 2: finetune centroids
        print('Finetuning centroids')

        # optimizer for centroids
        parameters = [p for (n, p) in student.named_parameters()
                      if layer in n and 'bias' not in n]
        assignments = quantizer.assignments
        centroids_params = {'params': parameters,
                            'assignments': assignments,
                            'kernel_size': k,
                            'n_centroids': n_centroids,
                            'n_blocks': n_blocks}

        # remember centroids parameters to finetuning at the end
        opt_centroids_params = [centroids_params]
        opt_centroids_params_all.append(centroids_params)

        # custom optimizer
        optimizer_centroids = CentroidSGD(opt_centroids_params, lr=args.lr_centroids,
                                          momentum=args.momentum_centroids,
                                          weight_decay=args.weight_decay_centroids)

        # standard training loop
        n_iter = args.finetune_centroids
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer_centroids, step_size=1, gamma=0.1)
        for epoch in range(1):
            finetune_centroids(train_loader, student, teacher, criterion,
                               optimizer_centroids, n_iter=n_iter)
            top_1 = evaluate(test_loader, student, criterion)
            scheduler.step()
            print('Epoch: {}, Top1: {:.2f}'.format(epoch, top_1))

        print('After {} iterations with learning rate {}, Top1: {:.2f}'.format(
              n_iter, args.lr_centroids, top_1))

        # book-keeping
        print('Finetuning centroids time: {:.0f}min, Top1 after finetuning centroids: {:.2f}\n'.format(
              (time.time() - t) / 60, top_1))
        t = time.time()

        # saving: recompute centroids from the finetuned weights before dumping
        M_hat = attrgetter(layer + '.weight')(student).data
        centroids = centroids_from_weights(M_hat, assignments, n_centroids, n_blocks)
        quantizer.centroids = centroids
        quantizer.save(args.save, layer)

    # End of compression + finetuning of centroids
    size_compressed = size_index + size_centroids + size_other
    print('End of compression, non-compressed teacher model: {:.2f}MB, compressed student model ' \
          '(indexing + centroids + other): {:.2f}MB + {:.2f}MB + {:.2f}MB = {:.2f}MB, compression ratio: {:.2f}x\n'.format(
          size_uncompressed, size_index, size_centroids, size_other, size_compressed,
          size_uncompressed / size_compressed))

    # Step 3: finetune whole network
    print('Step 3: Finetune whole network')
    t = time.time()

    # custom optimizer
    optimizer_centroids_all = CentroidSGD(opt_centroids_params_all, lr=args.lr_whole,
                                          momentum=args.momentum_whole,
                                          weight_decay=args.weight_decay_whole)

    # standard training loop
    n_iter = args.finetune_whole
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer_centroids_all,
                                                step_size=args.finetune_whole_step_size,
                                                gamma=0.1)
    for epoch in range(args.finetune_whole_epochs):
        student.train()
        finetune_centroids(train_loader, student, teacher, criterion,
                           optimizer_centroids_all, n_iter=n_iter)
        top_1 = evaluate(test_loader, student, criterion)
        scheduler.step()
        print('Epoch: {}, Top1: {:.2f}'.format(epoch, top_1))

    # state dict of compressed model
    state_dict_compressed = {}

    # save conv1 (not quantized)
    state_dict_compressed['conv1'] = student.conv1.state_dict()

    # save biases of the classifier
    state_dict_compressed['fc_bias'] = {'bias': student.fc.bias}

    # save batch norms
    # NOTE(review): `watcher` here is the last per-layer watcher from Step 1 —
    # assumes _get_bn_layers() still enumerates all BN layers; confirm.
    bn_layers = watcher._get_bn_layers()
    for bn_layer in bn_layers:
        state_dict_compressed[bn_layer] = attrgetter(bn_layer)(student).state_dict()

    # save quantized layers
    for layer in layers:
        # stats
        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        is_conv = len(sizes) == 4

        # get padding and stride attributes
        padding = attrgetter(layer)(student).padding if is_conv else 0
        stride = attrgetter(layer)(student).stride if is_conv else 1
        groups = attrgetter(layer)(student).groups if is_conv else 1

        # block size, distinguish between fully connected and convolutional case
        if is_conv:
            out_features, in_features, k, _ = sizes
            block_size = args.block_size_cv if k > 1 else args.block_size_pw
            # FIX: use the pointwise centroid count for 1x1 convs, matching the
            # quantization loop above (was unconditionally args.n_centroids_cv).
            n_centroids = args.n_centroids_cv if k > 1 else args.n_centroids_pw
            n_blocks = in_features * k * k // block_size
        else:
            k = 1
            out_features, in_features = sizes
            block_size = args.block_size_fc
            n_centroids = args.n_centroids_fc
            n_blocks = in_features // block_size

        # clamp number of centroids for stability (same rule as quantization)
        powers = 2 ** np.arange(0, 16, 1)
        n_vectors = np.prod(sizes) / block_size
        idx_power = bisect_left(powers, n_vectors / args.n_centroids_threshold)
        n_centroids = min(n_centroids, powers[idx_power - 1])

        # save (reload the layer's assignments, rebuild centroids from weights)
        quantizer.load(args.save, layer)
        assignments = quantizer.assignments
        M_hat = attrgetter(layer + '.weight')(student).data
        centroids = centroids_from_weights(M_hat, assignments, n_centroids, n_blocks)
        quantizer.centroids = centroids
        quantizer.save(args.save, layer)
        state_dict_layer = {
            'centroids': centroids.half(),
            # fc layers may need >256 centroids, hence short vs byte
            'assignments': assignments.short() if 'fc' in layer else assignments.byte(),
            'n_blocks': n_blocks,
            'is_conv': is_conv,
            'k': k
        }
        state_dict_compressed[layer] = state_dict_layer

    # save model
    torch.save(state_dict_compressed, os.path.join(args.save, 'state_dict_compressed.pth'))

    # book-keeping
    print('Finetuning whole network time: {:.0f}min, Top1 after finetuning centroids: {:.2f}\n'.format(
          (time.time() - t) / 60, top_1))
def main():
    """Load a previously-quantized ResNet18 from saved codebooks and validate it.

    Unlike the full pipeline, this variant never calls quantizer.encode(): it
    only reloads stored centroids/assignments per layer (via args.restart),
    reports compression statistics, runs one finetuning pass over all
    centroids, and evaluates.
    """
    # get arguments
    global args
    args = parser.parse_args()
    args.block = '' if args.block == 'all' else args.block
    PATH = "./models/trained"
    # NOTE(review): torch.load on a whole-model pickle — both student and
    # teacher start from the same checkpoint.
    student = torch.load(os.path.join(PATH, "resnet18_2.pth"))
    teacher = torch.load(os.path.join(PATH, "resnet18_2.pth"))
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    student.to(device)
    teacher.to(device)
    criterion = nn.CrossEntropyLoss().cuda()
    cudnn.benchmark = True
    # layers to quantize (we do not quantize the first 7x7 convolution layer)
    watcher = ActivationWatcher(student)
    layers = [layer for layer in watcher.layers[1:] if args.block in layer]
    # layers = [layer for layer in watcher.layers if args.block in layer]
    # data loading code
    train_loader, test_loader = load_data(batch_size=args.batch_size, nb_workers=args.n_workers)
    # parameters for the centroids optimizer
    opt_centroids_params_all = []
    # book-keeping for compression statistics (in MB)
    size_uncompressed = compute_size(student)
    size_index = 0
    size_centroids = 0
    size_other = size_uncompressed
    # baseline accuracy of the non-quantized checkpoint
    t1 = time.time()
    top_1 = evaluate(test_loader, student, criterion)
    print('Time taken validate 10,000 samples : {}s'.format(time.time() - t1))
    # scheduler.step()
    print('Top1 acc of teacher : {:.2f}'.format(top_1))
    # Step 1: iteratively quantize the network layers (quantization + layer-wise centroids distillation)
    print('Loading Quantized network')
    t = time.time()
    top_1 = 0
    for layer in layers:
        # gather input activations
        n_iter_activations = math.ceil(args.n_activations / args.batch_size)
        watcher = ActivationWatcher(student, layer=layer)
        in_activations_current = watcher.watch(train_loader, criterion, n_iter_activations)
        in_activations_current = in_activations_current[layer]
        # get weight matrix and detach it from the computation graph (.data should be enough, adding .detach() as a safeguard)
        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        is_conv = len(sizes) == 4
        # get padding and stride attributes
        padding = attrgetter(layer)(student).padding if is_conv else 0
        stride = attrgetter(layer)(student).stride if is_conv else 1
        groups = attrgetter(layer)(student).groups if is_conv else 1
        # block size, distinguish between fully connected and convolutional case
        if is_conv:
            out_features, in_features, k, _ = sizes
            block_size = args.block_size_cv if k > 1 else args.block_size_pw
            n_centroids = args.n_centroids_cv if k > 1 else args.n_centroids_pw
            n_blocks = in_features * k * k // block_size
        else:
            k = 1
            out_features, in_features = sizes
            block_size = args.block_size_fc
            n_centroids = args.n_centroids_fc
            n_blocks = in_features // block_size
        # clamp number of centroids for stability
        powers = 2 ** np.arange(0, 16, 1)
        n_vectors = np.prod(sizes) / block_size
        idx_power = bisect_left(powers, n_vectors / args.n_centroids_threshold)
        n_centroids = min(n_centroids, powers[idx_power - 1])
        # compression rations
        bits_per_weight = np.log2(n_centroids) / block_size
        # number of bits per weight
        size_index_layer = bits_per_weight * M.numel() / 8 / 1024 / 1024
        size_index += size_index_layer
        # centroids stored in float16
        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
        size_centroids += size_centroids_layer
        # size of non-compressed layers, e.g. BatchNorms or first 7x7 convolution
        size_uncompressed_layer = M.numel() * 4 / 1024 / 1024
        size_other -= size_uncompressed_layer
        # number of samples
        n_samples = dynamic_sampling(layer)
        # print layer size
        print('Quantized layer: {}, size: {}, n_blocks: {}, block size: {}, ' \
              'centroids: {}, bits/weight: {:.2f}, compressed size: {:.2f} MB'.format(
              layer, list(sizes), n_blocks, block_size, n_centroids, bits_per_weight,
              size_index_layer + size_centroids_layer))
        # quantizer
        quantizer = PQ(in_activations_current, M, n_activations=args.n_activations,
                       n_samples=n_samples, eps=args.eps, n_centroids=n_centroids,
                       n_iter=args.n_iter, n_blocks=n_blocks, k=k,
                       stride=stride, padding=padding, groups=groups)
        if len(args.restart) > 0:
            # do not quantize already quantized layers
            try:
                # load centroids and assignments if already stored
                quantizer.load(args.restart, layer)
                centroids = quantizer.centroids
                assignments = quantizer.assignments
                # quantize weight matrix
                M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
                attrgetter(layer + '.weight')(student).data = M_hat
                quantizer.save(args.save, layer)
                # optimizer for global finetuning
                parameters = [p for (n, p) in student.named_parameters() if layer in n and 'bias' not in n]
                centroids_params = {'params': parameters,
                                    'assignments': assignments,
                                    'kernel_size': k,
                                    'n_centroids': n_centroids,
                                    'n_blocks': n_blocks}
                opt_centroids_params_all.append(centroids_params)
                # proceed to next layer
                print('codebook loaded, proceeding to next layer\n')
                continue
            # otherwise, quantize layer
            except FileNotFoundError:
                # NOTE(review): this variant never encodes — a missing codebook
                # just leaves the layer unquantized.
                print('Quantize layer first')
    # End of compression + finetuning of centroids
    size_compressed = size_index + size_centroids + size_other
    print('Non-compressed teacher model: {:.2f}MB, compressed student model ' \
          '(indexing + centroids + other): {:.2f}MB + {:.2f}MB + {:.2f}MB = {:.2f}MB, compression ratio: {:.2f}x\n'.format(
          size_uncompressed, size_index, size_centroids, size_other, size_compressed,
          size_uncompressed / size_compressed))
    # Step 3: finetune whole network
    print('Validating whole network')
    t = time.time()
    # custom optimizer
    optimizer_centroids_all = CentroidSGD(opt_centroids_params_all, lr=args.lr_whole,
                                          momentum=args.momentum_whole,
                                          weight_decay=args.weight_decay_whole)
    # standard training loop (epoch loop deliberately disabled: single pass only)
    n_iter = args.finetune_whole
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer_centroids_all,
                                                step_size=args.finetune_whole_step_size, gamma=0.1)
    # for epoch in range(args.finetune_whole_epochs):
    student.train()
    finetune_centroids(train_loader, student, teacher, criterion, optimizer_centroids_all, n_iter=n_iter)
    t1 = time.time()
    top_1 = evaluate(test_loader, student, criterion)
    print('Time taken validate 10,000 samples : {}s'.format(time.time() - t1))
    scheduler.step()
    print('Top1 acc: {:.2f}'.format(top_1))
    print('Total parameters: {}'.format(sum(p.numel() for p in student.parameters() if p.requires_grad)))
def main():
    """Rebuild a quantized ResNet (with a 3-layer fc head) and evaluate it.

    Variant of the compressed-model evaluation: loads running BatchNorm
    statistics from a separate checkpoint, builds the data pipeline via
    `load_any_data` with Augmentor-based train transforms, and restores three
    classifier biases ('fc.0', 'fc.3', 'fc.6') from the 'fc_bias' entry.
    """
    global args
    args = parser.parse_args()
    device = args.device
    # NOTE(review): no map_location — assumes checkpoint device is available.
    state_dict_compressed = torch.load(args.state_dict_compressed)
    args.block = '' if args.block == 'all' else args.block
    # instantiating model
    model = 'resnet50' if args.model == 'resnet50_semisup' else args.model
    model = resnet_models.__dict__[model](pretrained=False).to(device)
    # restore running BatchNorm statistics (mean/var) saved separately
    model.load_state_dict(torch.load('./running_batchnorm.pth'))
    criterion = nn.CrossEntropyLoss()
    # random-erasing augmentation from the Augmentor package (train split only)
    transform_raner = Augmentor.Pipeline()
    transform_raner.random_erasing(probability=0.5, rectangle_area=0.15)
    transform_raner = transform_raner.torch_transform()
    _, test_loader = load_any_data(
        data_path=args.data_path,
        batch_size=args.batch_size,
        nb_workers=args.n_workers,
        transforms_dict={
            'train': transforms.Compose([
                transform_raner,
                transforms.RandomHorizontalFlip(),
                transforms.RandomVerticalFlip(),
                transforms.RandomRotation(180, resample=False, expand=False),
                transforms.Resize(224),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ]),
            'val': transforms.Compose([
                transforms.Resize(224),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
        })
    watcher = ActivationWatcherResNet(model)
    #conv1 layer (non-compressed)
    layer = 'conv1'
    state_dict_layer = to_device(state_dict_compressed[layer], device)
    attrgetter(layer)(model).load_state_dict(state_dict_layer)
    attrgetter(layer)(model).float()  # checkpoint tensors may be half precision
    # compressed layers (optionally filtered down to one block via args.block)
    compressed_layers = [
        layer for layer in watcher.layers[1:] if args.block in layer
    ]
    # 2 more layers non-compressed for semi-supervised ResNet50
    if args.model == 'resnet50_semisup':
        non_compressed_layers = ['layer1.0.conv3', 'layer1.0.downsample.0']
        for layer in non_compressed_layers:
            compressed_layers.remove(layer)
            state_dict_layer = to_device(state_dict_compressed[layer], device)
            attrgetter(layer)(model).load_state_dict(state_dict_layer)
            attrgetter(layer)(model).float()
    for layer in compressed_layers:
        # recover centroids and assignments (widen reduced-precision storage)
        state_dict_layer = state_dict_compressed[layer]
        centroids = state_dict_layer['centroids'].float().to(device)
        assignments = state_dict_layer['assignments'].long().to(device)
        n_blocks = state_dict_layer['n_blocks']
        is_conv = state_dict_layer['is_conv']
        k = state_dict_layer['k']
        # instantiate matrix: decode product-quantized weights to a dense tensor
        M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
        attrgetter(layer + '.weight')(model).data = M_hat
    # batch norms (weight/bias only; running stats came from running_batchnorm.pth)
    bn_layers = watcher._get_bn_layers()
    print(bn_layers)
    for layer in bn_layers:
        state_dict_layer = to_device(state_dict_compressed[layer], device)
        attrgetter(layer)(
            model).weight.data = state_dict_layer['weight'].float().to(device)
        attrgetter(layer)(
            model).bias.data = state_dict_layer['bias'].float().to(device)
    # classifier bias: three Linear layers in the fc Sequential head
    # NOTE(review): assumes 'fc_bias' holds keys 'bias_1', 'bias_2', 'bias_3'
    # matching fc.0 / fc.3 / fc.6 — confirm against the saving script.
    layers = ['fc.0', 'fc.3', 'fc.6']
    for i, layer in enumerate(layers):
        state_dict_layer = to_device(state_dict_compressed['fc_bias'], device)
        attrgetter(layer + '.bias')(model).data = state_dict_layer['bias_' + str(i + 1)]
    # model = model.to(device)
    # evaluate the model
    top_1 = evaluate(test_loader, model, criterion, device=device, verbose=True).item()
    print('Top-1 accuracy of quantized model: {:.2f}'.format(top_1))
def main():
    """Product-quantize every other layer of a RealNVP model on CIFAR-10.

    For each selected layer: gather input activations, quantize the weight
    matrix with `PQ` (hard-coded hyper-parameters: block size 9/4, 128/256
    centroids), finetune that layer's centroids against a frozen teacher copy,
    report bits-per-dim, and save the codebook.

    Fixes vs. original: the no-op conditional `128 if k > 1 else 128` is
    collapsed to `128`; a dead `parameters = []` store that was immediately
    overwritten is removed; an unused StepLR scheduler that was created after
    the optimizer's last use (and never stepped) is removed.
    """
    torch.cuda.empty_cache()
    student = real_nvp_model(
        pretrained=True)  # resnet.resnet18_1(pretrained=True).cuda()
    student.eval()
    cudnn.benchmark = True
    criterion = real_nvp_loss.RealNVPLoss().cuda()

    # CIFAR-10 loaders: flips on train, raw tensors on test
    transform_train = transforms.Compose(
        [transforms.RandomHorizontalFlip(), transforms.ToTensor()])
    trainset = torchvision.datasets.CIFAR10(root='data', train=True,
                                            download=True, transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=16, shuffle=True, num_workers=0)
    transform_test = transforms.Compose([transforms.ToTensor()])
    testset = torchvision.datasets.CIFAR10(root='data', train=False,
                                           download=True, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=16, shuffle=False, num_workers=0)

    # parameters for the centroids optimizer
    opt_centroids_params_all = []

    # book-keeping for compression statistics (in MB)
    size_uncompressed = compute_size(student)  # 44.591949462890625 mb
    size_index = 0
    size_centroids = 0
    size_other = size_uncompressed

    # frozen teacher copy used as distillation target
    teacher = real_nvp_model(pretrained=True)
    teacher.eval()

    # quantize every second tracked layer (skip the first)
    watcher = ActivationWatcher(student)
    layers = []
    i = 1
    for layer in watcher.layers[1:]:
        if i % 2 == 0:
            layers.append(layer)
        i = i + 1

    restart = 1
    for layer in layers[0:50]:
        print(layer)
        torch.cuda.empty_cache()

        # gather input activations (1024 activations, batch size 32)
        n_iter_activations = math.ceil(1024 / 32)
        watcher = ActivationWatcher(student, layer=layer)
        in_activations_current = watcher.watch(trainloader, criterion, n_iter_activations)
        in_activations_current = in_activations_current[layer]

        # weight matrix, detached from the computation graph
        M = attrgetter(layer + '.weight.data')(student).detach()
        sizes = M.size()
        is_conv = len(sizes) == 4
        padding = attrgetter(layer)(student).padding if is_conv else 0
        stride = attrgetter(layer)(student).stride if is_conv else 1
        groups = attrgetter(layer)(student).groups if is_conv else 1

        # block size / centroid count, conv vs fully-connected
        if is_conv:
            out_features, in_features, k, _ = sizes
            block_size = 9 if k > 1 else 4
            n_centroids = 128  # same for k>1 and pointwise (was a no-op conditional)
            n_blocks = in_features * k * k // block_size
        else:
            k = 1
            out_features, in_features = sizes
            block_size = 4
            n_centroids = 256
            n_blocks = in_features // block_size

        # clamp number of centroids for stability
        powers = 2**np.arange(0, 16, 1)
        n_vectors = np.prod(sizes) / block_size  # 4096.0
        idx_power = bisect_left(powers, n_vectors / 4)
        n_centroids = min(n_centroids, powers[idx_power - 1])  # 128

        # compression ratios: number of bits per weight
        bits_per_weight = np.log2(n_centroids) / block_size  # 0.7778
        size_index_layer = bits_per_weight * M.numel() / 8 / 1024 / 1024
        size_index += size_index_layer  # 0.00341796875

        # centroids stored in float16
        size_centroids_layer = n_centroids * block_size * 2 / 1024 / 1024
        size_centroids += size_centroids_layer

        # size of non-compressed layers, e.g. BatchNorms or first 7x7 convolution
        size_uncompressed_layer = M.numel() * 4 / 1024 / 1024
        size_other -= size_uncompressed_layer
        n_samples = 1000

        # quantizer
        quantizer = PQ(in_activations_current, M, n_activations=1024,
                       n_samples=n_samples, eps=1e-8, n_centroids=n_centroids,
                       n_iter=100, n_blocks=n_blocks, k=k,
                       stride=stride, padding=padding, groups=groups)

        if restart:
            try:
                # reload stored codebook if it exists and skip re-quantization
                quantizer.load('', layer)
                centroids = quantizer.centroids
                assignments = quantizer.assignments
                # quantize weight matrix
                M_hat = weight_from_centroids(centroids, assignments, n_blocks, k, is_conv)
                attrgetter(layer + '.weight')(student).data = M_hat
                quantizer.save('', layer)
                # optimizer for global finetuning
                # NOTE(review): .detach() yields non-leaf tensors — confirm
                # CentroidSGD updates these via .data as intended.
                parameters = [
                    attrgetter(layer + '.weight.data')(student).detach()
                ]
                centroids_params = {
                    'params': parameters,
                    'assignments': assignments,
                    'kernel_size': k,
                    'n_centroids': n_centroids,
                    'n_blocks': n_blocks
                }
                opt_centroids_params_all.append(centroids_params)
                # proceed to next layer
                print('Layer already quantized, proceeding to next layer\n')
                continue
            except FileNotFoundError:
                print('Quantizing layer')

        # quantize layer and assign the decoded weight matrix
        quantizer.encode()
        M_hat = quantizer.decode()
        attrgetter(layer + '.weight')(student).data = M_hat

        # layer-wise centroid finetuning against the teacher
        parameters = [attrgetter(layer + '.weight.data')(student).detach()]
        assignments = quantizer.assignments
        centroids_params = {
            'params': parameters,
            'assignments': assignments,
            'kernel_size': k,
            'n_centroids': n_centroids,
            'n_blocks': n_blocks
        }
        opt_centroids_params_all.append(centroids_params)
        opt_centroids_params = [centroids_params]
        optimizer_centroids = CentroidSGD(opt_centroids_params, lr=0.01,
                                          momentum=0.9, weight_decay=0.0001)
        finetune_centroids(trainloader, student.eval(), teacher, criterion,
                           optimizer_centroids, n_iter=100)
        bpd = evaluate(testloader, student, criterion)
        print('bits per dim:{:.4f} '.format(bpd))

        # saving: rebuild centroids from the finetuned weights and dump codebook
        M_hat = attrgetter(layer + '.weight')(student).data
        centroids = centroids_from_weights(M_hat, assignments, n_centroids, n_blocks)
        quantizer.centroids = centroids
        quantizer.save('', layer)
output_dir=args.output_dir, overwrite_output_dir=False, disable_tqdm=True, evaluation_strategy="epoch", per_device_train_batch_size=args.train_batch_size, per_device_eval_batch_size=args.eval_batch_size, logging_dir=args.output_dir, logging_steps=10, load_best_model_at_end=False, metric_for_best_model='overall', greater_is_better=True) args.eval_batch_size = 1 # unpickle results results, preds_list, probs_list, out_label_ids = evaluate(args, test_dataset, model, test_mode=True) # print("Results w/o ensemble: ", results) # logging.info(results) print("probs_list = ", probs_list) print("out_label_ids = ", out_label_ids) print("evaluate results = ", results) results = evaluate_all(out_label_ids, probs_list) print("evaluate_all results = ", results) print("shape(probs_list) = ", probs_list.shape) print("shape(out_label_ids) = ", out_label_ids.shape) score_paths = { 'charbert_xs_open_ua':
def main(args):
    """Train and/or evaluate a (Character)BERT model on classification or sequence labelling.

    Args:
        args: parsed CLI namespace; notable fields: embedding, task, do_train,
            do_predict, do_lower_case, validation_ratio, device, output_dir.

    Bug fix: `best_val_metric` / `best_val_epoch` were only assigned inside the
    `args.do_train` branch but written to the test report under
    `args.do_predict`, raising NameError when predicting without training.
    They are now initialized to None up front.
    """
    # --------------------------------- DATA ---------------------------------
    # Tokenizer
    logging.disable(logging.INFO)
    try:
        tokenizer = BertTokenizer.from_pretrained(
            os.path.join('pretrained-models', args.embedding),
            do_lower_case=args.do_lower_case)
    except OSError:
        # For CharacterBert models use BertTokenizer.basic_tokenizer for tokenization
        # and CharacterIndexer for indexing
        tokenizer = BertTokenizer.from_pretrained(
            os.path.join('pretrained-models', 'bert-base-uncased'),
            do_lower_case=args.do_lower_case)
        tokenizer = tokenizer.basic_tokenizer
        characters_indexer = CharacterIndexer()
    logging.disable(logging.NOTSET)
    tokenization_function = tokenizer.tokenize

    # Pre-processsing: apply basic tokenization (both) then split into wordpieces (BERT only)
    data = {}
    for split in ['train', 'test']:
        if args.task == 'classification':
            func = load_classification_dataset
        elif args.task == 'sequence_labelling':
            func = load_sequence_labelling_dataset
        else:
            raise NotImplementedError
        data[split] = func(step=split, do_lower_case=args.do_lower_case)
        retokenize(data[split], tokenization_function)

    # Carve a validation set off the front of the training data
    logging.info('Splitting training data into train / validation sets...')
    data['validation'] = data['train'][:int(args.validation_ratio * len(data['train']))]
    data['train'] = data['train'][int(args.validation_ratio * len(data['train'])):]
    logging.info('New number of training sequences: %d', len(data['train']))
    logging.info('New number of validation sequences: %d', len(data['validation']))

    # Count target labels or classes
    if args.task == 'classification':
        counter_all = Counter([
            example.label
            for example in data['train'] + data['validation'] + data['test']
        ])
        counter = Counter([example.label for example in data['train']])

        # Maximum sequence length is either 512 or maximum token sequence length + 3
        max_seq_length = min(
            512,
            3 + max(
                map(len, [
                    e.tokens_a if e.tokens_b is None else e.tokens_a + e.tokens_b
                    for e in data['train'] + data['validation'] + data['test']
                ])))
    elif args.task == 'sequence_labelling':
        counter_all = Counter([
            label
            for example in data['train'] + data['validation'] + data['test']
            for label in example.label_sequence
        ])
        counter = Counter([
            label
            for example in data['train']
            for label in example.label_sequence
        ])

        # Maximum sequence length is either 512 or maximum token sequence length + 5
        max_seq_length = min(
            512,
            5 + max(
                map(len, [
                    e.token_sequence
                    for e in data['train'] + data['validation'] + data['test']
                ])))
    else:
        raise NotImplementedError

    labels = sorted(counter_all.keys())
    num_labels = len(labels)
    logging.info("Goal: predict the following labels")
    for i, label in enumerate(labels):
        logging.info("* %s: %s (count: %s)", label, i, counter[label])

    # Input features: list[token indices] (BERT) or list[list[character indices]] (CharacterBERT)
    pad_token_id = None
    if 'character' not in args.embedding:
        pad_token_id = tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0]
    pad_token_label_id = None
    if args.task == 'sequence_labelling':
        pad_token_label_id = CrossEntropyLoss().ignore_index

    dataset = {}
    logging.info("Maximum sequence length: %s", max_seq_length)  # fixed typo 'lenght'
    for split in data:
        dataset[split] = build_features(
            args,
            split=split,
            tokenizer=tokenizer \
                if 'character' not in args.embedding \
                else characters_indexer,
            examples=data[split],
            labels=labels,
            pad_token_id=pad_token_id,
            pad_token_label_id=pad_token_label_id,
            max_seq_length=max_seq_length)
    del data  # Not used anymore

    # --------------------------------- MODEL ---------------------------------
    # Initialize model
    if args.task == 'classification':
        model = BertForSequenceClassification
    elif args.task == 'sequence_labelling':
        model = BertForTokenClassification
    else:
        raise NotImplementedError

    logging.info('Loading `%s` model...', args.embedding)
    logging.disable(logging.INFO)
    config = BertConfig.from_pretrained(
        os.path.join('pretrained-models', args.embedding), num_labels=num_labels)
    if 'character' not in args.embedding:
        model = model.from_pretrained(
            os.path.join('pretrained-models', args.embedding), config=config)
    else:
        # CharacterBERT: build the head from config, then swap in the pretrained encoder
        model = model(config=config)
        model.bert = CharacterBertModel.from_pretrained(
            os.path.join('pretrained-models', args.embedding), config=config)
    logging.disable(logging.NOTSET)

    model.to(args.device)
    logging.info('Model:\n%s', model)

    # ------------------------------ TRAIN / EVAL ------------------------------
    # Log args
    logging.info('Using the following arguments for training:')
    for k, v in vars(args).items():
        logging.info("* %s: %s", k, v)

    # FIX: defined unconditionally so the do_predict report below never hits a
    # NameError when running with do_predict but without do_train.
    best_val_metric = None
    best_val_epoch = None

    # Training
    if args.do_train:
        global_step, train_loss, best_val_metric, best_val_epoch = train(
            args=args,
            dataset=dataset,
            model=model,
            tokenizer=tokenizer,
            labels=labels,
            pad_token_label_id=pad_token_label_id)
        logging.info("global_step = %s, average training loss = %s",
                     global_step, train_loss)
        logging.info("Best performance: Epoch=%d, Value=%s",
                     best_val_epoch, best_val_metric)

    # Evaluation on test data
    if args.do_predict:
        # Load best model
        if args.task == 'classification':
            model = BertForSequenceClassification
        elif args.task == 'sequence_labelling':
            model = BertForTokenClassification
        else:
            raise NotImplementedError

        logging.disable(logging.INFO)
        if 'character' not in args.embedding:
            model = model.from_pretrained(args.output_dir)
        else:
            state_dict = torch.load(
                os.path.join(args.output_dir, 'pytorch_model.bin'),
                map_location='cpu')
            model = model(config=config)
            model.bert = CharacterBertModel(config=config)
            model.load_state_dict(state_dict, strict=True)
        logging.disable(logging.NOTSET)
        model.to(args.device)

        # Compute predictions and metrics
        results, _ = evaluate(
            args=args,
            eval_dataset=dataset["test"],
            model=model,
            labels=labels,
            pad_token_label_id=pad_token_label_id)

        # Save metrics
        with open(os.path.join(args.output_dir, 'performance_on_test_set.txt'), 'w') as f:
            f.write(f'best validation score: {best_val_metric}\n')
            f.write(f'best validation epoch: {best_val_epoch}\n')
            f.write('--- Performance on test set ---\n')
            for k, v in results.items():
                f.write(f'{k}: {v}\n')