def test_channel_prune(self):
    """Verify that a channel-pruned ResNet-18 gives (nearly) the same
    output after ModelSpeedup as the masked reference model.

    NOTE(review): assumes channel_prune() writes MODEL_FILE / MASK_FILE
    as a side effect — confirm against its definition.
    """
    # Run the pruning pass once so the checkpoint and mask files exist.
    channel_prune(resnet18(num_classes=10).to(device))
    checkpoint = torch.load(MODEL_FILE)

    # Reference model: pruned weights with masks applied in place.
    reference = resnet18(num_classes=10).to(device)
    reference.load_state_dict(checkpoint)
    apply_compression_results(reference, MASK_FILE)
    reference.eval()

    # Compact model: same weights, physically shrunk by ModelSpeedup.
    compact = resnet18(num_classes=10).to(device)
    compact.load_state_dict(checkpoint)
    compact.eval()

    batch = torch.randn(BATCH_SIZE, 3, 128, 128).to(device)
    speedup = ModelSpeedup(compact, batch, MASK_FILE, confidence=8)
    speedup.speedup_model()
    speedup.bound_model(batch)
    compact.eval()

    ref_sum = reference(batch).abs().sum().item()
    fast_sum = compact(batch).abs().sum().item()
    print(ref_sum, fast_sum)
    # Accept either a small relative or a small absolute deviation.
    assert (abs(ref_sum - fast_sum) / abs(ref_sum) < RELATIVE_THRESHOLD) or \
        (abs(ref_sum - fast_sum) < ABSOLUTE_THRESHOLD)
def test_speedup_bigmodel(self):
    """Check ModelSpeedup on BigModel: masked and speeded-up outputs must
    match, and the compacted layers must have SPARSITY-scaled shapes."""
    prune_model_l1(BigModel())

    model = BigModel()
    apply_compression_results(model, MASK_FILE, 'cpu')
    model.eval()
    mask_out = model(dummy_input)

    # Speedup must preserve the training flag across the transformation.
    model.train()
    speedup = ModelSpeedup(model, dummy_input, MASK_FILE, confidence=8)
    speedup.speedup_model()
    assert model.training

    model.eval()
    speedup_out = model(dummy_input)
    if not torch.allclose(mask_out, speedup_out, atol=1e-07):
        print('input:', dummy_input.size(), torch.abs(dummy_input).sum((2, 3)))
        print('mask_out:', mask_out)
        print('speedup_out:', speedup_out)
        raise RuntimeError('model speedup inference result is incorrect!')

    # Each compacted layer should be SPARSITY times the original size.
    reference = BigModel()
    assert model.backbone2.conv1.out_channels == int(
        reference.backbone2.conv1.out_channels * SPARSITY)
    assert model.backbone2.conv2.in_channels == int(
        reference.backbone2.conv2.in_channels * SPARSITY)
    assert model.backbone2.conv2.out_channels == int(
        reference.backbone2.conv2.out_channels * SPARSITY)
    assert model.backbone2.fc1.in_features == int(
        reference.backbone2.fc1.in_features * SPARSITY)
def model_inference(config):
    """Run inference with masks applied and/or after ModelSpeedup, and
    optionally compare the two outputs.

    Args:
        config (dict): must contain 'masks_file', 'model_name'
            (one of 'vgg16', 'vgg19', 'lenet') and 'input_shape'.

    Raises:
        ValueError: if config['model_name'] is not a supported name.
        RuntimeError: if the masked and speeded-up outputs differ.

    NOTE: reads module-level flags use_mask / use_speedup / compare_results.
    """
    masks_file = config['masks_file']
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    # device = torch.device(config['device'])
    if config['model_name'] == 'vgg16':
        model = VGG(depth=16)
    elif config['model_name'] == 'vgg19':
        model = VGG(depth=19)
    elif config['model_name'] == 'lenet':
        model = LeNet()
    else:
        # BUG FIX: an unrecognized name previously left `model` unbound and
        # the function crashed later with a confusing UnboundLocalError.
        raise ValueError(
            'unsupported model_name: {}'.format(config['model_name']))
    model.to(device)
    model.eval()
    dummy_input = torch.randn(config['input_shape']).to(device)
    use_mask_out = use_speedup_out = None
    # must run use_mask before use_speedup because use_speedup modifies the model
    if use_mask:
        apply_compression_results(model, masks_file, device)
        start = time.time()
        for _ in range(32):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)
    if use_speedup:
        m_speedup = ModelSpeedup(model, dummy_input, masks_file, device)
        m_speedup.speedup_model()
        start = time.time()
        for _ in range(32):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)
    if compare_results:
        if torch.allclose(use_mask_out, use_speedup_out, atol=1e-07):
            print('the outputs from use_mask and use_speedup are the same')
        else:
            raise RuntimeError(
                'the outputs from use_mask and use_speedup are different')
def main(args):
    """Prune a model, fine-tune it, export the best checkpoint, and
    (optionally) apply ModelSpeedup while timing masked vs. compacted
    inference."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # Prepare data, model, optimizer and LR scheduler.
    train_loader, test_loader, criterion = get_data(
        args.dataset, args.data_dir, args.batch_size, args.test_batch_size)
    model, optimizer, scheduler = get_model_optimizer_scheduler(
        args, device, train_loader, test_loader, criterion)
    dummy_input = get_dummy_input(args, device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")

    print('start pruning...')
    # model_path stores the state_dict of the pruned model,
    # mask_path stores its mask_dict.
    model_path = os.path.join(
        args.experiment_data_dir,
        'pruned_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    mask_path = os.path.join(
        args.experiment_data_dir,
        'mask_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))

    pruner = get_pruner(
        model, args.pruner, device, optimizer, args.dependency_aware)
    model = pruner.compress()

    if args.multi_gpu and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    if args.test_only:
        # NOTE(review): execution continues into fine-tuning afterwards —
        # confirm whether an early return is intended here.
        test(args, model, device, criterion, test_loader)

    best_top1 = 0
    for epoch in range(args.fine_tune_epochs):
        pruner.update_epoch(epoch)
        print('# Epoch {} #'.format(epoch))
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        top1 = test(args, model, device, criterion, test_loader)
        if top1 > best_top1:
            best_top1 = top1
            # Export the best model seen so far.
            pruner.export_model(model_path=model_path, mask_path=mask_path)

    if args.nni:
        nni.report_final_result(best_top1)

    if args.speed_up:
        # reload the best checkpoint for speed-up
        args.pretrained_model_dir = model_path
        model, _, _ = get_model_optimizer_scheduler(
            args, device, train_loader, test_loader, criterion)
        model.eval()

        apply_compression_results(model, mask_path, device)
        # Time 32 forward passes with masks only.
        start = time.time()
        for _ in range(32):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)

        m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
        m_speedup.speedup_model()
        flops, params, results = count_flops_params(model, dummy_input)
        print(f"FLOPs: {flops}, params: {params}")
        # Time 32 forward passes on the physically compacted model.
        start = time.time()
        for _ in range(32):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)
        top1 = test(args, model, device, criterion, test_loader)
batch_size=batch_size, drop_last=True) valid_un_dataloader = DataLoader(total_dataset.valid_un_dataset, shuffle=False, batch_size=batch_size, drop_last=True) #train(train_un_dataloader,valid_un_dataloader, model, criterion=SMR_loss,save_path = best_un_model_path, lr = 1e-3) #print('unsupervised learning complete!') model_path = best_un_model_path mask_path = './model/DUU_models_%d_%d_%d_%d_%d_un_mask_model.pth' % ( Nt, Nr, K, dk, SNR_dB) config_list = [{'sparsity': 0.8, 'op_types': ['Conv2d']}] pruner = L2FilterPruner(model, config_list) pruner.compress() model.eval() pruner.export_model(model_path=model_path, mask_path=mask_path) apply_compression_results(model, mask_path, device) '''test''' test_dataloader = DataLoader(total_dataset.test_un_dataset, shuffle=False, batch_size=batch_size, drop_last=True) model = torch.load(best_su_model_path) su_performance = test(test_dataloader, model, criterion=SMR_loss) print('supervised learning performance:' + str(su_performance)) model = torch.load(best_un_model_path) un_performance = test(test_dataloader, model, criterion=SMR_loss) print('unsupervised learning performance:' + str(un_performance)) #python train.py --Nt 64 --Nr 4 --K 10 --dk 2 --SNR 0 --SNR_channel 100 --gpu 0 --mode gpu --batch_size 200 --epoch 1000 --factor 1 --test_length 2000
return test_acc.item() dummy_input = torch.ones([64, 3, 32, 32]).cuda() model = torchvision.models.vgg19_bn(num_classes=10) model.avgpool = nn.AdaptiveAvgPool2d((1, 1)) model.classifier = nn.Linear(512, 10) model.load_state_dict(torch.load('pruned_vgg19_cifar10.pth')) model.cuda() model.eval() model(dummy_input) #first time infer will cost much time # mask use_mask_out = use_speedup_out = None apply_compression_results(model, 'mask_vgg19_cifar10.pth') start = time.time() for _ in range(320): use_mask_out = model(dummy_input) print('elapsed time when use mask: ', time.time() - start) print(test(model)) # speedup m_speedup = ModelSpeedup(model, dummy_input, 'mask_vgg19_cifar10.pth') m_speedup.speedup_model() start = time.time() for _ in range(320): use_speedup_out = model(dummy_input) print('elapsed time when use speedup: ', time.time() - start) print(test(model))