def test_speedup_bigmodel(self):
    """End-to-end speedup check for BigModel.

    Prunes a throwaway copy to produce MASK_FILE, then verifies that the
    masked model and the physically-compressed (speedup) model produce the
    same output, that training mode survives the speedup pass, and that the
    compressed layers shrank by exactly SPARSITY.
    """
    # Generate the mask file as a side effect; the pruned copy is discarded.
    prune_model_l1(BigModel())

    model = BigModel()
    apply_compression_results(model, MASK_FILE, 'cpu')
    model.eval()
    masked_output = model(dummy_input)

    # Run the speedup pass in training mode to confirm the mode is preserved.
    model.train()
    ModelSpeedup(model, dummy_input, MASK_FILE).speedup_model()
    assert model.training

    model.eval()
    accelerated_output = model(dummy_input)
    outputs_match = torch.allclose(masked_output, accelerated_output, atol=1e-07)
    if not outputs_match:
        print('input:', dummy_input.size(), torch.abs(dummy_input).sum((2, 3)))
        print('mask_out:', masked_output)
        print('speedup_out:', accelerated_output)
        raise RuntimeError('model speedup inference result is incorrect!')

    # Shape checks: compressed dims equal the original dims scaled by SPARSITY.
    reference = BigModel()
    assert model.backbone2.conv1.out_channels == int(
        reference.backbone2.conv1.out_channels * SPARSITY)
    assert model.backbone2.conv2.in_channels == int(
        reference.backbone2.conv2.in_channels * SPARSITY)
    assert model.backbone2.conv2.out_channels == int(
        reference.backbone2.conv2.out_channels * SPARSITY)
    assert model.backbone2.fc1.in_features == int(
        reference.backbone2.fc1.in_features * SPARSITY)
def slim_speedup(masks_file, model_checkpoint):
    """Benchmark VGG-19 inference after applying pruning masks or after a
    real speedup pass, depending on the module-level ``use_mask`` flag.

    Args:
        masks_file: path to the pruning masks produced by the pruner.
        model_checkpoint: unused; kept for interface compatibility with the
            other ``*_speedup`` entry points. TODO confirm whether loading it
            was intended.

    Prints the elapsed wall-clock time for 32 forward passes on CUDA.
    """
    device = torch.device('cuda')
    model = VGG(depth=19)
    model.to(device)
    model.eval()
    dummy_input = torch.randn(64, 3, 32, 32).to(device)

    # The two branches previously duplicated the timing loop; only the
    # preparation step and the printed label differ.
    if use_mask:
        apply_compression_results(model, masks_file)
        label = 'mask'
    else:
        m_speedup = ModelSpeedup(model, dummy_input, masks_file)
        m_speedup.speedup_model()
        label = 'speedup'

    start = time.time()
    for _ in range(32):
        model(dummy_input)
    print(label + ' elapsed time: ', time.time() - start)
def fpgm_speedup(masks_file, model_checkpoint):
    """Time 40 CPU forward passes of the FPGM-pruned MNIST model, either
    with masks applied in place (``use_mask`` truthy) or after a real
    speedup pass.  ``model_checkpoint`` is accepted but not read.
    """
    from fpgm_torch_mnist import Mnist

    device = torch.device('cpu')
    net = Mnist()
    net.to(device)
    net.print_conv_filter_sparsity()
    batch = torch.randn(64, 1, 28, 28).to(device)

    if use_mask:
        # Masked inference: weights are zeroed but shapes are unchanged.
        apply_compression_results(net, masks_file)
        tic = time.time()
        for _ in range(40):
            net(batch)
        print('mask elapsed time: ', time.time() - tic)
    else:
        # Speedup inference: layers are physically shrunk first.
        accelerator = ModelSpeedup(net, batch, masks_file)
        accelerator.speedup_model()
        tic = time.time()
        for _ in range(40):
            net(batch)
        print('speedup elapsed time: ', time.time() - tic)
def test_speedup_vgg16(self):
    """Speedup shape test for vgg16: after pruning with L1 masks and running
    the speedup pass, the affected conv/linear dims shrink by SPARSITY and
    training mode is preserved."""
    # Produce MASK_FILE from a throwaway pruned copy.
    prune_model_l1(vgg16())

    pruned = vgg16()
    pruned.train()
    ModelSpeedup(pruned, torch.randn(2, 3, 32, 32), MASK_FILE).speedup_model()
    assert pruned.training

    baseline = vgg16()
    expected_out = int(baseline.features[2].out_channels * SPARSITY)
    expected_in = int(baseline.classifier[0].in_features * SPARSITY)
    assert pruned.features[2].out_channels == expected_out
    assert pruned.classifier[0].in_features == expected_in
def test_speedup_bigmodel(self):
    """Speedup shape test for BigModel: prune, speed up, then verify the
    compressed layer dimensions and that training mode survives."""
    # Prune a throwaway copy to generate the mask file on disk.
    prune_model_l1(BigModel())

    pruned = BigModel()
    pruned.train()
    # NOTE(review): sibling tests use MASK_FILE; this one hard-codes the
    # path — confirm they refer to the same file before unifying.
    ModelSpeedup(pruned, torch.randn(2, 1, 28, 28), './l1_mask.pth').speedup_model()
    assert pruned.training

    baseline = BigModel()
    for attr, field in (('conv1', 'out_channels'),
                        ('conv2', 'in_channels'),
                        ('conv2', 'out_channels'),
                        ('fc1', 'in_features')):
        got = getattr(getattr(pruned.backbone2, attr), field)
        want = int(getattr(getattr(baseline.backbone2, attr), field) * SPARSITY)
        assert got == want
def model_inference(config):
    """Build the model named in ``config``, then time masked inference and/or
    speedup inference (controlled by module-level ``use_mask`` /
    ``use_speedup``) and optionally compare the two outputs.

    Args:
        config: dict with keys 'masks_file', 'device', 'model_name'
            ('vgg16' | 'vgg19' | 'naive') and 'input_shape'.

    Raises:
        ValueError: for an unrecognized 'model_name' (previously this fell
            through and crashed with a NameError on ``model``).
        RuntimeError: when ``compare_results`` is set but either run was
            skipped, or when the two outputs disagree.
    """
    masks_file = config['masks_file']
    device = torch.device(config['device'])
    model_name = config['model_name']
    if model_name == 'vgg16':
        model = VGG(depth=16)
    elif model_name == 'vgg19':
        model = VGG(depth=19)
    elif model_name == 'naive':
        from model_prune_torch import NaiveModel
        model = NaiveModel()
    else:
        raise ValueError('unsupported model_name: %r' % model_name)
    model.to(device)
    model.eval()

    dummy_input = torch.randn(config['input_shape']).to(device)
    use_mask_out = use_speedup_out = None
    # apply_compression_results / ModelSpeedup take 'cpu' only for CPU runs.
    map_device = 'cpu' if config['device'] == 'cpu' else None

    # must run use_mask before use_speedup because use_speedup modifies the model
    if use_mask:
        apply_compression_results(model, masks_file, map_device)
        start = time.time()
        for _ in range(32):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)
    if use_speedup:
        m_speedup = ModelSpeedup(model, dummy_input, masks_file, map_device)
        m_speedup.speedup_model()
        start = time.time()
        for _ in range(32):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)
    if compare_results:
        # Guard against comparing a skipped run (was a confusing TypeError
        # inside torch.allclose when an output stayed None).
        if use_mask_out is None or use_speedup_out is None:
            raise RuntimeError(
                'compare_results requires both use_mask and use_speedup runs')
        if torch.allclose(use_mask_out, use_speedup_out, atol=1e-07):
            print('the outputs from use_mask and use_speedup are the same')
        else:
            raise RuntimeError(
                'the outputs from use_mask and use_speedup are different')
# Evaluate the pretrained model, prune it with an activation-rank pruner,
# export the best checkpoint per-epoch, then apply speedup and save the
# physically compressed model.
test(model, device, test_data_loader)
torch.save(model.state_dict(), 'pretrained_model.pth')
print("start model pruning...")

optimizer = torch.optim.SGD(model.parameters(), lr=0.001,
                            momentum=0.9, weight_decay=1e-4)
best_top1 = 0
# pruner = SlimPruner(model, config_list, optimizer)
pruner = ActivationMeanRankFilterPruner(model, config_list, optimizer)
model = pruner.compress()
for epoch in range(prune_epochs):
    pruner.update_epoch(epoch)
    print("# Epoch {} #".format(epoch))
    train(model, device, train_data_loader, optimizer)
    top1 = test(model, device, test_data_loader)
    if top1 > best_top1:
        # FIX: best_top1 was never updated, so every epoch with top1 > 0
        # overwrote the exported "best" checkpoint.
        best_top1 = top1
        pruner.export_model(model_path='pruned_model.pth',
                            mask_path='pruned_mask.pth')

from nni.compression.torch import apply_compression_results
from nni.compression.speedup.torch import ModelSpeedup

# Re-instantiate a clean model and physically compress it using the
# exported masks, then save the speedup result.
model = MobileModel().cuda()
model.eval()
apply_compression_results(model, 'pruned_mask.pth', None)
m_speedup = ModelSpeedup(model, torch.randn(1, 3, 224, 224).cuda(),
                         'pruned_mask.pth', None)
m_speedup.speedup_model()
torch.save(model.state_dict(), 'pruned_speedup_model.pth')
# Benchmark script: times 32 forward passes of MobileNetV2 in three states —
# original, mask-applied, and after physical speedup — on CUDA.
dummy_input = torch.randn((64, 3, 224, 224)).cuda()
model = MobileNetV2(n_class=config.num_classes, width_mult=1.0)
model.cuda()

# Baseline: dense, unpruned model.
start = time.time()
for i in range(32):
    output = model(dummy_input)
end = time.time()
print("Time for original model:", end - start)

# Masked model: pruned weights are zeroed, but layer shapes are unchanged,
# so this is not expected to be faster than the baseline.
model.load_state_dict(torch.load('results/pruned/pruned_model.pth'))
mask_file = './results/pruned/pruned_mask.pth'
apply_compression_results(model, mask_file, 'cuda')
start = time.time()
for i in range(32):
    mask_output = model(dummy_input)
end = time.time()
print("Time for masked model:", end - start)

# Speedup model: layers are physically shrunk according to the masks; this
# is where the real inference-time win should appear.
m_speedup = ModelSpeedup(model, dummy_input, mask_file, 'cuda')
m_speedup.speedup_model()
start = time.time()
for i in range(32):
    speedup_output = model(dummy_input)
end = time.time()
print("Time for speedup model:", end - start)