def export_img_classifier_to_onnx(model, onnx_fname, dataset, add_softmax=True, **kwargs):
    """Export a PyTorch image classifier to ONNX.

    Args:
        model: PyTorch model instance to export
        onnx_fname (string): output ONNX file name
        dataset (string): dataset name, used to create a dummy input of the correct shape
        add_softmax: when True, adds a softmax layer to the output model
        kwargs: arguments to be passed to torch.onnx.export
    """
    dummy_input = distiller.get_dummy_input(dataset, distiller.model_device(model))
    # PyTorch doesn't support exporting modules wrapped in DataParallel
    non_para_model = distiller.make_non_parallel_copy(model)

    try:
        if add_softmax:
            # Explicitly add a softmax layer, because it is needed for the ONNX inference phase.
            # TorchVision models use nn.CrossEntropyLoss for computing the loss,
            # instead of adding a softmax layer
            non_para_model.original_forward = non_para_model.forward
            softmax = torch.nn.Softmax(dim=-1)
            non_para_model.forward = lambda input: softmax(non_para_model.original_forward(input))
        torch.onnx.export(non_para_model, dummy_input, onnx_fname, **kwargs)
        msglogger.info("Exported the model to ONNX format at %s" % os.path.realpath(onnx_fname))
    finally:
        del non_para_model
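# Usage sketch (illustrative, not part of the original source): how the export helper
# above might be invoked for a CIFAR-10 classifier. It assumes distiller's create_model()
# is importable from distiller.models and that the 'resnet20_cifar' architecture is
# registered, as in the other examples in this file; the output file name is hypothetical.
def _example_export_to_onnx():
    from distiller.models import create_model  # assumed import path
    model = create_model(False, 'cifar10', 'resnet20_cifar', parallel=False)
    # Any extra kwargs (e.g. export_params) are forwarded to torch.onnx.export
    export_img_classifier_to_onnx(model, 'resnet20_cifar.onnx', 'cifar10', add_softmax=True)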
def _convert_ptq_to_pytorch(model, args):
    msglogger.info('Converting Distiller PTQ model to PyTorch quantization API')
    dummy_input = distiller.get_dummy_input(input_shape=model.input_shape)
    model = quantization.convert_distiller_ptq_model_to_pytorch(model, dummy_input,
                                                                backend=args.qe_pytorch_backend)
    msglogger.debug('\nModel after conversion:\n{}'.format(model))
    args.device = 'cpu'
    return model
def draw_img_classifier_to_file(model, png_fname, dataset=None, display_param_nodes=False,
                                rankdir='TB', styles=None, input_shape=None):
    """Draw a PyTorch image classifier to a PNG file.

    This is a helper function that simplifies the interface of draw_model_to_file().

    Args:
        model: PyTorch model instance
        png_fname (string): PNG file name
        dataset (string): one of 'imagenet' or 'cifar10'.  This is required in order
            to create a dummy input of the correct shape.
        display_param_nodes (boolean): if True, draw the parameter nodes
        rankdir: diagram direction.  'TB'/'BT' is Top-to-Bottom/Bottom-to-Top,
            'LR'/'RL' is Left-to-Right/Right-to-Left
        styles: a dictionary of styles.  Key is module name.  Value is a legal pydot
            style dictionary.  For example:
            styles['conv1'] = {'shape': 'oval',
                               'fillcolor': 'gray',
                               'style': 'rounded, filled'}
        input_shape (tuple): list of integers representing the input shape.
            Used only if 'dataset' is None
    """
    dummy_input = distiller.get_dummy_input(dataset=dataset,
                                            device=distiller.model_device(model),
                                            input_shape=input_shape)
    non_para_model = distiller.make_non_parallel_copy(model)
    try:
        g = SummaryGraph(non_para_model, dummy_input)
        draw_model_to_file(g, png_fname, display_param_nodes, rankdir, styles)
        print("Network PNG image generation completed")
    except FileNotFoundError:
        print("An error has occurred while generating the network PNG image.")
        print("Please check that you have graphviz installed.")
        print("\t$ sudo apt-get install graphviz")
    finally:
        del non_para_model
def model_summary(model, what, dataset=None):
    if what.startswith('png'):
        draw_img_classifier_to_file(model, 'model.png', dataset, what == 'png_w_params')
    elif what == 'sparsity':
        pylogger = PythonLogger(msglogger)
        csvlogger = CsvLogger()
        distiller.log_weights_sparsity(model, -1, loggers=[pylogger, csvlogger])
    elif what == 'compute':
        try:
            dummy_input = distiller.get_dummy_input(dataset, distiller.model_device(model))
        except ValueError as e:
            print(e)
            return
        df = model_performance_summary(model, dummy_input, 1)
        t = tabulate(df, headers='keys', tablefmt='psql', floatfmt=".5f")
        total_macs = df['MACs'].sum()
        print(t)
        print("Total MACs: " + "{:,}".format(total_macs))
    elif what == 'model':
        # Print the simple form of the model
        print(model)
    elif what == 'modules':
        # Print the names of the leaf modules.
        # Remember that in PyTorch not every node is a module (e.g. F.relu), and that
        # parameterless modules, like nn.MaxPool2d, can be used multiple times in the
        # same model, but will only appear once in the modules list.
        nodes = []
        for name, module in model.named_modules():
            # Only print leaf modules
            if len(module._modules) == 0:
                nodes.append([name, module.__class__.__name__])
        print(tabulate(nodes, headers=['Name', 'Type']))
    else:
        raise ValueError("%s is not a supported summary type" % what)
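# Usage sketch (illustrative, not part of the original source): printing the per-layer
# MACs table and the leaf-module listing for a CIFAR-10 model via model_summary() above.
# create_model() and the 'resnet20_cifar' architecture are assumed to exist, as in the
# other examples in this file.
def _example_model_summary():
    from distiller.models import create_model  # assumed import path
    model = create_model(False, 'cifar10', 'resnet20_cifar', parallel=False)
    model_summary(model, 'compute', dataset='cifar10')   # tabulated per-layer MACs + total
    model_summary(model, 'modules')                       # names and types of leaf modules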
def test_scope_name_workarounds():
    class ModelWithGemms(nn.Module):
        def __init__(self):
            super(ModelWithGemms, self).__init__()
            self.drop1 = nn.Dropout()
            self.fc1 = nn.Linear(100, 50)
            self.relu1 = nn.ReLU(inplace=True)
            self.drop2 = nn.Dropout()
            self.fc2 = nn.Linear(50, 25)
            self.relu2 = nn.ReLU(inplace=True)
            self.fc3 = nn.Linear(25, 1)
            self.drop3 = nn.Dropout()

        def forward(self, x):
            x = self.drop1(x)
            x = self.fc1(x)
            x = self.relu1(x)
            x = self.drop2(x)
            x = self.fc2(x)
            x = self.relu2(x)
            x = self.fc3(x)
            x = self.drop3(x)
            return x

    m = ModelWithGemms()
    dummy_input = distiller.get_dummy_input(input_shape=(1, 100))
    expected_types = ('Gemm', 'Relu', 'Gemm', 'Relu', 'Gemm')

    # We have workarounds for 2 issues:
    #   1. GEMM ops get the scope name of the op that came before them
    #   2. Ops that come before a dropout op get the scope name of the dropout op
    # If both conditions apply, empirically issue #2 is the one that manifests.
    #
    # For the model above we expect the ops in the graph to be named (in order):
    #   'fc1', 'relu1', 'fc2', 'relu2', 'fc3'
    # (note that dropout ops are dropped)
    #
    # But without our workarounds in place, we'll get:
    #   'drop1', 'drop2', 'drop2__1', 'relu2', 'drop3'
    #
    # What happens is:
    #   * 'fc1' - issue #1 applies, so 'fc1' --> 'drop1'
    #   * 'relu1' - issue #2 applies, so 'relu1' --> 'drop2'
    #   * 'fc2' - issue #1 applies, so 'fc2' --> 'drop2__1' ('__1' suffix because 'drop2' already exists)
    #   * 'relu2' should be ok as-is
    #   * 'fc3' is susceptible to both issues - it's a GEMM op AND it comes before a dropout.
    #     As mentioned above, issue #2 "wins", so 'fc3' --> 'drop3'

    # We test without the workarounds as a means to see if the issues still exist. New PyTorch versions
    # may fix them, in which case we can remove the workarounds
    sg = SummaryGraph(m, dummy_input, apply_scope_name_workarounds=False)
    names, types = zip(*[(op_name, op['type']) for op_name, op in sg.ops.items()])
    assert names == ('drop1', 'drop2', 'drop2__1', 'relu2', 'drop3')
    assert types == expected_types

    # Now test with the workarounds
    sg = SummaryGraph(m, dummy_input)
    names, types = zip(*[(op_name, op['type']) for op_name, op in sg.ops.items()])
    assert names == ('fc1', 'relu1', 'fc2', 'relu2', 'fc3')
    assert types == expected_types
def test_gemm_nodes_scope_names():
    class ModelWithGemms(nn.Module):
        def __init__(self):
            super(ModelWithGemms, self).__init__()
            self.drop1 = nn.Dropout()
            self.fc1 = nn.Linear(100, 50)
            self.relu1 = nn.ReLU(inplace=True)
            self.drop2 = nn.Dropout()
            self.fc2 = nn.Linear(50, 25)
            self.relu2 = nn.ReLU(inplace=True)
            self.fc3 = nn.Linear(25, 1)

        def forward(self, x):
            # Isn't this pretty...
            return self.fc3(self.relu2(self.fc2(self.drop2(self.relu1(self.fc1(self.drop1(x)))))))

    m = ModelWithGemms()
    sg = SummaryGraph(m, distiller.get_dummy_input(input_shape=(1, 100)))

    # For the model above we expect the ops to be named (in order):
    #   'drop1', 'fc1', 'relu1', 'drop2', 'fc2', 'relu2', 'fc3'
    # But without our workaround in place, they'll be named:
    #   'drop1', 'drop1__1', 'relu1', 'drop2', 'drop2__1', 'relu2', 'relu2__1'
    # (that is - each FC node gets the name of the node before it)
    names, types = zip(*[(op_name, op['type']) for op_name, op in sg.ops.items()])
    assert names == ('drop1', 'fc1', 'relu1', 'drop2', 'fc2', 'relu2', 'fc3')
    assert types == ('Dropout', 'Gemm', 'Relu', 'Dropout', 'Gemm', 'Relu', 'Gemm')
def test_named_params_layers(dataset, arch, parallel):
    model = create_model(False, dataset, arch, parallel=parallel)
    sgraph = SummaryGraph(model, distiller.get_dummy_input(dataset))
    sgraph_layer_names = set(k for k, i, j in sgraph.named_params_layers())
    for layer_name in sgraph_layer_names:
        assert sgraph.find_op(layer_name) is not None, \
            '{} was not found in summary graph'.format(layer_name)
def test_weights_size_attr(dataset, arch, parallel):
    model = create_model(False, dataset, arch, parallel=parallel)
    sgraph = SummaryGraph(model, distiller.get_dummy_input(dataset))

    distiller.assign_layer_fq_names(model)
    for name, mod in model.named_modules():
        if isinstance(mod, nn.Conv2d) or isinstance(mod, nn.Linear):
            op = sgraph.find_op(name)
            assert op is not None
            assert op['attrs']['weights_vol'] == distiller.volume(mod.weight)
def test_scope_name_workarounds():
    class ModelWithGemms(nn.Module):
        def __init__(self):
            super(ModelWithGemms, self).__init__()
            self.drop1 = nn.Dropout()
            self.fc1 = nn.Linear(100, 50)
            self.relu1 = nn.ReLU(inplace=True)
            self.drop2 = nn.Dropout()
            self.fc2 = nn.Linear(50, 25)
            self.relu2 = nn.ReLU(inplace=True)
            self.fc3 = nn.Linear(25, 1)
            self.drop3 = nn.Dropout()

        def forward(self, x):
            x = self.drop1(x)
            x = self.fc1(x)
            x = self.relu1(x)
            x = self.drop2(x)
            x = self.fc2(x)
            x = self.relu2(x)
            x = self.fc3(x)
            x = self.drop3(x)
            return x

    m = ModelWithGemms()
    dummy_input = distiller.get_dummy_input(input_shape=(1, 100))
    expected_types = ('Gemm', 'Relu', 'Gemm', 'Relu', 'Gemm')

    # We have a workaround for the following issue:
    # (used to be 2 issues but one got fixed in PyTorch 1.2)
    #   * Ops that come before a dropout op get the scope name of the dropout op
    #
    # For the model above we expect the ops in the graph to be named (in order):
    #   'fc1', 'relu1', 'fc2', 'relu2', 'fc3'
    # (note that dropout ops are dropped)
    #
    # But since 'relu1' and 'fc3' come before a dropout op, without the workaround in place we'll get:
    #   'fc1', 'drop2', 'fc2', 'relu2', 'drop3'

    # We test without the workarounds as a means to see if the issues still exist. New PyTorch versions
    # may fix them, in which case we can remove the workarounds
    sg = SummaryGraph(m, dummy_input, apply_scope_name_workarounds=False)
    names, types = zip(*[(op_name, op['type']) for op_name, op in sg.ops.items()])
    assert names == ('fc1', 'drop2', 'fc2', 'relu2', 'drop3')
    assert types == expected_types

    # Now test with the workarounds
    sg = SummaryGraph(m, dummy_input)
    names, types = zip(*[(op_name, op['type']) for op_name, op in sg.ops.items()])
    assert names == ('fc1', 'relu1', 'fc2', 'relu2', 'fc3')
    assert types == expected_types
def test_sg_macs():
    '''Compare the MACs of different modules as computed by a SummaryGraph
    and model summary.'''
    import common
    sg = create_graph('imagenet', 'mobilenet')
    assert sg
    model, _ = common.setup_test('mobilenet', 'imagenet', parallel=False)
    df_compute = distiller.model_performance_summary(model, distiller.get_dummy_input('imagenet'))
    modules_macs = df_compute.loc[:, ['Name', 'MACs']]
    for name, mod in model.named_modules():
        if isinstance(mod, (nn.Conv2d, nn.Linear)):
            summary_macs = int(modules_macs.loc[modules_macs.Name == name].MACs)
            sg_macs = sg.find_op(name)['attrs']['MACs']
            assert summary_macs == sg_macs
def get_model_compute_budget(model, dataset, layers_to_prune=None):
    """Return the compute budget of the Convolution layers in an image-classifier.
    """
    dummy_input = distiller.get_dummy_input(dataset)
    g = SummaryGraph(model, dummy_input)
    total_macs = 0
    for name, m in model.named_modules():
        if isinstance(m, torch.nn.Conv2d):
            # Use the SummaryGraph to obtain some other details of the model
            conv_op = g.find_op(normalize_module_name(name))
            assert conv_op is not None
            total_macs += conv_op['attrs']['MACs']
    del g
    return total_macs
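# Usage sketch (illustrative, not part of the original source): computing the total
# convolution MACs budget of a CIFAR-10 model with get_model_compute_budget() above.
# create_model() and the 'resnet20_cifar' architecture are assumed to exist, as in the
# other examples in this file.
def _example_compute_budget():
    from distiller.models import create_model  # assumed import path
    model = create_model(False, 'cifar10', 'resnet20_cifar', parallel=False)
    total_macs = get_model_compute_budget(model, 'cifar10')
    print("Convolution MACs budget: {:,}".format(total_macs))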
def test_compute_summary():
    dataset = "cifar10"
    arch = "simplenet_cifar"
    model, _ = common.setup_test(arch, dataset, parallel=True)
    df_compute = distiller.model_performance_summary(model, distiller.get_dummy_input(dataset))
    module_macs = df_compute.loc[:, 'MACs'].to_list()
    #                     [conv1,  conv2,  fc1,   fc2,   fc3]
    assert module_macs == [352800, 240000, 48000, 10080, 840]

    dataset = "imagenet"
    arch = "mobilenet"
    model, _ = common.setup_test(arch, dataset, parallel=True)
    df_compute = distiller.model_performance_summary(model, distiller.get_dummy_input(dataset))
    module_macs = df_compute.loc[:, 'MACs'].to_list()
    expected_macs = [10838016, 3612672, 25690112, 1806336, 25690112, 3612672, 51380224,
                     903168, 25690112, 1806336, 51380224, 451584, 25690112, 903168,
                     51380224, 903168, 51380224, 903168, 51380224, 903168, 51380224,
                     903168, 51380224, 225792, 25690112, 451584, 51380224, 1024000]
    assert module_macs == expected_macs
def quantize_and_test_model(test_loader, model, criterion, args, loggers=None, scheduler=None, save_flag=True):
    """Collect stats using test_loader (when stats file is absent), clone the model
    and quantize the clone, and finally, test it.

    args.device is allowed to differ from the model's device.
    When args.qe_calibration is set to None, uses 0.05 instead.

    scheduler - pass scheduler to store it in checkpoint
    save_flag - defaults to save both quantization statistics and checkpoint.
    """
    if hasattr(model, 'quantizer_metadata') and \
            model.quantizer_metadata['type'] == distiller.quantization.PostTrainLinearQuantizer:
        raise RuntimeError('Trying to invoke post-training quantization on a model that has already been post-'
                           'train quantized. Model was likely loaded from a checkpoint. Please run again without '
                           'passing the --quantize-eval flag')
    if not (args.qe_dynamic or args.qe_stats_file or args.qe_config_file):
        args_copy = copy.deepcopy(args)
        args_copy.qe_calibration = args.qe_calibration if args.qe_calibration is not None else 0.05

        # Set stats into args stats field
        args.qe_stats_file = acts_quant_stats_collection(
            model, criterion, loggers, args_copy, save_to_file=save_flag)

    args_qe = copy.deepcopy(args)
    if args.device == 'cpu':
        # NOTE: Even though args.device is CPU, we allow here that model is not in CPU.
        qe_model = distiller.make_non_parallel_copy(model).cpu()
    else:
        qe_model = copy.deepcopy(model).to(args.device)

    quantizer = quantization.PostTrainLinearQuantizer.from_args(qe_model, args_qe)
    dummy_input = distiller.get_dummy_input(input_shape=model.input_shape)
    quantizer.prepare_model(dummy_input)

    if args.qe_convert_pytorch:
        qe_model = _convert_ptq_to_pytorch(qe_model, args_qe)

    test_res = test(test_loader, qe_model, criterion, loggers, args=args_qe)

    if save_flag:
        checkpoint_name = 'quantized'
        apputils.save_checkpoint(0, args_qe.arch, qe_model, scheduler=scheduler,
                                 name='_'.join([args_qe.name, checkpoint_name]) if args_qe.name else checkpoint_name,
                                 dir=msglogger.logdir, extras={'quantized_top1': test_res[0]})

    del qe_model
    return test_res
def test_merge_pad_avgpool():
    class ModelWithAvgPool(nn.Module):
        def __init__(self):
            super(ModelWithAvgPool, self).__init__()
            self.conv = nn.Conv2d(3, 10, 5)
            self.avgpool = nn.AvgPool2d(2)

        def forward(self, input):
            return self.avgpool(self.conv(input))

    m = ModelWithAvgPool()
    sg = SummaryGraph(m, distiller.get_dummy_input(input_shape=(1, 3, 50, 50)))

    avgpool_ops = [op_name for op_name in sg.ops if 'avgpool' in op_name]
    assert len(avgpool_ops) == 1
    assert sg.ops[avgpool_ops[0]]['name'] == 'avgpool'
    assert sg.ops[avgpool_ops[0]]['type'] == 'AveragePool'
def evaluate_model(model, criterion, test_loader, loggers, activations_collectors, args, scheduler=None):
    # This sample application can be invoked to evaluate the accuracy of your model on
    # the test dataset.
    # You can optionally quantize the model to 8-bit integer before evaluation.
    # For example:
    #   python3 compress_classifier.py --arch resnet20_cifar ../data.cifar10 -p=50 --resume-from=checkpoint.pth.tar --evaluate
    if not isinstance(loggers, list):
        loggers = [loggers]

    if args.quantize_eval:
        model.cpu()
        quantizer = quantization.PostTrainLinearQuantizer.from_args(model, args)
        quantizer.prepare_model(distiller.get_dummy_input(input_shape=model.input_shape))
        model.to(args.device)

    top1, _, _ = test(test_loader, model, criterion, loggers, activations_collectors, args=args)

    if args.quantize_eval:
        checkpoint_name = 'quantized'
        apputils.save_checkpoint(0, args.arch, model, optimizer=None, scheduler=scheduler,
                                 name='_'.join([args.name, checkpoint_name]) if args.name else checkpoint_name,
                                 dir=msglogger.logdir, extras={'quantized_top1': top1})
def collect_conv_details(model, dataset, perform_thinning, layers_to_prune=None):
    dummy_input = distiller.get_dummy_input(dataset)
    g = SummaryGraph(model, dummy_input)
    conv_layers = OrderedDict()
    total_macs = 0
    total_params = 0
    for id, (name, m) in enumerate(model.named_modules()):
        if isinstance(m, torch.nn.Conv2d):
            conv = SimpleNamespace()
            conv.t = len(conv_layers)
            conv.k = m.kernel_size[0]
            conv.stride = m.stride

            # Use the SummaryGraph to obtain some other details of the model
            conv_op = g.find_op(normalize_module_name(name))
            assert conv_op is not None

            conv.weights_vol = conv_op['attrs']['weights_vol']
            total_params += conv.weights_vol
            conv.macs = conv_op['attrs']['MACs']
            conv_pname = name + ".weight"
            conv_p = distiller.model_find_param(model, conv_pname)
            if not perform_thinning:
                #conv.macs *= distiller.density_ch(conv_p)   # Channel pruning
                conv.macs *= distiller.density_3D(conv_p)    # Filter pruning
            total_macs += conv.macs

            conv.ofm_h = g.param_shape(conv_op['outputs'][0])[2]
            conv.ofm_w = g.param_shape(conv_op['outputs'][0])[3]
            conv.ifm_h = g.param_shape(conv_op['inputs'][0])[2]
            conv.ifm_w = g.param_shape(conv_op['inputs'][0])[3]

            conv.name = name
            conv.id = id
            if layers_to_prune is None or name in layers_to_prune:
                conv_layers[len(conv_layers)] = conv
    return conv_layers, total_macs, total_params
def quantize_and_test_model(test_loader, model, criterion, args, loggers=None, scheduler=None, save_flag=True):
    """Collect stats using test_loader (when stats file is absent), clone the model
    and quantize the clone, and finally, test it.

    args.device is allowed to differ from the model's device.
    When args.qe_calibration is set to None, uses 0.05 instead.

    scheduler - pass scheduler to store it in checkpoint
    save_flag - defaults to save both quantization statistics and checkpoint.
    """
    if not (args.qe_dynamic or args.qe_stats_file or args.qe_config_file):
        args_copy = copy.deepcopy(args)
        args_copy.qe_calibration = args.qe_calibration if args.qe_calibration is not None else 0.05

        # Set stats into args stats field
        args.qe_stats_file = acts_quant_stats_collection(
            model, criterion, loggers, args_copy, save_to_file=save_flag)

    args_qe = copy.deepcopy(args)
    if args.device == 'cpu':
        # NOTE: Even though args.device is CPU, we allow here that model is not in CPU.
        qe_model = distiller.make_non_parallel_copy(model).cpu()
    else:
        qe_model = copy.deepcopy(model).to(args.device)

    quantizer = quantization.PostTrainLinearQuantizer.from_args(qe_model, args_qe)
    quantizer.prepare_model(distiller.get_dummy_input(input_shape=model.input_shape))

    test_res = test(test_loader, qe_model, criterion, loggers, args=args_qe)

    if save_flag:
        checkpoint_name = 'quantized'
        apputils.save_checkpoint(0, args_qe.arch, qe_model, scheduler=scheduler,
                                 name='_'.join([args_qe.name, checkpoint_name]) if args_qe.name else checkpoint_name,
                                 dir=msglogger.logdir, extras={'quantized_top1': test_res[0]})

    del qe_model
    return test_res
def arbitrary_channel_pruning(config, channels_to_remove, is_parallel):
    """Test removal of arbitrary channels.

    The test receives a specification of channels to remove.
    Based on this specification, the channels are pruned and then physically
    removed from the model (via a "thinning" process).
    """
    model, zeros_mask_dict = common.setup_test(config.arch, config.dataset, is_parallel)

    pair = config.module_pairs[0]
    conv2 = common.find_module_by_name(model, pair[1])
    assert conv2 is not None

    # Test that we can access the weights tensor of the first convolution in layer 1
    conv2_p = distiller.model_find_param(model, pair[1] + ".weight")
    assert conv2_p is not None

    assert conv2_p.dim() == 4
    num_channels = conv2_p.size(1)
    cnt_nnz_channels = num_channels - len(channels_to_remove)
    mask = create_channels_mask(conv2_p, channels_to_remove)
    assert distiller.density_ch(mask) == (conv2.in_channels - len(channels_to_remove)) / conv2.in_channels
    # Cool, so now we have a mask for pruning our channels.

    # Use the mask to prune
    zeros_mask_dict[pair[1] + ".weight"].mask = mask
    zeros_mask_dict[pair[1] + ".weight"].apply_mask(conv2_p)
    all_channels = set([ch for ch in range(num_channels)])
    nnz_channels = set(distiller.non_zero_channels(conv2_p))
    channels_removed = all_channels - nnz_channels
    logger.info("Channels removed {}".format(channels_removed))

    # Now, let's do the actual network thinning
    distiller.remove_channels(model, zeros_mask_dict, config.arch, config.dataset, optimizer=None)
    conv1 = common.find_module_by_name(model, pair[0])
    assert conv1
    assert conv1.out_channels == cnt_nnz_channels
    assert conv2.in_channels == cnt_nnz_channels
    assert conv1.weight.size(0) == cnt_nnz_channels
    assert conv2.weight.size(1) == cnt_nnz_channels
    if config.bn_name is not None:
        bn1 = common.find_module_by_name(model, config.bn_name)
        assert bn1.running_var.size(0) == cnt_nnz_channels
        assert bn1.running_mean.size(0) == cnt_nnz_channels
        assert bn1.num_features == cnt_nnz_channels
        assert bn1.bias.size(0) == cnt_nnz_channels
        assert bn1.weight.size(0) == cnt_nnz_channels

    dummy_input = distiller.get_dummy_input(config.dataset, distiller.model_device(model))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.1)
    run_forward_backward(model, optimizer, dummy_input)

    # Let's test saving and loading a thinned model.
    # We save 3 times, and load twice, to make sure to cover some corner cases:
    #   - Make sure that after loading, the model still has hold of the thinning recipes
    #   - Make sure that after a 2nd load, there's no problem loading (in this case, the
    #     tensors are already thin, so this is a new flow)
    # (1)
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None)
    model_2 = create_model(False, config.dataset, config.arch, parallel=is_parallel)
    model(dummy_input)
    model_2(dummy_input)
    conv2 = common.find_module_by_name(model_2, pair[1])
    assert conv2 is not None
    model_2 = load_lean_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')

    run_forward_backward(model, optimizer, dummy_input)

    # (2)
    compression_scheduler = distiller.CompressionScheduler(model)
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None, scheduler=compression_scheduler)
    model_2 = load_lean_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done")

    # (3)
    save_checkpoint(epoch=0, arch=config.arch, model=model_2, optimizer=None, scheduler=compression_scheduler)
    model_2 = load_lean_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done 2")
def _create_graph(dataset, model):
    dummy_input = distiller.get_dummy_input(dataset, distiller.model_device(model))
    return SummaryGraph(model, dummy_input)
def create_graph(input_shape, model):
    dummy_input = distiller.get_dummy_input(device=distiller.model_device(model),
                                            input_shape=input_shape)
    return SummaryGraph(model, dummy_input)
def create_graph(dataset, arch, parallel=False):
    dummy_input = distiller.get_dummy_input(dataset)
    model = create_model(False, dataset, arch, parallel)
    assert model is not None
    return SummaryGraph(model, dummy_input)
def test_adjacency_map(parallel, dedicated_modules):
    class TestModel(nn.Module):
        def __init__(self):
            super(TestModel, self).__init__()
            self.conv = nn.Conv2d(3, 10, 5)
            self.bn = nn.BatchNorm2d(10)
            self.post_conv_bn = nn.ModuleList([nn.Tanh(), nn.ReLU()])

        def forward(self, x):
            res = self.conv(x)
            y = self.bn(res)
            for m in self.post_conv_bn:
                y = m(y)
            return y + res

    def check_adj_entry(actual, expected):
        assert actual.op_meta == expected.op_meta
        assert actual.predecessors == expected.predecessors
        assert actual.successors == expected.successors

    prefix = 'module.' if parallel else ''

    m = TestModel()
    if parallel:
        m = nn.DataParallel(m)
    sg = SummaryGraph(m, distiller.get_dummy_input(input_shape=(1, 3, 10, 10)))

    adj_map = sg.adjacency_map(dedicated_modules_only=dedicated_modules)

    if dedicated_modules:
        assert len(adj_map) == 4
    else:
        assert len(adj_map) == 5

    conv_op_meta = OpSimpleMetadata(prefix + 'conv', 'Conv')
    bn_op_meta = OpSimpleMetadata(prefix + 'bn', 'BatchNormalization')
    tanh_op_meta = OpSimpleMetadata(prefix + 'post_conv_bn.0', 'Tanh')
    relu_op_meta = OpSimpleMetadata(prefix + 'post_conv_bn.1', 'Relu')
    add_op_meta = OpSimpleMetadata('top_level_op', 'Add')

    name = conv_op_meta.name
    assert name in adj_map
    expected = AdjacentsEntry(conv_op_meta)
    expected.successors = [bn_op_meta] if dedicated_modules else [bn_op_meta, add_op_meta]
    check_adj_entry(adj_map[name], expected)

    name = bn_op_meta.name
    assert name in adj_map
    expected = AdjacentsEntry(bn_op_meta)
    expected.predecessors = [conv_op_meta]
    expected.successors = [tanh_op_meta]
    check_adj_entry(adj_map[name], expected)

    name = tanh_op_meta.name
    assert name in adj_map
    expected = AdjacentsEntry(tanh_op_meta)
    expected.predecessors = [bn_op_meta]
    expected.successors = [relu_op_meta]
    check_adj_entry(adj_map[name], expected)

    name = relu_op_meta.name
    assert name in adj_map
    expected = AdjacentsEntry(relu_op_meta)
    expected.predecessors = [tanh_op_meta]
    expected.successors = [] if dedicated_modules else [add_op_meta]
    check_adj_entry(adj_map[name], expected)

    name = add_op_meta.name
    if dedicated_modules:
        assert name not in adj_map
    else:
        assert name in adj_map
        expected = AdjacentsEntry(add_op_meta)
        expected.predecessors = [relu_op_meta, conv_op_meta]
        check_adj_entry(adj_map[name], expected)
def create_graph(dataset, model, arch=None):
    dummy_input = distiller.get_dummy_input(dataset, distiller.model_device(model), model_name=arch)
    return SummaryGraph(model, dummy_input)
def get_network_details(model, dataset, dependency_type, layers_to_prune=None):
    def make_conv(model, conv_module, g, name, seq_id, layer_id):
        conv = SimpleNamespace()
        conv.name = name
        conv.id = layer_id
        conv.t = seq_id
        conv.k = conv_module.kernel_size[0]
        conv.stride = conv_module.stride

        # Use the SummaryGraph to obtain some other details of the model
        conv_op = g.find_op(normalize_module_name(name))
        assert conv_op is not None

        conv.weights_vol = conv_op['attrs']['weights_vol']
        conv.macs = conv_op['attrs']['MACs']
        conv.n_ofm = conv_op['attrs']['n_ofm']
        conv.n_ifm = conv_op['attrs']['n_ifm']
        conv_pname = name + ".weight"
        conv_p = distiller.model_find_param(model, conv_pname)
        conv.ofm_h = g.param_shape(conv_op['outputs'][0])[2]
        conv.ofm_w = g.param_shape(conv_op['outputs'][0])[3]
        conv.ifm_h = g.param_shape(conv_op['inputs'][0])[2]
        conv.ifm_w = g.param_shape(conv_op['inputs'][0])[3]
        return conv

    def make_fc(model, fc_module, g, name, seq_id, layer_id):
        fc = SimpleNamespace()
        fc.name = name
        fc.id = layer_id
        fc.t = seq_id

        # Use the SummaryGraph to obtain some other details of the model
        fc_op = g.find_op(normalize_module_name(name))
        assert fc_op is not None

        fc.weights_vol = fc_op['attrs']['weights_vol']
        fc.macs = fc_op['attrs']['MACs']
        fc.n_ofm = fc_op['attrs']['n_ofm']
        fc.n_ifm = fc_op['attrs']['n_ifm']
        fc_pname = name + ".weight"
        fc_p = distiller.model_find_param(model, fc_pname)
        return fc

    dummy_input = distiller.get_dummy_input(dataset)
    g = SummaryGraph(model, dummy_input)
    all_layers = OrderedDict()
    pruned_indices = list()
    dependent_layers = set()
    total_macs = 0
    total_params = 0
    layers = OrderedDict({mod_name: m for mod_name, m in model.named_modules()
                          if isinstance(m, (torch.nn.Conv2d, torch.nn.Linear))})
    for layer_id, (name, m) in enumerate(layers.items()):
        if isinstance(m, torch.nn.Conv2d):
            conv = make_conv(model, m, g, name, seq_id=len(pruned_indices), layer_id=layer_id)
            all_layers[layer_id] = conv
            total_params += conv.weights_vol
            total_macs += conv.macs

            if layers_to_prune is None or name in layers_to_prune:
                pruned_indices.append(layer_id)
                # Find the data-dependent layers of this convolution
                from utils.data_dependencies import find_dependencies
                conv.dependencies = list()
                find_dependencies(dependency_type, g, all_layers, name, conv.dependencies)
                dependent_layers.add(tuple(conv.dependencies))
        elif isinstance(m, torch.nn.Linear):
            fc = make_fc(model, m, g, name, seq_id=len(pruned_indices), layer_id=layer_id)
            all_layers[layer_id] = fc
            total_macs += fc.macs
            total_params += fc.weights_vol

    def convert_layer_names_to_indices(layer_names):
        """Args:
               layer_names - list of layer names
           Returns:
               list of layer indices
        """
        layer_indices = [index for name in layer_names
                         for index, layer in all_layers.items() if layer.name == name[0]]
        return layer_indices

    dependent_indices = convert_layer_names_to_indices(dependent_layers)
    return all_layers, pruned_indices, dependent_indices, total_macs, total_params
def ranked_filter_pruning(config, ratio_to_prune, is_parallel, rounding_fn=math.floor):
    """Test L1 ranking and pruning of filters.

    First we rank and prune the filters of a Convolutional layer using
    a L1RankedStructureParameterPruner.  Then we physically remove the filters
    from the model (via a "thinning" process).
    """
    logger.info("executing: %s (invoked by %s)" % (inspect.currentframe().f_code.co_name,
                                                   inspect.currentframe().f_back.f_code.co_name))

    model, zeros_mask_dict = common.setup_test(config.arch, config.dataset, is_parallel)

    for pair in config.module_pairs:
        # Test that we can access the weights tensor of the first convolution in layer 1
        conv1_p = distiller.model_find_param(model, pair[0] + ".weight")
        assert conv1_p is not None
        num_filters = conv1_p.size(0)

        # Test that there are no zero-filters
        assert distiller.sparsity_3D(conv1_p) == 0.0

        # Create a filter-ranking pruner
        pruner = distiller.pruning.L1RankedStructureParameterPruner("filter_pruner",
                                                                    group_type="Filters",
                                                                    desired_sparsity=ratio_to_prune,
                                                                    weights=pair[0] + ".weight",
                                                                    rounding_fn=rounding_fn)
        pruner.set_param_mask(conv1_p, pair[0] + ".weight", zeros_mask_dict, meta=None)

        conv1 = common.find_module_by_name(model, pair[0])
        assert conv1 is not None
        # Test that the mask has the correct fraction of filters pruned.
        # We asked for 10%, but there are only 16 filters, so we have to settle for 1/16 filters
        expected_cnt_removed_filters = int(ratio_to_prune * conv1.out_channels)
        expected_pruning = expected_cnt_removed_filters / conv1.out_channels
        masker = zeros_mask_dict[pair[0] + ".weight"]
        assert masker is not None
        assert distiller.sparsity_3D(masker.mask) == expected_pruning

        # Use the mask to prune
        assert distiller.sparsity_3D(conv1_p) == 0
        masker.apply_mask(conv1_p)
        assert distiller.sparsity_3D(conv1_p) == expected_pruning

        # Remove filters
        conv2 = common.find_module_by_name(model, pair[1])
        assert conv2 is not None
        assert conv1.out_channels == num_filters
        assert conv2.in_channels == num_filters

    # Test thinning
    distiller.remove_filters(model, zeros_mask_dict, config.arch, config.dataset, optimizer=None)
    assert conv1.out_channels == num_filters - expected_cnt_removed_filters
    assert conv2.in_channels == num_filters - expected_cnt_removed_filters

    # Test the thinned model
    dummy_input = distiller.get_dummy_input(config.dataset, distiller.model_device(model))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.1)
    run_forward_backward(model, optimizer, dummy_input)

    return model, zeros_mask_dict