def step(self, action: float, model: Module):
    """Advance the environment by one layer after the last pruning was applied.

    Parameters
    ----------
    action
        Sparsity the agent requested for the current layer. It is recomputed
        below from the measured statistics, so the realized (not requested)
        sparsity is what gets recorded and returned.
    model
        The model after the pruning decided by the last action was applied.

    Returns
    -------
    tuple
        ``(realized_action, reward, observation, done)`` where reward is
        always 0 here (the final reward is assigned elsewhere) and ``done``
        is whether the final prunable layer has been reached.
    """
    # Re-measure per-layer flops/params on the pruned model, keyed by name.
    _, _, current_statistics = count_flops_params(model, self.dummy_input, verbose=False)
    current_statistics = {
        result['name']: result for result in current_statistics
    }
    index = self.pruning_op_names.index(self.current_op_name)
    # Realized sparsity of the current layer: 1 - remaining/previous target.
    action = 1 - current_statistics[self.current_op_name][
        self.target] / self.current_op_target
    # Remaining target (flops or params) summed over all prunable layers.
    total_current_target = sum([
        current_statistics[name][self.target]
        for name in self.pruning_op_names
    ])
    # Amount already pruned away so far, across all layers.
    previous_pruning_target = self.under_pruning_target - total_current_target
    # Target still held by the layers after the current one.
    rest_target = sum([
        current_statistics[name][self.target]
        for name in self.pruning_op_names[index + 1:]
    ])
    # Refresh the last three embedding features of this layer.
    self.layer_embedding[index][
        -3] = previous_pruning_target / self.under_pruning_target  # reduced
    self.layer_embedding[index][
        -2] = rest_target / self.under_pruning_target  # rest
    self.layer_embedding[index][-1] = action  # last action
    # NOTE(review): layer_embedding is indexed as [index, :] here, so it is
    # presumably a 2-D numpy array -- confirm against where it is built.
    observation = self.layer_embedding[index, :].copy()
    return action, 0, observation, self.is_final_layer()
def count_flops(model, log=None, device=None):
    """Report FLOPs and parameter count of *model* on a 1x3x256x256 input.

    Prints the measurement, optionally appends it to *log* (any object with a
    ``write`` method), and returns ``(flops, params)``.
    """
    sample = torch.rand([1, 3, 256, 256])
    if device is not None:
        sample = sample.to(device)
    flops, params, _ = count_flops_params(model, sample)
    message = f"FLOPs: {flops}, params: {params}"
    print(message)
    if log is not None:
        log.write(message + "\n")
    return flops, params
def __init__(self, model: Module, config_list: List[Dict], dummy_input: Tensor,
             total_sparsity: float, max_sparsity_per_layer: Dict[str, float],
             target: str = 'flops'):
    """Build the pruning-environment state from the model and config list.

    Parameters
    ----------
    model
        The model to be pruned.
    config_list
        NNI compression config list; canonicalized to obtain the op names
        subject to pruning.
    dummy_input
        Input used to measure flops/params via ``count_flops_params``.
    total_sparsity
        Overall fraction of ``target`` to prune away across all layers.
    max_sparsity_per_layer
        Per-layer sparsity upper bound, keyed by op name.
    target
        Metric to prune against, ``'flops'`` or ``'params'``.

    Raises
    ------
    ValueError
        If ``target`` is not ``'flops'`` or ``'params'``.
    """
    # Validate eagerly with a real exception: the original `assert` would be
    # silently stripped under `python -O`.
    if target not in ('flops', 'params'):
        raise ValueError("target should be 'flops' or 'params', got %r" % (target,))

    # Collect all op names subject to pruning. (A plain loop replaces the
    # original side-effect-only list comprehension.)
    pruning_op_names = []
    for config in config_list_canonical(model, config_list):
        pruning_op_names.extend(config['op_names'])

    # For each prunable layer record (module index, type, geometric-mean
    # stride, geometric-mean kernel size); layers without those attributes
    # fall back to 0 / 1 respectively.
    self.pruning_ops = OrderedDict()
    self.pruning_types = []
    for i, (name, layer) in enumerate(model.named_modules()):
        if name in pruning_op_names:
            op_type = type(layer).__name__
            stride = np.power(np.prod(layer.stride), 1 / len(layer.stride)) if hasattr(layer, 'stride') else 0
            kernel_size = np.power(np.prod(layer.kernel_size), 1 / len(layer.kernel_size)) if hasattr(layer, 'kernel_size') else 1
            self.pruning_ops[name] = (i, op_type, stride, kernel_size)
            self.pruning_types.append(op_type)
    self.pruning_types = list(set(self.pruning_types))
    self.pruning_op_names = list(self.pruning_ops.keys())

    self.dummy_input = dummy_input
    self.total_sparsity = total_sparsity
    self.max_sparsity_per_layer = max_sparsity_per_layer
    self.target = target

    # Baseline statistics of the unpruned model, keyed by layer name.
    self.origin_target, self.origin_params_num, self.origin_statistics = count_flops_params(
        model, dummy_input, verbose=False)
    self.origin_statistics = {
        result['name']: result for result in self.origin_statistics
    }

    # Total prunable amount of `target`, and how much of it we aim to remove.
    self.under_pruning_target = sum(
        self.origin_statistics[name][self.target] for name in self.pruning_op_names)
    self.excepted_pruning_target = self.total_sparsity * self.under_pruning_target
def correct_action(self, action: float, model: Module):
    """Clamp the agent's raw action into the feasible sparsity range for the
    next layer to be pruned.

    The lower bound guarantees the global pruning goal stays reachable even
    if every later layer is pruned at its per-layer maximum; the upper bound
    keeps this layer within its own cap and within what is left to prune
    globally.

    Parameters
    ----------
    action
        Raw sparsity proposed by the agent (presumably in [0, 1] -- confirm
        against the agent implementation).
    model
        Current (partially pruned) model, used to measure fresh statistics.

    Returns
    -------
    float
        The corrected action.
    """
    try:
        # Advance to the next layer to prune; StopIteration here would mean
        # the layer iterator was exhausted unexpectedly.
        op_name = next(self.ops_iter)
        index = self.pruning_op_names.index(op_name)

        # Fresh statistics of the partially pruned model, keyed by layer name.
        _, _, current_statistics = count_flops_params(model, self.dummy_input, verbose=False)
        current_statistics = {
            result['name']: result for result in current_statistics
        }
        total_current_target = sum([
            current_statistics[name][self.target]
            for name in self.pruning_op_names
        ])
        # How much of the target metric has been pruned away already.
        previous_pruning_target = self.under_pruning_target - total_current_target
        # The most the remaining (later) layers could still contribute.
        max_rest_pruning_target = sum([
            current_statistics[name][self.target] * self.max_sparsity_per_layer[name]
            for name in self.pruning_op_names[index + 1:]
        ])
        # Lower bound: this layer must prune at least enough that the global
        # goal remains reachable.
        min_current_pruning_target = self.excepted_pruning_target - previous_pruning_target - max_rest_pruning_target
        # Upper bound 1: this layer's per-layer cap, minus what it already lost.
        max_current_pruning_target_1 = self.origin_statistics[op_name][
            self.target] * self.max_sparsity_per_layer[op_name] - (
                self.origin_statistics[op_name][self.target] -
                current_statistics[op_name][self.target])
        # Upper bound 2: do not overshoot the global goal.
        max_current_pruning_target_2 = self.excepted_pruning_target - previous_pruning_target
        max_current_pruning_target = min(max_current_pruning_target_1,
                                         max_current_pruning_target_2)
        # Convert absolute targets into sparsity ratios of the layer's current size.
        min_action = min_current_pruning_target / current_statistics[op_name][self.target]
        max_action = max_current_pruning_target / current_statistics[op_name][self.target]
        if min_action > self.max_sparsity_per_layer[op_name]:
            _logger.warning(
                '[%s] min action > max sparsity per layer: %f > %f',
                op_name, min_action, self.max_sparsity_per_layer[op_name])
        # Clip: at least min_action, at most max_action, never negative.
        action = max(0., min(max_action, max(min_action, action)))
        # Remember which layer the (corrected) action applies to.
        self.current_op_name = op_name
        self.current_op_target = current_statistics[op_name][self.target]
    except StopIteration:
        # NOTE(review): consider `raise Error(...) from None` (or chaining the
        # original exception) to make the cause explicit.
        raise Error('Something goes wrong, this should not happen.')
    return action
def _calculate_flops(self, eps=0.001):
    """FLOPs cost.

    Build a per-layer lookup table mapping each candidate op name to its
    measured FLOPs. An exact-zero measurement is replaced by ``eps`` so that
    downstream arithmetic never divides by / takes log of zero.

    Returns
    -------
    list[dict]
        One dict per layer (``self.cnt_layers`` of them), keyed by op name.
    """
    flops_lut = [{} for i in range(self.cnt_layers)]
    layer_id = 0

    for stage_name in self.lut_ops:
        stage_ops = self.lut_ops[stage_name]
        ops_num = self.layer_num[stage_name]

        for _ in range(ops_num):
            for op_name in stage_ops:
                # Instantiate the candidate op for this layer's configuration.
                # NOTE(review): layer_config is assumed to be
                # (in_ch, out_ch, stride, fm_size, ...) -- confirm upstream.
                layer_config = self.layer_configs[layer_id]
                key_params = {"fm_size": layer_config[3]}
                op = stage_ops[op_name](*layer_config[0:3], **key_params)

                # measured in Flops
                in_shape = self.layer_in_shapes[layer_id]
                x = (1, in_shape[0], in_shape[1], in_shape[2])
                flops, _, _ = count_flops_params(op, x, verbose=False)
                flops = eps if flops == 0.0 else flops
                flops_lut[layer_id][op_name] = float(flops)

            # Advance once per layer, after measuring every candidate op.
            layer_id += 1

    return flops_lut
def main(args):
    """End-to-end MNIST compression demo.

    Pipeline: pretrain (or load) -> L1-filter pruning -> mask speedup ->
    finetuning -> QAT quantization -> quantization-aware training ->
    TensorRT speedup + evaluation.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # MNIST data pipeline (standard MNIST normalization constants).
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=transform),
        batch_size=64,
    )
    test_loader = torch.utils.data.DataLoader(datasets.MNIST(
        'data', train=False, transform=transform),
                                              batch_size=1000)

    # Step1. Model Pretraining
    model = NaiveModel().to(device)
    criterion = torch.nn.NLLLoss()
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
    flops, params, _ = count_flops_params(model, (1, 1, 28, 28), verbose=False)

    if args.pretrained_model_dir is None:
        # No checkpoint given: train from scratch, keeping the best epoch.
        args.pretrained_model_dir = os.path.join(args.experiment_data_dir,
                                                 f'pretrained.pth')

        best_acc = 0
        for epoch in range(args.pretrain_epochs):
            train(args, model, device, train_loader, criterion, optimizer, epoch)
            scheduler.step()
            acc = test(args, model, device, criterion, test_loader)
            if acc > best_acc:
                best_acc = acc
                state_dict = model.state_dict()

        # Restore the best-performing weights and persist them.
        model.load_state_dict(state_dict)
        torch.save(state_dict, args.pretrained_model_dir)
        print(f'Model saved to {args.pretrained_model_dir}')
    else:
        # Reuse an existing checkpoint and just evaluate it.
        state_dict = torch.load(args.pretrained_model_dir)
        model.load_state_dict(state_dict)
        best_acc = test(args, model, device, criterion, test_loader)

    dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
    time_cost = get_model_time_cost(model, dummy_input)

    # 125.49 M, 0.85M, 93.29, 1.1012
    print(
        f'Pretrained model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}'
    )

    # Step2. Model Pruning
    config_list = [{'sparsity': args.sparsity, 'op_types': ['Conv2d']}]

    kw_args = {}
    if args.dependency_aware:
        dummy_input = torch.randn([1000, 1, 28, 28]).to(device)
        print('Enable the dependency_aware mode')
        # note that, not all pruners support the dependency_aware mode
        kw_args['dependency_aware'] = True
        kw_args['dummy_input'] = dummy_input

    pruner = L1FilterPruner(model, config_list, **kw_args)
    model = pruner.compress()
    pruner.get_pruned_weights()

    mask_path = os.path.join(args.experiment_data_dir, 'mask.pth')
    model_path = os.path.join(args.experiment_data_dir, 'pruned.pth')
    pruner.export_model(model_path=model_path, mask_path=mask_path)
    pruner._unwrap_model()  # unwrap all modules to normal state

    # Step3. Model Speedup
    m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
    m_speedup.speedup_model()
    print('model after speedup', model)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=False)
    acc = test(args, model, device, criterion, test_loader)
    time_cost = get_model_time_cost(model, dummy_input)
    print(
        f'Pruned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {acc: .2f}, Time Cost: {time_cost}'
    )

    # Step4. Model Finetuning
    optimizer = optim.Adadelta(model.parameters(), lr=args.pretrain_lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

    best_acc = 0
    for epoch in range(args.finetune_epochs):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    # Keep the best finetuned weights.
    model.load_state_dict(state_dict)
    save_path = os.path.join(args.experiment_data_dir, f'finetuned.pth')
    torch.save(state_dict, save_path)

    flops, params, _ = count_flops_params(model, dummy_input, verbose=True)
    time_cost = get_model_time_cost(model, dummy_input)

    # FLOPs 28.48 M, #Params: 0.18M, Accuracy: 89.03, Time Cost: 1.03
    print(
        f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}, Time Cost: {time_cost}'
    )
    print(f'Model saved to {save_path}')

    # Step5. Model Quantization via QAT
    # 8-bit weights+outputs for both convs; 8-bit outputs for both relus.
    config_list = [{
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'output'],
        'quant_bits': {
            'weight': 8,
            'output': 8
        },
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {
            'output': 8
        },
        'op_names': ['relu2']
    }]

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, config_list, optimizer)
    quantizer.compress()

    # Step6. Quantization Aware Training
    best_acc = 0
    for epoch in range(1):
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        acc = test(args, model, device, criterion, test_loader)
        if acc > best_acc:
            best_acc = acc
            state_dict = model.state_dict()

    # Export calibration parameters needed by the TensorRT engine below.
    # NOTE(review): `model_path` here still points at the pruned (pre-QAT)
    # checkpoint path from Step2 -- confirm this is intended.
    calibration_path = os.path.join(args.experiment_data_dir, 'calibration.pth')
    calibration_config = quantizer.export_model(model_path, calibration_path)
    print("calibration_config: ", calibration_config)

    # Step7. Model Speedup
    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)
    engine = ModelSpeedupTensorRT(model,
                                  input_shape,
                                  config=calibration_config,
                                  batchsize=32)
    engine.compress()
    test_trt(engine, test_loader)
model, total_epoch=args.pretrain_epochs) criterion = torch.nn.CrossEntropyLoss() pre_best_acc = 0.0 best_state_dict = None for i in range(args.pretrain_epochs): trainer(model, optimizer, criterion) scheduler.step() acc = evaluator(model) if acc > pre_best_acc: pre_best_acc = acc best_state_dict = model.state_dict() print("Best accuracy: {}".format(pre_best_acc)) model.load_state_dict(best_state_dict) pre_flops, pre_params, _ = count_flops_params( model, torch.randn([128, 3, 32, 32]).to(device)) g_epoch = 0 # Start to prune and speedup print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50) config_list = [{ 'total_sparsity': 0.5, 'op_types': ['Conv2d'], }] # make sure you have used nni.trace to wrap the optimizer class before initialize traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9,
def main(args):
    """Prune a model, fine-tune it, then (optionally) apply mask speedup and
    benchmark masked vs. sped-up inference latency.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # prepare model and data
    train_loader, test_loader, criterion = get_data(args.dataset, args.data_dir,
                                                    args.batch_size,
                                                    args.test_batch_size)
    model, optimizer, scheduler = get_model_optimizer_scheduler(
        args, device, train_loader, test_loader, criterion)

    dummy_input = get_dummy_input(args, device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")

    print('start pruning...')

    model_path = os.path.join(
        args.experiment_data_dir,
        'pruned_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    mask_path = os.path.join(
        args.experiment_data_dir,
        'mask_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))

    pruner = get_pruner(model, args.pruner, device, optimizer,
                        args.dependency_aware)
    model = pruner.compress()

    if args.multi_gpu and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # NOTE(review): with --test_only the run still falls through into the
    # fine-tuning loop below -- confirm whether an early return is intended.
    if args.test_only:
        test(args, model, device, criterion, test_loader)

    best_top1 = 0
    for epoch in range(args.fine_tune_epochs):
        pruner.update_epoch(epoch)
        print('# Epoch {} #'.format(epoch))
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        top1 = test(args, model, device, criterion, test_loader)
        if top1 > best_top1:
            best_top1 = top1
            # Export the best model, 'model_path' stores state_dict of the
            # pruned model, mask_path stores mask_dict of the pruned model
            pruner.export_model(model_path=model_path, mask_path=mask_path)

    if args.nni:
        nni.report_final_result(best_top1)

    if args.speed_up:
        # reload the best checkpoint for speed-up
        args.pretrained_model_dir = model_path
        model, _, _ = get_model_optimizer_scheduler(args, device, train_loader,
                                                    test_loader, criterion)
        model.eval()
        apply_compression_results(model, mask_path, device)

        # Benchmark: 32 forward passes with masks still applied.
        start = time.time()
        for _ in range(32):
            use_mask_out = model(dummy_input)
        print('elapsed time when use mask: ', time.time() - start)

        # Replace masked modules with physically smaller ones.
        m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
        m_speedup.speedup_model()
        flops, params, results = count_flops_params(model, dummy_input)
        print(f"FLOPs: {flops}, params: {params}")

        # Benchmark: 32 forward passes on the sped-up model.
        start = time.time()
        for _ in range(32):
            use_speedup_out = model(dummy_input)
        print('elapsed time when use speedup: ', time.time() - start)

        top1 = test(args, model, device, criterion, test_loader)
def main():
    """Finetune a BERT-style model on a GLUE task, prune attention heads with
    TransformerHeadPruner, optionally speed it up via the transformers
    ``_prune_heads`` workaround, then finetune again and report FLOPs/params.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parse_args()

    #########################################################################
    # Prepare model, tokenizer, dataset, optimizer, and the scheduler
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # Load dataset and tokenizer, and then preprocess the dataset
    raw_dataset, is_regression, num_labels = get_raw_dataset(args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    processed_datasets = preprocess(args, tokenizer, raw_dataset)
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation_matched" if args.task_name ==
                                      "mnli" else "validation"]

    # Load pretrained model
    config = AutoConfig.from_pretrained(args.model_name,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name,
                                                               config=config)
    model.to(device)

    #########################################################################
    # Finetune on the target GLUE task before pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)
    train_steps = args.num_train_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type,
                                 optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps,
                                 num_training_steps=train_steps)
    metric = load_metric("glue", args.task_name)

    logger.info("================= Finetuning before pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader,
                optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(),
                   args.output_dir + "/model_before_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric,
                            data_collator)

    #########################################################################
    # Pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)
    dummy_input = next(iter(train_dataloader))["input_ids"].to(device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(
        f"Initial model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M"
    )

    # Here criterion is embedded in the model. Upper levels can just pass None to trainer.
    def trainer(model, optimizer, criterion, epoch):
        return trainer_helper(model, train_dataloader, optimizer, device)

    def forward_runner(model):
        return forward_runner_helper(model, train_dataloader, device)

    # example: prune different layers with different sparsity
    attention_name_groups = list(
        zip([
            "bert.encoder.layer.{}.attention.self.query".format(i)
            for i in range(12)
        ], [
            "bert.encoder.layer.{}.attention.self.key".format(i)
            for i in range(12)
        ], [
            "bert.encoder.layer.{}.attention.self.value".format(i)
            for i in range(12)
        ], [
            "bert.encoder.layer.{}.attention.output.dense".format(i)
            for i in range(12)
        ]))
    kwargs = {
        "ranking_criterion": args.ranking_criterion,
        "global_sort": args.global_sort,
        "num_iterations": args.num_iterations,
        "epochs_per_iteration": args.epochs_per_iteration,
        "attention_name_groups": attention_name_groups,
        "head_hidden_dim": 64,
        "trainer": trainer,
        "optimizer": optimizer,
        "forward_runner": forward_runner
    }
    # First 6 encoder layers pruned at args.sparsity, last 6 at half of it.
    config_list = [{
        "sparsity": args.sparsity,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[:6] for x in layer]
    }, {
        "sparsity": args.sparsity / 2,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[6:] for x in layer]
    }]

    pruner = TransformerHeadPruner(model, config_list, **kwargs)
    pruner.compress()

    #########################################################################
    # uncomment the following part to export the pruned model masks
    # model_path = os.path.join(args.output_dir, "pruned_{}_{}.pth".format(args.model_name, args.task_name))
    # mask_path = os.path.join(args.output_dir, "mask_{}_{}.pth".format(args.model_name, args.task_name))
    # pruner.export_model(model_path=model_path, mask_path=mask_path)

    #########################################################################
    # Speedup
    # Currently, speeding up Transformers through NNI ModelSpeedup is not supported because of shape inference issues.
    # However, if you are using the transformers library, you can use the following workaround:
    # The following code gets the head pruning decisions from the pruner and calls the _prune_heads() function
    # implemented in models from the transformers library to speed up the model.
    if args.speed_up:
        speedup_rules = {}
        for group_idx, group in enumerate(pruner.attention_name_groups):
            # get the layer index
            layer_idx = None
            for part in group[0].split("."):
                # FIX: catch only the expected failure of int() on non-numeric
                # parts; the original bare `except:` also swallowed
                # KeyboardInterrupt/SystemExit.
                try:
                    layer_idx = int(part)
                    break
                except ValueError:
                    continue
            if layer_idx is not None:
                speedup_rules[layer_idx] = pruner.pruned_heads[group_idx]
        pruner._unwrap_model()
        model.bert._prune_heads(speedup_rules)
        print(model)

    #########################################################################
    # After pruning, finetune again on the target task
    # Get the metric function
    metric = load_metric("glue", args.task_name)

    # re-initialize the optimizer and the scheduler
    optimizer, _, _, data_collator = get_dataloader_and_optimizer(
        args, tokenizer, model, train_dataset, eval_dataset)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type,
                                 optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps,
                                 num_training_steps=train_steps)

    logger.info("================= Finetuning after Pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader,
                optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(),
                   args.output_dir + "/model_after_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric,
                            data_collator)

    flops, params, results = count_flops_params(model, dummy_input)
    print(
        f"Final model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
def main(args):
    """Run the selected pruner on the model, export masks, optionally apply
    ModelSpeedup, then fine-tune and report FLOPs/params/accuracy.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    os.makedirs(args.experiment_data_dir, exist_ok=True)

    # prepare model and data
    train_loader, test_loader, criterion = get_data(args.dataset, args.data_dir,
                                                    args.batch_size,
                                                    args.test_batch_size)
    model, optimizer, scheduler = get_model_optimizer_scheduler(
        args, device, train_loader, test_loader, criterion)

    dummy_input = get_dummy_input(args, device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"FLOPs: {flops}, params: {params}")

    print(f'start {args.pruner} pruning...')

    def trainer(model, optimizer, criterion, epoch):
        return train(args, model, device, train_loader, criterion, optimizer,
                     epoch=epoch)

    pruner_cls = str2pruner[args.pruner]

    kw_args = {}
    config_list = [{'sparsity': args.sparsity, 'op_types': ['Conv2d']}]

    if args.pruner == 'level':
        config_list = [{'sparsity': args.sparsity, 'op_types': ['default']}]
    else:
        # Structured pruners: build pruner-specific keyword arguments.
        if args.dependency_aware:
            dummy_input = get_dummy_input(args, device)
            print('Enable the dependency_aware mode')
            # note that, not all pruners support the dependency_aware mode
            kw_args['dependency_aware'] = True
            kw_args['dummy_input'] = dummy_input
        if args.pruner not in ('l1filter', 'l2filter', 'fpgm'):
            # set only work for training aware pruners
            kw_args['trainer'] = trainer
            kw_args['optimizer'] = optimizer
            kw_args['criterion'] = criterion

        if args.pruner in ('mean_activation', 'apoz', 'taylorfo'):
            kw_args['sparsifying_training_batches'] = 1

        if args.pruner == 'slim':
            kw_args['sparsifying_training_epochs'] = 1

        if args.pruner == 'agp':
            kw_args['pruning_algorithm'] = 'l1'
            kw_args['num_iterations'] = 2
            kw_args['epochs_per_iteration'] = 1

        # Reproduced result in paper 'PRUNING FILTERS FOR EFFICIENT CONVNETS',
        # Conv_1, Conv_8, Conv_9, Conv_10, Conv_11, Conv_12 are pruned with 50% sparsity, as 'VGG-16-pruned-A'
        if args.pruner == 'slim':
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['BatchNorm2d'],
            }]
        else:
            config_list = [{
                'sparsity': args.sparsity,
                'op_types': ['Conv2d'],
                'op_names': [
                    'feature.0', 'feature.24', 'feature.27', 'feature.30',
                    'feature.34', 'feature.37'
                ]
            }]

    pruner = pruner_cls(model, config_list, **kw_args)

    # Pruner.compress() returns the masked model
    model = pruner.compress()
    pruner.get_pruned_weights()

    # export the pruned model masks for model speedup
    model_path = os.path.join(
        args.experiment_data_dir,
        'pruned_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    mask_path = os.path.join(
        args.experiment_data_dir,
        'mask_{}_{}_{}.pth'.format(args.model, args.dataset, args.pruner))
    pruner.export_model(model_path=model_path, mask_path=mask_path)

    # NOTE(review): with --test_only execution still continues into
    # fine-tuning below -- confirm whether an early return is intended.
    if args.test_only:
        test(args, model, device, criterion, test_loader)

    if args.speed_up:
        # Unwrap all modules to normal state
        pruner._unwrap_model()
        m_speedup = ModelSpeedup(model, dummy_input, mask_path, device)
        m_speedup.speedup_model()

    print('start finetuning...')
    best_top1 = 0
    save_path = os.path.join(args.experiment_data_dir, f'finetuned.pth')
    for epoch in range(args.fine_tune_epochs):
        print('# Epoch {} #'.format(epoch))
        train(args, model, device, train_loader, criterion, optimizer, epoch)
        scheduler.step()
        top1 = test(args, model, device, criterion, test_loader)
        if top1 > best_top1:
            best_top1 = top1
            torch.save(model.state_dict(), save_path)

    flops, params, results = count_flops_params(model, dummy_input)
    print(
        f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_top1: .2f}'
    )

    if args.nni:
        nni.report_final_result(best_top1)
def main(args):
    """Prune a trained model with one of several (auto-)pruners, optionally
    speed it up and fine-tune, recording flops/params/accuracy to result.json.
    """
    # prepare dataset
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, criterion = get_data(args.dataset, args.data_dir,
                                                   args.batch_size,
                                                   args.test_batch_size)
    model, optimizer = get_trained_model_optimizer(args, device, train_loader,
                                                   val_loader, criterion)

    def short_term_fine_tuner(model, epochs=1):
        # Brief fine-tune used by NetAdaptPruner between pruning steps.
        for epoch in range(epochs):
            train(args, model, device, train_loader, criterion, optimizer,
                  epoch)

    def trainer(model, optimizer, criterion, epoch, callback):
        return train(args, model, device, train_loader, criterion, optimizer,
                     epoch=epoch, callback=callback)

    def evaluator(model):
        return test(model, device, criterion, val_loader)

    # used to save the performance of the original & pruned & finetuned models
    result = {'flops': {}, 'params': {}, 'performance':{}}

    flops, params, _ = count_flops_params(model, get_input_size(args.dataset))
    result['flops']['original'] = flops
    result['params']['original'] = params

    evaluation_result = evaluator(model)
    print('Evaluation result (original model): %s' % evaluation_result)
    result['performance']['original'] = evaluation_result

    # module types to prune, only "Conv2d" supported for channel pruning
    # NOTE(review): if base_algo is neither of these, op_types stays unbound
    # and the line below raises NameError -- presumably argparse restricts
    # the choices; confirm.
    if args.base_algo in ['l1', 'l2', 'fpgm']:
        op_types = ['Conv2d']
    elif args.base_algo == 'level':
        op_types = ['default']

    config_list = [{
        'sparsity': args.sparsity,
        'op_types': op_types
    }]
    dummy_input = get_dummy_input(args, device)

    # Select and construct the requested pruner.
    if args.pruner == 'L1FilterPruner':
        pruner = L1FilterPruner(model, config_list)
    elif args.pruner == 'L2FilterPruner':
        pruner = L2FilterPruner(model, config_list)
    elif args.pruner == 'FPGMPruner':
        pruner = FPGMPruner(model, config_list)
    elif args.pruner == 'NetAdaptPruner':
        pruner = NetAdaptPruner(model,
                                config_list,
                                short_term_fine_tuner=short_term_fine_tuner,
                                evaluator=evaluator,
                                base_algo=args.base_algo,
                                experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'ADMMPruner':
        # users are free to change the config here
        if args.model == 'LeNet':
            if args.base_algo in ['l1', 'l2', 'fpgm']:
                config_list = [{
                    'sparsity': 0.8,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_types': ['Conv2d'],
                    'op_names': ['conv2']
                }]
            elif args.base_algo == 'level':
                config_list = [{
                    'sparsity': 0.8,
                    'op_names': ['conv1']
                }, {
                    'sparsity': 0.92,
                    'op_names': ['conv2']
                }, {
                    'sparsity': 0.991,
                    'op_names': ['fc1']
                }, {
                    'sparsity': 0.93,
                    'op_names': ['fc2']
                }]
        else:
            raise ValueError('Example only implemented for LeNet.')
        pruner = ADMMPruner(model,
                            config_list,
                            trainer=trainer,
                            num_iterations=2,
                            training_epochs=2)
    elif args.pruner == 'SimulatedAnnealingPruner':
        pruner = SimulatedAnnealingPruner(
            model,
            config_list,
            evaluator=evaluator,
            base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate,
            experiment_data_dir=args.experiment_data_dir)
    elif args.pruner == 'AutoCompressPruner':
        pruner = AutoCompressPruner(
            model,
            config_list,
            trainer=trainer,
            evaluator=evaluator,
            dummy_input=dummy_input,
            num_iterations=3,
            optimize_mode='maximize',
            base_algo=args.base_algo,
            cool_down_rate=args.cool_down_rate,
            admm_num_iterations=30,
            admm_training_epochs=5,
            experiment_data_dir=args.experiment_data_dir)
    else:
        raise ValueError("Pruner not supported.")

    # Pruner.compress() returns the masked model
    # but for AutoCompressPruner, Pruner.compress() returns directly the pruned model
    model = pruner.compress()
    evaluation_result = evaluator(model)
    print('Evaluation result (masked model): %s' % evaluation_result)
    result['performance']['pruned'] = evaluation_result

    if args.save_model:
        pruner.export_model(
            os.path.join(args.experiment_data_dir, 'model_masked.pth'),
            os.path.join(args.experiment_data_dir, 'mask.pth'))
        print('Masked model saved to %s' % args.experiment_data_dir)

    # model speed up
    if args.speed_up:
        if args.pruner != 'AutoCompressPruner':
            # Rebuild a fresh model, reload masked weights, then speed up.
            if args.model == 'LeNet':
                model = LeNet().to(device)
            elif args.model == 'vgg16':
                model = VGG(depth=16).to(device)
            elif args.model == 'resnet18':
                model = ResNet18().to(device)
            elif args.model == 'resnet50':
                model = ResNet50().to(device)

            model.load_state_dict(
                torch.load(
                    os.path.join(args.experiment_data_dir,
                                 'model_masked.pth')))
            masks_file = os.path.join(args.experiment_data_dir, 'mask.pth')

            m_speedup = ModelSpeedup(model, dummy_input, masks_file, device)
            m_speedup.speedup_model()
            evaluation_result = evaluator(model)
            print('Evaluation result (speed up model): %s' % evaluation_result)
            result['performance']['speedup'] = evaluation_result

            torch.save(
                model.state_dict(),
                os.path.join(args.experiment_data_dir, 'model_speed_up.pth'))
            print('Speed up model saved to %s' % args.experiment_data_dir)
        flops, params, _ = count_flops_params(model,
                                              get_input_size(args.dataset))
        result['flops']['speedup'] = flops
        result['params']['speedup'] = params

    if args.fine_tune:
        # Per-dataset/model optimizer + LR schedule for fine-tuning.
        if args.dataset == 'mnist':
            optimizer = torch.optim.Adadelta(model.parameters(), lr=1)
            scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
        elif args.dataset == 'cifar10' and args.model == 'vgg16':
            optimizer = torch.optim.SGD(model.parameters(),
                                        lr=0.01,
                                        momentum=0.9,
                                        weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer,
                milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)],
                gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet18':
            optimizer = torch.optim.SGD(model.parameters(),
                                        lr=0.1,
                                        momentum=0.9,
                                        weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer,
                milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)],
                gamma=0.1)
        elif args.dataset == 'cifar10' and args.model == 'resnet50':
            optimizer = torch.optim.SGD(model.parameters(),
                                        lr=0.1,
                                        momentum=0.9,
                                        weight_decay=5e-4)
            scheduler = MultiStepLR(
                optimizer,
                milestones=[int(args.fine_tune_epochs*0.5), int(args.fine_tune_epochs*0.75)],
                gamma=0.1)
        best_acc = 0
        for epoch in range(args.fine_tune_epochs):
            train(args, model, device, train_loader, criterion, optimizer,
                  epoch)
            scheduler.step()
            acc = evaluator(model)
            if acc > best_acc:
                best_acc = acc
                torch.save(
                    model.state_dict(),
                    os.path.join(args.experiment_data_dir,
                                 'model_fine_tuned.pth'))

    # NOTE(review): best_acc is only assigned inside the fine_tune branch, so
    # these lines raise NameError when --fine_tune is off -- confirm intent.
    print('Evaluation result (fine tuned): %s' % best_acc)
    print('Fined tuned model saved to %s' % args.experiment_data_dir)
    result['performance']['finetuned'] = best_acc

    with open(os.path.join(args.experiment_data_dir, 'result.json'), 'w+') as f:
        json.dump(result, f)
def generate_tasks(self, task_result: TaskResult) -> List[Task]:
    """Consume the result of the previous pruning task and emit the next one.

    First (unless this is the 'origin' placeholder) the environment is
    stepped with the last action, the transition is stored, and — at episode
    end — the agent observes the final reward and updates its policy; the
    task log entry is enriched with sparsity/flops/params statistics.
    Then, while episodes remain, a new action is generated and wrapped into
    a new :class:`Task`.

    Returns
    -------
    List[Task]
        A single-element list with the next task, or an empty list when all
        episodes are done.
    """
    # append experience & update agent policy
    if task_result.task_id != 'origin':
        action, reward, observation, done = self.env.step(
            self.action, task_result.compact_model)
        self.T.append(
            [reward, self.observation, observation, self.action, done])
        self.observation = observation.copy()

        if done:
            # Episode finished: the final reward is derived from the task
            # score and propagated over every stored transition.
            final_reward = task_result.score - 1
            # agent observe and update policy
            for _, s_t, s_t1, a_t, d_t in self.T:
                self.agent.observe(final_reward, s_t, s_t1, a_t, d_t)
                if self.current_episode > self.warmup_episode:
                    self.agent.update_policy()
            # Reset per-episode state.
            self.current_episode += 1
            self.T = []
            self.action = None
            self.observation = None

        # update current2origin_sparsity in log file
        origin_model = torch.load(self._origin_model_path)
        compact_model = task_result.compact_model
        compact_model_masks = task_result.compact_model_masks
        current2origin_sparsity, _, _ = compute_sparsity(
            origin_model, compact_model, compact_model_masks,
            self.temp_config_list)
        self._tasks[task_result.task_id].state[
            'current2origin_sparsity'] = current2origin_sparsity
        current2origin_sparsity, _, _ = compute_sparsity(
            origin_model, compact_model, compact_model_masks,
            self.config_list_copy)
        self._tasks[task_result.task_id].state[
            'current_total_sparsity'] = current2origin_sparsity
        flops, params, _ = count_flops_params(compact_model,
                                              self.dummy_input,
                                              verbose=False)
        self._tasks[task_result.task_id].state['current_flops'] = '{:.2f} M'.format(
            flops / 1e6)
        self._tasks[task_result.task_id].state['current_params'] = '{:.2f} M'.format(
            params / 1e6)

    # generate new action
    if self.current_episode < self.total_episode:
        if self.observation is None:
            # Fresh episode: reset the environment and start again from the
            # original (unpruned) model and masks.
            self.observation = self.env.reset().copy()
            self.temp_config_list = []
            compact_model = torch.load(self._origin_model_path)
            compact_model_masks = torch.load(self._origin_masks_path)
        else:
            # Continue the episode from the model produced by the last task.
            compact_model = task_result.compact_model
            compact_model_masks = task_result.compact_model_masks

        # Random exploration during warm-up, policy actions afterwards.
        if self.current_episode <= self.warmup_episode:
            action = self.agent.random_action()
        else:
            action = self.agent.select_action(self.observation,
                                              episode=self.current_episode)
        action = action.tolist()[0]

        # Clip the raw action into the feasible sparsity range for the layer.
        self.action = self.env.correct_action(action, compact_model)
        sub_config_list = [{
            'op_names': [self.env.current_op_name],
            'total_sparsity': self.action
        }]
        self.temp_config_list.extend(sub_config_list)

        task_id = self._task_id_candidate
        if self.env.is_first_layer() or self.env.is_final_layer():
            task_config_list = self.temp_config_list
        else:
            task_config_list = sub_config_list
        config_list_path = Path(self._intermediate_result_dir,
                                '{}_config_list.json'.format(task_id))
        with Path(config_list_path).open('w') as f:
            json_tricks.dump(task_config_list, f, indent=4)

        # NOTE(review): the snapshot files are named after the *previous*
        # task id (task_result.task_id), not the new task_id -- presumably
        # intentional since they snapshot the incoming model; confirm.
        model_path = Path(
            self._intermediate_result_dir,
            '{}_compact_model.pth'.format(task_result.task_id))
        masks_path = Path(
            self._intermediate_result_dir,
            '{}_compact_model_masks.pth'.format(task_result.task_id))
        torch.save(compact_model, model_path)
        torch.save(compact_model_masks, masks_path)

        task = Task(task_id, model_path, masks_path, config_list_path)
        # Only the task that completes the episode gets finetuned/evaluated.
        if not self.env.is_final_layer():
            task.finetune = False
            task.evaluate = False

        self._tasks[task_id] = task
        self._task_id_candidate += 1
        return [task]
    else:
        return []
if __name__ == '__main__': # model = MobileNetV2(n_class=10).to(device) model = VGG().to(device) optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4) scheduler = MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1) criterion = torch.nn.CrossEntropyLoss() for i in range(100): trainer(model, optimizer, criterion, i) pre_best_acc = evaluator(model) dummy_input = torch.rand(10, 3, 32, 32).to(device) pre_flops, pre_params, _ = count_flops_params(model, dummy_input) config_list = [{ 'op_types': ['Conv2d'], 'total_sparsity': 0.5, 'max_sparsity_per_layer': 0.8 }] # if you just want to keep the final result as the best result, you can pass evaluator as None. # or the result with the highest score (given by evaluator) will be the best result. ddpg_params = { 'hidden1': 300, 'hidden2': 300, 'lr_c': 1e-3, 'lr_a': 1e-4, 'warmup': 100,