def configure(model, compress):
    model = model.cuda()
    compression_scheduler = distiller.CompressionScheduler(model)
    optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
    criterion = nn.CrossEntropyLoss()
    if compress:
        compression_scheduler = distiller.CompressionScheduler(model)
        distiller.config.file_config(model, optimizer, compress, compression_scheduler)

    parser = argparse.ArgumentParser()
    distiller.knowledge_distillation.add_distillation_args(parser)
    CONFIG_FILE = '.config_ipynb'
    if os.path.isfile(CONFIG_FILE):
        with open(CONFIG_FILE) as f:
            sys.argv = f.read().split()
    else:
        # argparse only accepts strings, so every argv value is passed as a string
        sys.argv = ['resnet.py',
                    '--kd-resume', 'net56_cifar.pth.tar',
                    '--kd-teacher', '',
                    '--kd-start-epoch', '0',
                    '--kd-student-wt', '0.5',
                    '--kd-teacher-wt', '0.0',
                    '--kd-distill-wt', '0.5']
    args = parser.parse_args()
    args.kd_policy = None
    epochs = 30
    if args.kd_teacher:
        if args.kd_resume:
            teacher = torch.load(args.kd_resume)
        # Create the distillation policy and add it to the scheduler
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=epochs,
                                         frequency=1)
    return model, compression_scheduler, epochs, optimizer, criterion, args
def test_load_gpu_model_on_cpu_with_thinning():
    # Issue #148
    # 1. create a GPU model and remove 50% of the filters in one of the layers (thinning)
    # 2. save the thinned model in a checkpoint file
    # 3. load the checkpoint and place it on the CPU
    gpu_model = create_model(False, 'cifar10', 'resnet20_cifar')
    conv_pname = "module.layer1.0.conv1.weight"
    conv_p = distiller.model_find_param(gpu_model, conv_pname)
    pruner = distiller.pruning.L1RankedStructureParameterPruner("test_pruner",
                                                                group_type="Filters",
                                                                desired_sparsity=0.5,
                                                                weights=conv_pname)
    zeros_mask_dict = distiller.create_model_masks_dict(gpu_model)
    pruner.set_param_mask(conv_p, conv_pname, zeros_mask_dict, meta=None)

    # Use the mask to prune
    zeros_mask_dict[conv_pname].apply_mask(conv_p)
    distiller.remove_filters(gpu_model, zeros_mask_dict, 'resnet20_cifar', 'cifar10', optimizer=None)
    assert hasattr(gpu_model, 'thinning_recipes')

    scheduler = distiller.CompressionScheduler(gpu_model)
    save_checkpoint(epoch=0, arch='resnet20_cifar', model=gpu_model, scheduler=scheduler, optimizer=None)

    CPU_DEVICE_ID = -1
    cpu_model = create_model(False, 'cifar10', 'resnet20_cifar', device_ids=CPU_DEVICE_ID)
    load_checkpoint(cpu_model, "checkpoint.pth.tar")
    assert distiller.model_device(cpu_model) == 'cpu'
def test_policy_scheduling():
    model = create_model(False, 'cifar10', 'resnet20_cifar')
    scheduler = distiller.CompressionScheduler(model)
    policy = distiller.PruningPolicy(None, None)
    with pytest.raises(AssertionError):
        scheduler.add_policy(policy)
    with pytest.raises(AssertionError):
        # Test for mutually-exclusive configuration
        scheduler.add_policy(policy, epochs=[1, 2, 3], starting_epoch=4, ending_epoch=5, frequency=1)
    scheduler.add_policy(policy, epochs=None, starting_epoch=4, ending_epoch=5, frequency=1)

    # Regression test for issue #176 - https://github.com/NervanaSystems/distiller/issues/176
    scheduler.add_policy(policy, epochs=[1, 2, 3])
    sched_metadata = scheduler.sched_metadata[policy]
    assert sched_metadata['starting_epoch'] == 1
    assert sched_metadata['ending_epoch'] == 4
    assert sched_metadata['frequency'] is None

    scheduler.add_policy(policy, epochs=[5])
    sched_metadata = scheduler.sched_metadata[policy]
    assert sched_metadata['starting_epoch'] == 5
    assert sched_metadata['ending_epoch'] == 6
    assert sched_metadata['frequency'] is None
def load_checkpoint(model, chkpt_file, optimizer=None):
    """Load a pytorch training checkpoint

    Args:
        model: the pytorch model to which we will load the parameters
        chkpt_file: the checkpoint file
        optimizer: the optimizer to which we will load the serialized state
    """
    compression_scheduler = None
    start_epoch = 0
    if os.path.isfile(chkpt_file):
        msglogger.info("=> loading checkpoint %s", chkpt_file)
        checkpoint = torch.load(chkpt_file, map_location=lambda storage, loc: storage)
        msglogger.info("Checkpoint keys:\n{}".format("\n\t".join(k for k in checkpoint.keys())))
        start_epoch = checkpoint['epoch'] + 1
        best_top1 = checkpoint.get('best_top1', None)
        if best_top1 is not None:
            msglogger.info("   best top@1: %.3f", best_top1)

        if 'compression_sched' in checkpoint:
            compression_scheduler = distiller.CompressionScheduler(model)
            compression_scheduler.load_state_dict(checkpoint['compression_sched'])
            msglogger.info("Loaded compression schedule from checkpoint (epoch %d)",
                           checkpoint['epoch'])
        else:
            msglogger.info("Warning: compression schedule data does not exist in the checkpoint")

        if 'thinning_recipes' in checkpoint:
            if 'compression_sched' not in checkpoint:
                raise KeyError("Found thinning_recipes key, but missing mandatory key compression_sched")
            msglogger.info("Loaded a thinning recipe from the checkpoint")
            # Cache the recipes in case we need them later
            model.thinning_recipes = checkpoint['thinning_recipes']
            distiller.execute_thinning_recipes_list(model,
                                                    compression_scheduler.zeros_mask_dict,
                                                    model.thinning_recipes)

        if 'quantizer_metadata' in checkpoint:
            msglogger.info('Loaded quantizer metadata from the checkpoint')
            qmd = checkpoint['quantizer_metadata']
            quantizer = qmd['type'](model, **qmd['params'])
            quantizer.prepare_model()

        msglogger.info("=> loaded checkpoint '%s' (epoch %d)", chkpt_file, checkpoint['epoch'])
        model.load_state_dict(checkpoint['state_dict'])
        return model, compression_scheduler, start_epoch
    else:
        raise IOError(ENOENT, 'Could not find a checkpoint file at', chkpt_file)
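# Hedged usage sketch (added for illustration; not part of the original sources): how the
# load_checkpoint() variant above is typically driven. It assumes the same imports and
# helpers as the surrounding snippets, and that 'checkpoint.pth.tar' was produced by
# save_checkpoint() for a matching resnet20_cifar model; the file name is an assumption.
def _example_resume_from_checkpoint():
    model = create_model(False, 'cifar10', 'resnet20_cifar')
    model, compression_scheduler, start_epoch = load_checkpoint(model, 'checkpoint.pth.tar')
    if compression_scheduler is None:
        # The checkpoint carried no serialized schedule; start with an empty one
        compression_scheduler = distiller.CompressionScheduler(model)
    return model, compression_scheduler, start_epoch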
def create_scheduler(self):
    scheduler = distiller.CompressionScheduler(self.model)
    masks = {param_name: masker.mask
             for param_name, masker in self.zeros_mask_dict.items()}
    scheduler.load_state_dict(state={'masks_dict': masks})
    return scheduler
def create_scheduler(model, zeros_mask_dict):
    scheduler = distiller.CompressionScheduler(model)
    masks = {param_name: masker.mask
             for param_name, masker in zeros_mask_dict.items()}
    scheduler.load_state_dict(state={"masks_dict": masks})
    return scheduler
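# Hedged usage sketch (illustrative; not from the original sources): rebuilding a
# CompressionScheduler from an existing zeros_mask_dict is mainly useful so that
# save_checkpoint() serializes the pruning masks along with the weights. The
# 'resnet20_cifar' arch string is an assumption.
def _example_serialize_masks(model, zeros_mask_dict):
    # Rebuild a scheduler that holds the current masks, then checkpoint it so the
    # masks travel with the model weights.
    scheduler = create_scheduler(model, zeros_mask_dict)
    save_checkpoint(epoch=0, arch='resnet20_cifar', model=model,
                    scheduler=scheduler, optimizer=None)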
def _init_learner(args):
    # Create the model
    model = create_model(args.pretrained, args.dataset, args.arch,
                         parallel=not args.load_serialized, device_ids=args.gpus)
    compression_scheduler = None

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning('The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.')
        if not args.reset_optimizer:
            msglogger.warning('If you wish to also reset the optimizer, call with: --reset-optimizer')
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    optimizer = None
    start_epoch = 0
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model, args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info('\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0')

    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
        msglogger.debug('Optimizer Type: %s', type(optimizer))
        msglogger.debug('Optimizer Args: %s', optimizer.defaults)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    return model, compression_scheduler, optimizer, start_epoch, args.epochs
def save_checkpoint(self, is_best=False):
    # Save the learned-model checkpoint
    scheduler = distiller.CompressionScheduler(self.model)
    masks = {param_name: masker.mask
             for param_name, masker in self.zeros_mask_dict.items()}
    scheduler.load_state_dict(state={'masks_dict': masks})

    episode = self.debug_stats['episode']
    episode = str(episode).zfill(3)
    if is_best:
        name = "BEST_adc_episode_{}".format(episode)
    else:
        name = "adc_episode_{}".format(episode)

    self.save_checkpoint_fn(epoch=self.debug_stats['episode'], model=self.model,
                            scheduler=scheduler, name=name)
def load_checkpoint(model, chkpt_file, optimizer=None):
    """Load a pytorch training checkpoint

    Args:
        model: the pytorch model to which we will load the parameters
        chkpt_file: the checkpoint file
        optimizer: the optimizer to which we will load the serialized state
    """
    compression_scheduler = None
    start_epoch = 0
    if os.path.isfile(chkpt_file):
        msglogger.info("=> loading checkpoint %s", chkpt_file)
        checkpoint = torch.load(chkpt_file)
        start_epoch = checkpoint['epoch'] + 1
        best_top1 = checkpoint.get('best_top1', None)
        if best_top1 is not None:
            msglogger.info("   best top@1: %.3f", best_top1)

        if 'compression_sched' in checkpoint:
            compression_scheduler = distiller.CompressionScheduler(model)
            compression_scheduler.load_state_dict(checkpoint['compression_sched'])
            msglogger.info("Loaded compression schedule from checkpoint (epoch %d)",
                           checkpoint['epoch'])
            if 'thinning_recipes' in checkpoint:
                if 'compression_sched' not in checkpoint:
                    raise KeyError("Found thinning_recipes key, but missing mandatory key compression_sched")
                msglogger.info("Loaded a thinning recipe from the checkpoint")
                # Cache the recipes in case we need them later
                model.thinning_recipes = checkpoint['thinning_recipes']
                distiller.execute_thinning_recipes_list(model,
                                                        compression_scheduler.zeros_mask_dict,
                                                        model.thinning_recipes)
        else:
            msglogger.info("Warning: compression schedule data does not exist in the checkpoint")

        msglogger.info("=> loaded checkpoint '%s' (epoch %d)", chkpt_file, checkpoint['epoch'])
        model.load_state_dict(checkpoint['state_dict'])
        return model, compression_scheduler, start_epoch
    else:
        msglogger.info("Error: no checkpoint found at %s", chkpt_file)
        exit(1)
def __init__(self, **kwargs):
    opt._parse(kwargs)
    self.opt = opt
    self.model = getattr(models, self.opt.model)()
    self.criterion = t.nn.CrossEntropyLoss().to(self.opt.device)
    # Common loss functions:
    # 1. Hinge loss: mainly used in support vector machines (SVMs);
    # 2. Cross-entropy loss (softmax loss): used in logistic regression and softmax classification;
    # 3. Square loss: mainly used in ordinary least squares (OLS);
    # 4. Exponential loss: mainly used in the AdaBoost ensemble algorithm;
    # 5. Others (e.g. 0-1 loss, absolute-value loss)
    self.optimizer = self.model.get_optimizer(self.opt.lr, self.opt.weight_decay)
    self.compression_scheduler = distiller.CompressionScheduler(self.model)
    self.train_losses = AverageMeter()  # loss meter
    self.train_top1 = AverageMeter()    # top-1 accuracy meter
    self.train_top5 = AverageMeter()    # top-5 accuracy meter
    self.best_precision = 0             # best accuracy so far
    self.start_epoch = 0
    self.train_writer = None
    self.value_writer = None
def step(self, action):
    """Take a step, given an action.

    This is invoked by the Agent.
    """
    layer_macs = self.get_macs(self.current_layer())
    if action > 0:
        actual_action = self.__remove_channels(self.current_layer_id, action)
    else:
        actual_action = 0
    layer_macs_after_action = self.get_macs(self.current_layer())

    # Update the various counters after taking the step
    self.current_layer_id += 1
    next_layer_macs = self.get_macs(self.current_layer())
    self._removed_macs += (layer_macs - layer_macs_after_action)
    self._remaining_macs -= next_layer_macs
    #self.prev_action = actual_action

    if self.episode_is_done():
        observation = self.get_final_obs()
        reward = self.compute_reward()
        # Save the learned-model checkpoint
        scheduler = distiller.CompressionScheduler(self.model)
        scheduler.load_state_dict(state={'masks_dict': self.zeros_mask_dict})
        self.save_checkpoint_fn(epoch=self.debug_stats['episode'],
                                model=self.model, scheduler=scheduler)
        self.debug_stats['episode'] += 1
    else:
        observation = self._get_obs(next_layer_macs)
        if True:
            # Intermediate steps currently earn no reward; only the episode end does
            reward = 0
        else:
            reward = self.compute_reward()

    self.prev_action = actual_action
    info = {}
    return observation, reward, self.episode_is_done(), info
def arbitrary_channel_pruning(config, channels_to_remove):
    """Test removal of arbitrary channels.

    The test receives a specification of channels to remove.
    Based on this specification, the channels are pruned and then
    physically removed from the model (via a "thinning" process).
    """
    model, zeros_mask_dict = common.setup_test(config.arch, config.dataset)

    conv2 = common.find_module_by_name(model, config.conv2_name)
    assert conv2 is not None

    # Test that we can access the weights tensor of the first convolution in layer 1
    conv2_p = distiller.model_find_param(model, config.conv2_name + ".weight")
    assert conv2_p is not None
    assert conv2_p.dim() == 4
    num_filters = conv2_p.size(0)
    num_channels = conv2_p.size(1)
    kernel_height = conv2_p.size(2)
    kernel_width = conv2_p.size(3)
    cnt_nnz_channels = num_channels - len(channels_to_remove)

    # Let's build our 4D mask.
    # We start with a 1D mask of channels, with all but our specified channels set to one
    channels = torch.ones(num_channels)
    for ch in channels_to_remove:
        channels[ch] = 0

    # Now let's expand back up to a 4D mask
    mask = channels.expand(num_filters, num_channels)
    mask.unsqueeze_(-1)
    mask.unsqueeze_(-1)
    mask = mask.expand(num_filters, num_channels, kernel_height, kernel_width).contiguous()

    assert mask.shape == conv2_p.shape
    assert distiller.density_ch(mask) == (conv2.in_channels - len(channels_to_remove)) / conv2.in_channels

    # Cool, so now we have a mask for pruning our channels.
    # Use the mask to prune
    zeros_mask_dict[config.conv2_name + ".weight"].mask = mask
    zeros_mask_dict[config.conv2_name + ".weight"].apply_mask(conv2_p)
    all_channels = set([ch for ch in range(num_channels)])
    nnz_channels = set(distiller.find_nonzero_channels_list(conv2_p, config.conv2_name + ".weight"))
    channels_removed = all_channels - nnz_channels
    logger.info("Channels removed {}".format(channels_removed))

    # Now, let's do the actual network thinning
    distiller.remove_channels(model, zeros_mask_dict, config.arch, config.dataset)
    conv1 = common.find_module_by_name(model, config.conv1_name)
    logger.info(conv1)
    logger.info(conv2)
    assert conv1.out_channels == cnt_nnz_channels
    assert conv2.in_channels == cnt_nnz_channels
    assert conv1.weight.size(0) == cnt_nnz_channels
    assert conv2.weight.size(1) == cnt_nnz_channels
    if config.bn_name is not None:
        bn1 = common.find_module_by_name(model, config.bn_name)
        assert bn1.running_var.size(0) == cnt_nnz_channels
        assert bn1.running_mean.size(0) == cnt_nnz_channels
        assert bn1.num_features == cnt_nnz_channels
        assert bn1.bias.size(0) == cnt_nnz_channels
        assert bn1.weight.size(0) == cnt_nnz_channels

    # Let's test saving and loading a thinned model.
    # We save 3 times, and load twice, to make sure to cover some corner cases:
    #  - Make sure that after loading, the model still has hold of the thinning recipes
    #  - Make sure that after a 2nd load, there is no problem loading (in this case the
    #    tensors are already thin, so this is a new flow)
    # (1)
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None)
    model_2 = create_model(False, config.dataset, config.arch, parallel=False)
    dummy_input = torch.randn(1, 3, 32, 32)
    model(dummy_input)
    model_2(dummy_input)
    conv2 = common.find_module_by_name(model_2, config.conv2_name)
    assert conv2 is not None
    with pytest.raises(KeyError):
        model_2, compression_scheduler, start_epoch = load_checkpoint(model_2, 'checkpoint.pth.tar')

    compression_scheduler = distiller.CompressionScheduler(model)
    hasattr(model, 'thinning_recipes')

    # (2)
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, start_epoch = load_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done")

    # (3)
    save_checkpoint(epoch=0, arch=config.arch, model=model_2, optimizer=None,
                    scheduler=compression_scheduler)
    model_2, compression_scheduler, start_epoch = load_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done 2")
def train(**kwargs):
    opt._parse(kwargs)
    train_writer = None
    value_writer = None
    if opt.vis:
        train_writer = SummaryWriter(log_dir='./runs/train_' + datetime.now().strftime('%y%m%d-%H-%M-%S'))
        value_writer = SummaryWriter(log_dir='./runs/val_' + datetime.now().strftime('%y%m%d-%H-%M-%S'))
    previous_loss = 1e10      # loss of the previous epoch
    best_precision = 0        # best accuracy so far
    start_epoch = 0
    lr = opt.lr
    perf_scores_history = []  # performance-score history

    # step 1: criterion and optimizer
    # Common loss functions:
    # 1. Hinge loss: mainly used in support vector machines (SVMs);
    # 2. Cross-entropy loss (softmax loss): used in logistic regression and softmax classification;
    # 3. Square loss: mainly used in ordinary least squares (OLS);
    # 4. Exponential loss: mainly used in the AdaBoost ensemble algorithm;
    # 5. Others (e.g. 0-1 loss, absolute-value loss)
    criterion = t.nn.CrossEntropyLoss().to(opt.device)  # loss function

    # step 2: meters
    train_losses = AverageMeter()  # loss meter
    train_top1 = AverageMeter()    # top-1 accuracy meter
    train_top5 = AverageMeter()    # top-5 accuracy meter
    pylogger = PythonLogger(msglogger)

    # step 3: configure model
    model = getattr(models, opt.model)()  # build the network
    compression_scheduler = distiller.CompressionScheduler(model)
    optimizer = model.get_optimizer(lr, opt.weight_decay)  # optimizer
    if opt.load_model_path:
        # # Load all tensors onto the CPU
        # t.load(opt.load_model_path, map_location=lambda storage, loc: storage)
        # t.load(opt.load_model_path, map_location='cpu')
        # # Load all tensors onto GPU 1
        # t.load(opt.load_model_path, map_location=lambda storage, loc: storage.cuda(1))
        # # Move tensors from GPU 1 to GPU 0
        # t.load(opt.load_model_path, map_location={'cuda:1': 'cuda:0'})
        checkpoint = t.load(opt.load_model_path)
        start_epoch = checkpoint["epoch"]
        # compression_scheduler.load_state_dict(checkpoint['compression_scheduler'], False)
        best_precision = checkpoint["best_precision"]
        model.load_state_dict(checkpoint["state_dict"])
        optimizer = checkpoint['optimizer']
    model.to(opt.device)  # move the model to the GPU

    if opt.compress:
        compression_scheduler = distiller.file_config(model, optimizer, opt.compress,
                                                      compression_scheduler)  # load the pruning schedule
        model.to(opt.device)

    # learning-rate scheduler
    lr_scheduler = get_scheduler(optimizer, opt)

    # step 4: data
    train_data = DatasetFromFilename(opt.data_root, flag='train')  # training set
    val_data = DatasetFromFilename(opt.data_root, flag='test')     # validation set
    train_dataloader = DataLoader(train_data, opt.batch_size, shuffle=True,
                                  num_workers=opt.num_workers)  # training-set loader
    val_dataloader = DataLoader(val_data, opt.batch_size, shuffle=True,
                                num_workers=opt.num_workers)    # validation-set loader

    # train
    for epoch in range(start_epoch, opt.max_epoch):
        model.train()
        if opt.pruning:
            compression_scheduler.on_epoch_begin(epoch)  # pruning hook at epoch start
        train_losses.reset()  # reset the meters
        train_top1.reset()
        # print('training set size', len(train_dataloader))
        total_samples = len(train_dataloader.sampler)
        steps_per_epoch = math.ceil(total_samples / opt.batch_size)
        train_progressor = ProgressBar(mode="Train  ", epoch=epoch, total_epoch=opt.max_epoch,
                                       model_name=opt.model, lr=lr, total=len(train_dataloader))
        lr = lr_scheduler.get_lr()[0]
        for ii, (data, labels, img_path, tag) in enumerate(train_dataloader):
            if not check_date(img_path, tag, msglogger):
                return
            if opt.pruning:
                compression_scheduler.on_minibatch_begin(epoch, ii, steps_per_epoch,
                                                         optimizer)  # pruning hook at batch start
            train_progressor.current = ii + 1  # current progress within the training set

            # train model
            input = data.to(opt.device)
            target = labels.to(opt.device)
            if train_writer:
                grid = make_grid((input.data.cpu() * 0.225 + 0.45).clamp(min=0, max=1))
                train_writer.add_image('train_images', grid, ii * (epoch + 1))  # training images
            score = model(input)              # network output
            loss = criterion(score, target)   # compute the loss

            if opt.pruning:
                # Before running the backward phase, we allow the scheduler to modify the loss
                # (e.g. add regularization loss)
                agg_loss = compression_scheduler.before_backward_pass(epoch, ii, steps_per_epoch,
                                                                      loss, optimizer=optimizer,
                                                                      return_loss_components=True)  # pruning-adjusted loss
                loss = agg_loss.overall_loss
                train_losses.update(loss.item(), input.size(0))
            # loss = criterion(score[0], target)  # loss for the Inception3 network

            optimizer.zero_grad()  # zero the parameter gradients
            loss.backward()        # backward pass
            optimizer.step()       # update parameters

            if opt.pruning:
                compression_scheduler.on_minibatch_end(epoch, ii, steps_per_epoch,
                                                       optimizer)  # pruning hook at batch end

            precision1_train, precision5_train = accuracy(score, target, topk=(1, 5))  # top-1 and top-5 accuracy
            # writer.add_graph(model, input)
            # precision1_train, precision2_train = accuracy(score[0], target, topk=(1, 2))  # Inception3
            train_losses.update(loss.item(), input.size(0))
            train_top1.update(precision1_train[0].item(), input.size(0))
            train_top5.update(precision5_train[0].item(), input.size(0))
            train_progressor.current_loss = train_losses.avg
            train_progressor.current_top1 = train_top1.avg
            train_progressor.current_top5 = train_top5.avg
            train_progressor()  # print progress
            if ii % opt.print_freq == 0:
                if train_writer:
                    train_writer.add_scalar('loss', train_losses.avg, ii * (epoch + 1))  # training loss
                    train_writer.add_text('top1', 'train accuracy top1 %s' % train_top1.avg,
                                          ii * (epoch + 1))  # top-1 accuracy as text
                    train_writer.add_scalars('accuracy', {'top1': train_top1.avg,
                                                          'top5': train_top5.avg,
                                                          'loss': train_losses.avg},
                                             ii * (epoch + 1))
        # train_progressor.done()  # save the training results to a txt file

        # validate and visualize
        if opt.pruning:
            distiller.log_weights_sparsity(model, epoch, loggers=[pylogger])  # log pruning results
            compression_scheduler.on_epoch_end(epoch, optimizer)  # pruning hook at epoch end
        val_loss, val_top1, val_top5 = val(model, criterion, val_dataloader, epoch,
                                           value_writer, lr)  # validate the model

        sparsity = distiller.model_sparsity(model)
        perf_scores_history.append(distiller.MutableNamedTuple({
            'sparsity': sparsity, 'top1': val_top1, 'top5': val_top5,
            'epoch': epoch + 1, 'lr': lr, 'loss': val_loss}))
        # Keep the performance-score history sorted from best to worst:
        # sort by sparsity as the primary key, then top1, top5 and epoch.
        perf_scores_history.sort(key=operator.attrgetter('sparsity', 'top1', 'top5', 'epoch'),
                                 reverse=True)
        for score in perf_scores_history[:1]:
            msglogger.info('==> Best [Top1: %.3f   Top5: %.3f   Sparsity: %.2f on epoch: %d   Lr: %f   Loss: %f]',
                           score.top1, score.top5, score.sparsity, score.epoch, lr, score.loss)

        best_precision = max(perf_scores_history[0].top1, best_precision)  # best top-1 accuracy
        is_best = epoch + 1 == perf_scores_history[0].epoch  # the current epoch is the best epoch
        if is_best:
            model.save({
                "epoch": epoch + 1,
                "model_name": opt.model,
                "state_dict": model.state_dict(),
                "best_precision": best_precision,
                "optimizer": optimizer,
                "valid_loss": [val_loss, val_top1, val_top5],
                'compression_scheduler': compression_scheduler.state_dict(),
            })  # save the model

        # update learning rate
        lr_scheduler.step(epoch)
        # If the training loss grew relative to the previous epoch, decay the learning rate:
        # if train_losses.val > previous_loss:
        #     lr = lr * opt.lr_decay
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr
        # previous_loss = train_losses.val
        t.cuda.empty_cache()  # release unused cached GPU memory
def main():
    global msglogger
    check_pytorch_version()
    args = parser.parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                         args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(sys.argv, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_epochs = [distiller.MutableNamedTuple({'epoch': 0, 'top1': 0, 'sparsity': 0})
                   for i in range(args.num_best_scores)]

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error('ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1')
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
        cudnn.deterministic = True
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error('ERROR: Argument --gpus must be a comma-separated list of integers only')
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    if 'cinic' in args.arch:
        args.dataset = 'cinic10'
    else:
        args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset in ['cifar10', 'cinic10'] else 1000

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    #model = create_model(args.pretrained, args.dataset, args.arch,
    #                     parallel=not args.load_serialized, device_ids=args.gpus)
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)  # Get arch state_dict
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s', args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    if args.resume:
        #model, compression_scheduler, start_epoch = apputils.load_checkpoint(
        #    model, chkpt_file=args.resume)
        # Load Pre-trained Model
        chkpt_file = args.resume
        print("=> loading checkpoint %s" % chkpt_file)
        checkpoint = torch.load(chkpt_file)
        model.load_state_dict(checkpoint['net'])

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.ADC:
        return automated_deep_compression(model, criterion, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_size, args.deterministic)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))

    activations_collectors = create_activation_stats_collectors(model, collection_phase=args.activation_stats)

    if args.sensitivity is not None:
        return sensitivity_analysis(model, criterion, test_loader, pylogger, args)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger, activations_collectors, args)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer, args.compress)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.cuda()
    else:
        compression_scheduler = distiller.CompressionScheduler(model)

    args.kd_policy = None
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus)
        if args.kd_resume:
            teacher, _, _ = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs, frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
                  loggers=[tflogger, pylogger], args=args)
            distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch)
            distiller.log_activation_statsitics(epoch, "valid", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1), ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0,
                                        total_steps=1, log_freq=1, loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # remember best top1 and save checkpoint
        #sparsity = distiller.model_sparsity(model)
        is_best = top1 > best_epochs[0].top1
        if is_best:
            best_epochs[0].epoch = epoch
            best_epochs[0].top1 = top1
            #best_epoch.sparsity = sparsity
            best_epochs = sorted(best_epochs, key=lambda score: score.top1)
        for score in reversed(best_epochs):
            if score.top1 > 0:
                msglogger.info('==> Best Top1: %.3f on Epoch: %d', score.top1, score.epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler,
                                 best_epochs[0].top1, is_best, args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], activations_collectors, args=args)
def load_checkpoint(model, chkpt_file, optimizer=None,
                    model_device=None, lean_checkpoint=False, strict=False):
    """Load a pytorch training checkpoint.

    Args:
        model: the pytorch model to which we will load the parameters.  You can
          specify model=None if the checkpoint contains enough metadata to infer
          the model.  The order of the arguments is misleading and clunky, and is
          kept this way for backward compatibility.
        chkpt_file: the checkpoint file
        lean_checkpoint: if set, read into model only 'state_dict' field
        optimizer: [deprecated argument]
        model_device [str]: if set, call model.to($model_device)
          This should be set to either 'cpu' or 'cuda'.
    :returns: updated model, compression_scheduler, optimizer, start_epoch
    """
    def _load_compression_scheduler():
        normalize_keys = False
        try:
            compression_scheduler.load_state_dict(checkpoint['compression_sched'], normalize_keys)
        except KeyError as e:
            # A very common source of this KeyError is loading a GPU model on the CPU.
            # We rename all of the DataParallel keys because DataParallel does not execute on the CPU.
            normalize_keys = True
            compression_scheduler.load_state_dict(checkpoint['compression_sched'], normalize_keys)
        msglogger.info("Loaded compression schedule from checkpoint (epoch {})".format(checkpoint_epoch))
        return normalize_keys

    def _load_and_execute_thinning_recipes():
        msglogger.info("Loaded a thinning recipe from the checkpoint")
        # Cache the recipes in case we need them later
        model.thinning_recipes = checkpoint['thinning_recipes']
        if normalize_dataparallel_keys:
            model.thinning_recipes = [distiller.get_normalized_recipe(recipe)
                                      for recipe in model.thinning_recipes]
        distiller.execute_thinning_recipes_list(model,
                                                compression_scheduler.zeros_mask_dict,
                                                model.thinning_recipes)

    def _load_optimizer():
        """Initialize optimizer with model parameters and load src_state_dict"""
        try:
            cls, src_state_dict = checkpoint['optimizer_type'], checkpoint['optimizer_state_dict']
            # Initialize the dest_optimizer with a dummy learning rate,
            # this is required to support SGD.__init__()
            dest_optimizer = cls(model.parameters(), lr=1)
            dest_optimizer.load_state_dict(src_state_dict)
            msglogger.info('Optimizer of type {type} was loaded from checkpoint'.format(
                type=type(dest_optimizer)))
            optimizer_param_groups = dest_optimizer.state_dict()['param_groups']
            msglogger.info('Optimizer Args: {}'.format(
                dict((k, v) for k, v in optimizer_param_groups[0].items() if k != 'params')))
            return dest_optimizer
        except KeyError:
            # Older checkpoints do not support optimizer loading: they either had an
            # 'optimizer' field (different name) which was not used during the load,
            # or they didn't even checkpoint the optimizer.
            msglogger.warning('Optimizer could not be loaded from checkpoint.')
            return None

    def _create_model_from_ckpt():
        try:
            return distiller.models.create_model(False, checkpoint['dataset'], checkpoint['arch'],
                                                 checkpoint['is_parallel'], device_ids=None)
        except KeyError:
            return None

    def _sanity_check():
        try:
            if model.arch != checkpoint["arch"]:
                raise ValueError("The model architecture does not match the checkpoint architecture")
        except (NameError, KeyError):
            # One of the values is missing so we can't perform the comparison
            pass

    if not os.path.isfile(chkpt_file):
        raise IOError(ENOENT, 'Could not find a checkpoint file at', chkpt_file)
    assert optimizer is None, "argument optimizer is deprecated and must be set to None"

    msglogger.info("=> loading checkpoint %s", chkpt_file)
    checkpoint = torch.load(chkpt_file, map_location=lambda storage, loc: storage)
    msglogger.info('=> Checkpoint contents:\n%s\n' % get_contents_table(checkpoint))
    if 'extras' in checkpoint:
        msglogger.info("=> Checkpoint['extras'] contents:\n{}\n".format(
            get_contents_table(checkpoint['extras'])))

    if 'state_dict' not in checkpoint:
        raise ValueError("Checkpoint must contain the model parameters under the key 'state_dict'")

    if not model:
        model = _create_model_from_ckpt()
        if not model:
            raise ValueError("You didn't provide a model, and the checkpoint %s doesn't contain "
                             "enough information to create one", chkpt_file)

    checkpoint_epoch = checkpoint.get('epoch', None)
    start_epoch = checkpoint_epoch + 1 if checkpoint_epoch is not None else 0

    compression_scheduler = None
    normalize_dataparallel_keys = False
    if 'compression_sched' in checkpoint:
        compression_scheduler = distiller.CompressionScheduler(model)
        normalize_dataparallel_keys = _load_compression_scheduler()
    else:
        msglogger.info("Warning: compression schedule data does not exist in the checkpoint")

    if 'thinning_recipes' in checkpoint:
        if not compression_scheduler:
            msglogger.warning("Found thinning_recipes key, but missing key compression_scheduler")
            compression_scheduler = distiller.CompressionScheduler(model)
        _load_and_execute_thinning_recipes()

    if 'quantizer_metadata' in checkpoint:
        msglogger.info('Loaded quantizer metadata from the checkpoint')
        qmd = checkpoint['quantizer_metadata']
        quantizer = qmd['type'](model, **qmd['params'])
        quantizer.prepare_model(qmd['dummy_input'])

    if normalize_dataparallel_keys:
        checkpoint['state_dict'] = {normalize_module_name(k): v
                                    for k, v in checkpoint['state_dict'].items()}
    anomalous_keys = model.load_state_dict(checkpoint['state_dict'], strict)
    if anomalous_keys:
        # This is pytorch 1.1+
        missing_keys, unexpected_keys = anomalous_keys
        if unexpected_keys:
            msglogger.warning("Warning: the loaded checkpoint (%s) contains %d unexpected state keys"
                              % (chkpt_file, len(unexpected_keys)))
        if missing_keys:
            raise ValueError("The loaded checkpoint (%s) is missing %d state keys"
                             % (chkpt_file, len(missing_keys)))

    if model_device is not None:
        model.to(model_device)

    if lean_checkpoint:
        msglogger.info("=> loaded 'state_dict' from checkpoint '{}'".format(str(chkpt_file)))
        return model, None, None, 0

    optimizer = _load_optimizer()
    msglogger.info("=> loaded checkpoint '{f}' (epoch {e})".format(f=str(chkpt_file), e=checkpoint_epoch))
    _sanity_check()
    return model, compression_scheduler, optimizer, start_epoch
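# Hedged usage sketch (added for illustration; not from the original sources): how the
# four return values of the full load_checkpoint() above are typically consumed when
# resuming training. The checkpoint path and learning-rate values are assumptions.
def _example_full_resume():
    model = distiller.models.create_model(False, 'cifar10', 'resnet20_cifar')
    model, compression_scheduler, optimizer, start_epoch = load_checkpoint(
        model, 'checkpoint.pth.tar', model_device='cuda')
    if optimizer is None:
        # Older checkpoints may not carry optimizer state; create a fresh one
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    return model, compression_scheduler, optimizer, start_epoch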
def create_scheduler(self):
    scheduler = distiller.CompressionScheduler(self.model, self.zeros_mask_dict)
    return scheduler
def main():
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                         args.name, args.output_dir, args.verbose)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(
        filter(None, [args.compress, args.qe_stats_file]),  # remove both None and empty strings
        msglogger.logdir, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        distiller.set_deterministic(args.seed)  # For experiment reproducibility
    else:
        if args.seed is not None:
            distiller.set_seed(args.seed)
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError('ERROR: Argument --gpus must be a comma-separated list of integers only')
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                     .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = distiller.apputils.classification_dataset_str_from_arch(args.arch)
    args.num_classes = distiller.apputils.classification_num_classes(args.dataset)

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model, config = create_model(args.pretrained, args.dataset, args.arch,
                                 parallel=not args.load_serialized, device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s', args.earlyexit_thresholds)

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning('The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.')
        if not args.reset_optimizer:
            msglogger.warning('If you wish to also reset the optimizer, call with: --reset-optimizer')
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model, args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info('\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0')

    # Define loss function (criterion)
    if "ssd" in args.arch:
        neg_pos_ratio = 3
        criterion = MultiboxLoss(config.priors, iou_threshold=0.5, neg_pos_ratio=neg_pos_ratio,
                                 center_variance=0.1, size_variance=0.2, device=args.device,
                                 reduction="sum", class_reduction=True, verbose=0)
    else:
        criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is None:
        if "ssd" in args.arch:
            base_net_lr = args.lr
            extra_layers_lr = args.lr
            params = [
                {'params': model.base_net.parameters(), 'lr': base_net_lr},
                {'params': itertools.chain(model.source_layer_add_ons.parameters(),
                                           model.extras.parameters()),
                 'lr': extra_layers_lr},
                {'params': itertools.chain(model.regression_headers.parameters(),
                                           model.classification_headers.parameters())}
            ]
        else:
            params = model.parameters()
        optimizer = torch.optim.SGD(params, lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer, pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        for summary in args.summary:
            distiller.model_summary(model, summary, args.dataset)
        return

    if args.export_onnx is not None:
        return distiller.export_img_classifier_to_onnx(
            model, os.path.join(msglogger.logdir, args.export_onnx),
            args.dataset, add_softmax=True, verbose=False)

    if args.qe_calibration:
        return acts_quant_stats_collection(model, criterion, pylogger, args)

    if args.activation_histograms:
        return acts_histogram_collection(model, criterion, pylogger, args)

    activations_collectors = create_activation_stats_collectors(model, *args.activation_stats)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = load_data(args, config=config)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0], args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger, args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args, compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        #zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model, compression_scheduler.zeros_mask_dict,
                                 args.arch, args.dataset, optimizer=None)
        apputils.save_checkpoint(0, args.arch, model, optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(args.resumed_checkpoint_path.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print("Note: your model may have collapsed to random inference, so you may want to fine-tune")
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher, _ = create_model(args.kd_pretrained, args.dataset, args.kd_teacher,
                                  parallel=not args.load_serialized, device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_lean_checkpoint(teacher, args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
        raw_teacher_model_path = msglogger.logdir + "/raw_teacher.pth.tar"
        if not os.path.exists(raw_teacher_model_path):
            teacher.save(raw_teacher_model_path)
            msglogger.info(Fore.CYAN + '\tRaw Teacher Model saved: {0}'.format(raw_teacher_model_path)
                           + Style.RESET_ALL)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw, loss_type=args.kd_loss_type,
            focal_alpha=args.kd_focal_alpha, use_adaptive=args.kd_focal_adaptive, verbose=0)
        compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs, frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if start_epoch >= ending_epoch:
        msglogger.error('epoch count is too low, starting epoch is {} but total epochs set to {}'
                        .format(start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(
                epoch, metrics=(vloss if (epoch != start_epoch) else 10**6))

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
                  loggers=[tflogger, pylogger], args=args)
            distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch)
            distiller.log_activation_statsitics(epoch, "valid", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1), ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0,
                                        total_steps=1, log_freq=1, loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, top1, top5,
                                       epoch, args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {'current_top1': top1,
                             'best_top1': perf_scores_history[0].top1,
                             'best_epoch': perf_scores_history[0].epoch}
        try:
            raw_fullpath_best = apputils.save_checkpoint(
                epoch, args.arch, model, optimizer=optimizer, scheduler=compression_scheduler,
                extras=checkpoint_extras, is_best=is_best, name=args.name, dir=msglogger.logdir)
        except Exception as ex:
            # keep previous fullpath_best
            pass
        mlflow.log_artifacts(msglogger.logdir)

    # Finally run results on the test set
    eval_params = {
        "model_type": args.arch,
        "model_path": raw_fullpath_best,
        "dataset_path": args.data,
        "label_path": "models/voc-model-labels.txt"
    }
    mlflow.projects.run(uri=".", entry_point="eval", use_conda=False, parameters=eval_params)
                # Skip biases
                continue
            bottomk, _ = torch.topk(param.abs().view(-1),
                                    int(percentile * param.numel()),
                                    largest=False, sorted=True)
            threshold = bottomk.data[-1]
            msglogger.info("parameter %s: q = %.2f" % (name, threshold))
    else:
        distiller.model_summary(model, None, which_summary, 'wikitext2')
    exit(0)

compression_scheduler = None
if args.compress:
    # Create a CompressionScheduler and configure it from a YAML schedule file
    source = args.compress
    compression_scheduler = distiller.CompressionScheduler(model)
    distiller.config.fileConfig(model, None, compression_scheduler, args.compress, msglogger)

optimizer = torch.optim.SGD(model.parameters(), args.lr,
                            momentum=args.momentum, weight_decay=args.weight_decay)
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                          patience=0, verbose=True, factor=0.5)

# Loop over epochs.
# At any point you can hit Ctrl + C to break out of training early.
best_val_loss = float("inf")
try:
    for epoch in range(0, args.epochs):
        epoch_start_time = time.time()
        if compression_scheduler:
def objective(space):
    global model
    global count
    global global_min_score

    # Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    count += 1

    # Objective function: F(Acc, Lat) = (1 - Acc.) + (alpha * Sparsity)
    accuracy = 0
    alpha = 0.3  # Super-parameter: the importance of inference time
    latency = 0.0
    sparsity = 0.0

    # Training hyperparameter
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        print('resume mode: {}'.format(args.resume))
        print(global_min_score)

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)

    """
    distiller/distiller/config.py

    # Element-wise sparsity
    sparsity_levels = {net_param: sparsity_level}
    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    scheduler = distiller.CompressionScheduler(model)
    scheduler.add_policy(policy, epochs=[0, 2, 4])

    # Local search: add multiple pruners, one for each layer
    """
    sparsity_levels = {}
    for key, value in space.items():
        sparsity_levels[key] = value
    #print(sparsity_levels)

    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity',
                                                            levels=sparsity_levels)  # for SparsityLevelParameterPruner
    # pruner = distiller.pruning.SensitivityPruner(name='sensitivity', sensitivities=sparsity_levels)  # for SensitivityPruner
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    compression_scheduler.add_policy(policy, epochs=[PrunerEpoch])
    # compression_scheduler.add_policy(policy, starting_epoch=0, ending_epoch=38, frequency=2)
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=50, frequency=1)

    """
    distiller/example/classifier_compression/compress_classifier.py

    For each epoch:
        compression_scheduler.on_epoch_begin(epoch)
        train()
        save_checkpoint()
        compression_scheduler.on_epoch_end(epoch)

    train():
        For each training step:
            compression_scheduler.on_minibatch_begin(epoch)
            output = model(input)
            loss = criterion(output, target)
            compression_scheduler.before_backward_pass(epoch)
            loss.backward()
            optimizer.step()
            compression_scheduler.on_minibatch_end(epoch)
    """
    local_min_score = 2.
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i, criterion, optimizer, compression_scheduler)
        val_accuracy = validate()  # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                 train_accuracy, False, 'hyperopt', './')
        print('Epoch: {}, train_acc: {:.4f}, val_acc: {:.4f}, sparsity: {:.4f}'.format(
            i, train_accuracy, val_accuracy, sparsity))

        score = (1 - (val_accuracy / 100.)) + (alpha * (1 - sparsity / 100.))  # objective function here
        if score < global_min_score:
            global_min_score = score
            apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                     train_accuracy, True, 'best', './')
        if score < local_min_score:
            local_min_score = score
        if (PrunerConstraint == True and i >= PrunerEpoch and
                (sparsity < Expected_Sparsity_Level_Low or sparsity > Expected_Sparsity_Level_High)):
            break

    test_accuracy = test()  # Validate hyperparameter setting
    print('{} trials: score: {:.4f}, train_acc:{:.4f}, val_acc:{:.4f}, test_acc:{:.4f}, sparsity:{:.4f}'.format(
        count, local_min_score, train_accuracy, val_accuracy, test_accuracy, sparsity))

    return local_min_score
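# A minimal, self-contained sketch (added for illustration; not from the original
# sources) of the CompressionScheduler callback pattern that the docstring quoted in
# objective() describes. The toy model, the 70% sparsity level on '0.weight' (the
# Sequential's first Linear layer), and the epoch/step counts are all assumptions.
import torch
import torch.nn as nn
import distiller

def example_scheduled_pruning():
    model = nn.Sequential(nn.Linear(8, 16), nn.ReLU(), nn.Linear(16, 2))
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()

    # Prune the first Linear layer's weights to 70% element-wise sparsity
    pruner = distiller.pruning.SparsityLevelParameterPruner(name='example',
                                                            levels={'0.weight': 0.7})
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    scheduler = distiller.CompressionScheduler(model)
    scheduler.add_policy(policy, epochs=[0, 1])

    steps_per_epoch = 4
    for epoch in range(2):
        scheduler.on_epoch_begin(epoch)
        for step in range(steps_per_epoch):
            scheduler.on_minibatch_begin(epoch, step, steps_per_epoch, optimizer)
            inputs = torch.randn(32, 8)
            targets = torch.randint(0, 2, (32,))
            loss = criterion(model(inputs), targets)
            # Let the scheduler adjust the loss (e.g. add regularization terms)
            agg = scheduler.before_backward_pass(epoch, step, steps_per_epoch, loss,
                                                 optimizer=optimizer,
                                                 return_loss_components=True)
            loss = agg.overall_loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.on_minibatch_end(epoch, step, steps_per_epoch, optimizer)
        scheduler.on_epoch_end(epoch, optimizer)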
# Distiller loggers
msglogger = apputils.config_pylogger('logging.conf', None)
tflogger = TensorBoardLogger(msglogger.logdir)
tflogger.log_gradients = True
pylogger = PythonLogger(msglogger)

if args.summary:
    # The last string is the dataset
    distiller.model_summary(net, args.summary, 'coco')
    exit(0)

compression_scheduler = None
if args.compress:
    source = args.compress
    compression_scheduler = distiller.CompressionScheduler(net)
    distiller.config.file_config(net, optimizer, source, compression_scheduler)

try:
    for epoch in range(0, args.epochs):
        total_loss = 0.
        epoch_start_time = time.time()
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)
        train(epoch, optimizer, compression_scheduler)
        val_loss = test(epoch)
        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch)
def arbitrary_channel_pruning(config, channels_to_remove, is_parallel):
    """Test removal of arbitrary channels.

    The test receives a specification of channels to remove.
    Based on this specification, the channels are pruned and then
    physically removed from the model (via a "thinning" process).
    """
    model, zeros_mask_dict = common.setup_test(config.arch, config.dataset, is_parallel)

    pair = config.module_pairs[0]
    conv2 = common.find_module_by_name(model, pair[1])
    assert conv2 is not None

    # Test that we can access the weights tensor of the first convolution in layer 1
    conv2_p = distiller.model_find_param(model, pair[1] + ".weight")
    assert conv2_p is not None
    assert conv2_p.dim() == 4
    num_channels = conv2_p.size(1)
    cnt_nnz_channels = num_channels - len(channels_to_remove)
    mask = create_channels_mask(conv2_p, channels_to_remove)
    assert distiller.density_ch(mask) == (conv2.in_channels - len(channels_to_remove)) / conv2.in_channels

    # Cool, so now we have a mask for pruning our channels.
    # Use the mask to prune
    zeros_mask_dict[pair[1] + ".weight"].mask = mask
    zeros_mask_dict[pair[1] + ".weight"].apply_mask(conv2_p)
    all_channels = set([ch for ch in range(num_channels)])
    nnz_channels = set(distiller.find_nonzero_channels_list(conv2_p, pair[1] + ".weight"))
    channels_removed = all_channels - nnz_channels
    logger.info("Channels removed {}".format(channels_removed))

    # Now, let's do the actual network thinning
    distiller.remove_channels(model, zeros_mask_dict, config.arch, config.dataset, optimizer=None)
    conv1 = common.find_module_by_name(model, pair[0])
    assert conv1
    assert conv1.out_channels == cnt_nnz_channels
    assert conv2.in_channels == cnt_nnz_channels
    assert conv1.weight.size(0) == cnt_nnz_channels
    assert conv2.weight.size(1) == cnt_nnz_channels
    if config.bn_name is not None:
        bn1 = common.find_module_by_name(model, config.bn_name)
        assert bn1.running_var.size(0) == cnt_nnz_channels
        assert bn1.running_mean.size(0) == cnt_nnz_channels
        assert bn1.num_features == cnt_nnz_channels
        assert bn1.bias.size(0) == cnt_nnz_channels
        assert bn1.weight.size(0) == cnt_nnz_channels

    dummy_input = common.get_dummy_input(config.dataset)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.1)
    run_forward_backward(model, optimizer, dummy_input)

    # Let's test saving and loading a thinned model.
    # We save 3 times, and load twice, to make sure to cover some corner cases:
    #  - Make sure that after loading, the model still has hold of the thinning recipes
    #  - Make sure that after a 2nd load, there is no problem loading (in this case the
    #    tensors are already thin, so this is a new flow)
    # (1)
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None)
    model_2 = create_model(False, config.dataset, config.arch, parallel=is_parallel)
    model(dummy_input)
    model_2(dummy_input)
    conv2 = common.find_module_by_name(model_2, pair[1])
    assert conv2 is not None
    with pytest.raises(KeyError):
        model_2 = load_lean_checkpoint(model_2, 'checkpoint.pth.tar')

    compression_scheduler = distiller.CompressionScheduler(model)
    hasattr(model, 'thinning_recipes')
    run_forward_backward(model, optimizer, dummy_input)

    # (2)
    save_checkpoint(epoch=0, arch=config.arch, model=model, optimizer=None,
                    scheduler=compression_scheduler)
    model_2 = load_lean_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done")

    # (3)
    save_checkpoint(epoch=0, arch=config.arch, model=model_2, optimizer=None,
                    scheduler=compression_scheduler)
    model_2 = load_lean_checkpoint(model_2, 'checkpoint.pth.tar')
    assert hasattr(model_2, 'thinning_recipes')
    logger.info("test_arbitrary_channel_pruning - Done 2")
def main():
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                         args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(args.compress, msglogger.logdir, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_epochs = list()

    if args.deterministic:
        if args.loaders is None:
            args.loaders = 1
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.loaders > 1:
            msglogger.error('ERROR: Setting --deterministic requires setting --loaders to 0 or 1')
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        distiller.set_deterministic()
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.use_cpu or (args.gpus is None and not torch.cuda.is_available()) or (args.gpus == ''):
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                msglogger.error('ERROR: Argument --gpus must be a comma-separated list of integers only')
                exit(1)
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    msglogger.error('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                    .format(dev_id, available_gpus))
                    exit(1)
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    if args.loaders is None:
        active_gpus = args.gpus if args.gpus is not None else torch.cuda.device_count()
        args.loaders = max(parser.DEFAULT_LOADERS_COUNT, parser.DEFAULT_LOADERS_COUNT * active_gpus)
    msglogger.debug('Number of data loaders set to: {}'.format(args.loaders))

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset == 'cifar10' else 1000

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained, args.dataset, args.arch,
                         parallel=not args.load_serialized, device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s', args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    optimizer = None
    resumed_training_steps = None
    if args.resume or args.load_state_dict:
        if args.resume and not args.reset_optimizer:
            # initiate SGD with dummy lr
            optimizer = torch.optim.SGD(model.parameters(), lr=0.36787944117)
        model, compression_scheduler, optimizer, start_epoch, resumed_training_steps = \
            apputils.load_checkpoint(model, args.resume or args.load_state_dict, optimizer=optimizer)
    model.to(args.device)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is not None:
        # optimizer was resumed from checkpoint
        # check if user has tried to set optimizer arguments
        # if so, ignore arguments with a warning.
        optimizer_group_args = ['lr', 'learning-rate', 'momentum', 'weight-decay', 'wd']
        user_optim_args = [x for x in optimizer_group_args for arg in sys.argv
                           if arg.startswith('--' + x)]
        if user_optim_args:
            msglogger.warning('{} optimizer arguments are ignored.'.format(user_optim_args))
            msglogger.info('setting optimizer arguments when optimizer is resumed '
                           'from checkpoint is forbidden.')
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer, pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    activations_collectors = create_activation_stats_collectors(model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info('\tStats will be collected for {:.1%} of test dataset'.format(args.qe_calibration))
        msglogger.info('\tSetting constant seeds and converting model to serialized execution')
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(create_quantization_stats_collector(model))
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size, args.loaders,
        args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size, args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))

    args.trainset_print_period = parser.getPrintPeriod(args, len(train_loader.sampler), args.batch_size)
    args.validset_print_period = parser.getPrintPeriod(args, len(val_loader.sampler), args.batch_size)
    args.testset_print_period = parser.getPrintPeriod(args, len(test_loader.sampler), args.batch_size)

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0], args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger, args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args, compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if (args.resume and not args.reset_optimizer) else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        # zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resume is not None, "You must use --resume to provide a checkpoint file to thinnify"
        distiller.remove_filters(model, compression_scheduler.zeros_mask_dict,
                                 args.arch, args.dataset, optimizer=None)
        apputils.save_checkpoint(0, args.arch, model, optimizer=None, scheduler=compression_scheduler,
                                 name="{}_thinned".format(args.resume.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print("Note: your model may have collapsed to random inference, so you may want to fine-tune")
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_checkpoint(teacher, chkpt_file=args.kd_resume)[0]
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy, range(args.kd_start_epoch, args.epochs, 1))

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if getattr(compression_scheduler, 'global_policy_end_epoch', None) is not None:
        if compression_scheduler.global_policy_end_epoch >= (start_epoch + args.epochs):
            msglogger.warning('scheduler requires at least {} epochs, but only {} are sanctioned'
                              .format(compression_scheduler.global_policy_end_epoch, args.epochs))

    accumulated_training_steps = resumed_training_steps if resumed_training_steps is not None else 0
    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            try:
                train(train_loader, model, criterion, optimizer, epoch, accumulated_training_steps,
                      compression_scheduler, loggers=[tflogger, pylogger], args=args)
            except RuntimeError as e:
                if 'cuda out of memory' in str(e).lower():
                    msglogger.error('CUDA memory failure has been detected.\n'
                                    'Sometimes it helps to decrease batch size.\n'
                                    'e.g. Add the following flag to your call: --batch-size={}'
                                    .format(args.batch_size // 10))
                raise
            distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))
        accumulated_training_steps += math.ceil(len(train_loader.sampler) / train_loader.batch_size)

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch)
            distiller.log_activation_statsitics(epoch, "valid", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1), ('Top5', top5)]))
        tflogger.log_training_progress(stats, epoch, None)

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        if getattr(compression_scheduler, 'global_policy_end_epoch', None) is None or (
                compression_scheduler.global_policy_end_epoch <= epoch):
            # Update the list of top scores achieved since all policies have concluded
            if top1 > 0:
                best_epochs.append(distiller.MutableNamedTuple(
                    {'top1': top1, 'top5': top5, 'epoch': epoch}))
            # Keep best_epochs sorted from best to worst
            # Sort by top1 first, secondary sort by top5, and so forth
            best_epochs.sort(key=operator.attrgetter('top1', 'top5', 'epoch'), reverse=True)
            for score in best_epochs[:args.num_best_scores]:
                msglogger.info('==> Best Top1: %.3f   Top5: %.3f   on epoch: %d',
                               score.top1, score.top5, score.epoch)

        is_best = best_epochs and (epoch == best_epochs[0].epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler,
                                 best_epochs[0].top1 if best_epochs else None, is_best,
                                 args.name, msglogger.logdir, accumulated_training_steps)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], activations_collectors, args=args)
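# A minimal sketch of the CompressionScheduler callback protocol that the
# train() call above relies on (epoch- and minibatch-level hooks).  The
# argument names are placeholders; the hook signatures are assumed from how
# the scheduler is used in this file, so treat this as a sketch rather than
# the definitive API.
def sketch_train_with_scheduler(scheduler, train_loader, model, criterion, optimizer, epoch):
    steps_per_epoch = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        scheduler.on_minibatch_begin(epoch, step, steps_per_epoch, optimizer)
        output = model(inputs)
        loss = criterion(output, target)
        # Policies may add regularization/distillation terms to the loss here
        loss = scheduler.before_backward_pass(epoch, step, steps_per_epoch, loss, optimizer)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Gives pruning policies a chance to re-apply masks after the weight update
        scheduler.on_minibatch_end(epoch, step, steps_per_epoch, optimizer)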
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    global model
    global optimizer
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(args.workers / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    global train_loader
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    global val_loader
    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    # add quantization scheduler
    global compression_scheduler
    compression_scheduler = distiller.CompressionScheduler(model)
    compression_scheduler = distiller.file_config(
        model, optimizer,
        '/home/hguan2/workspace/fault-tolerance/nips19/quant_aware_training.yaml',
        compression_scheduler, (args.start_epoch - 1) if args.resume else None)
    model.cuda()

    global epoch
    info = "before training, large_weights_count: {}".format(check_large_weights_count(model))
    print(info)
    write_log(info)

    if args.mode == Mode.ADMM:
        global W_list, Z_list, U_list
        W_list = []
        Z_list = []
        U_list = []
        init_admm_param(model, W_list, Z_list, U_list)
        print("Initialize ADMM parameters...")
        print("W_list:", W_list[0].view(-1)[:5])
        print("Z_list:", Z_list[0].view(-1)[:5])
        print("U_list:", U_list[0].view(-1)[:5])

    for epoch in range(args.start_epoch, args.epochs):
        compression_scheduler.on_epoch_begin(epoch)
        if args.distributed:
            train_sampler.set_epoch(epoch)
        # adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        regularized_train(compression_scheduler, train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)
        info = "epoch: {}, iteration: {}, after epoch, large_weights_count: {}, accuracy: {:.3f}".format(
            epoch, len(train_loader), check_large_weights_count(model), acc1)
        write_log(info)
        print(info)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        compression_scheduler.on_epoch_end(epoch, optimizer)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best)

        # when it is clipping-based training, stop whenever the accuracy is met
        if args.mode == Mode.WOT and best_acc1 >= target_accs[args.arch] + 1e-5:
            print("Reached best accuracy: {:.3f}, target_acc: {:.3f}".format(
                best_acc1, target_accs[args.arch]))
            return
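# init_admm_param() is defined elsewhere; a minimal sketch of the usual ADMM
# bookkeeping it would perform (this reconstruction is an assumption, not the
# author's implementation): W holds references to the constrained weights,
# Z holds their projected copies, and U holds the scaled dual variables.
import torch

def sketch_init_admm_param(model, W_list, Z_list, U_list):
    for name, param in model.named_parameters():
        if param.dim() > 1:  # constrain weight tensors only, skip biases
            W_list.append(param)
            Z_list.append(param.detach().clone())  # Z starts equal to W
            U_list.append(torch.zeros_like(param))  # dual variable starts at 0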
def load_checkpoint(model, chkpt_file, optimizer=None):
    """Load a pytorch training checkpoint

    Args:
        model: the pytorch model to which we will load the parameters
        chkpt_file: the checkpoint file
        optimizer: the optimizer to which we will load the serialized state
    """
    if not os.path.isfile(chkpt_file):
        raise IOError(ENOENT, 'Could not find a checkpoint file at', chkpt_file)

    msglogger.info("=> loading checkpoint %s", chkpt_file)
    checkpoint = torch.load(chkpt_file, map_location=lambda storage, loc: storage)
    msglogger.debug("\n\t".join(['Checkpoint keys:'] + list(checkpoint)))

    if 'state_dict' not in checkpoint:
        raise ValueError("Checkpoint must contain the model parameters under the key 'state_dict'")

    checkpoint_epoch = checkpoint.get('epoch', None)
    start_epoch = checkpoint_epoch + 1 if checkpoint_epoch is not None else 0

    best_top1 = checkpoint.get('best_top1', None)
    if best_top1 is not None:
        msglogger.info("   best top@1: %.3f", best_top1)

    compression_scheduler = None
    normalize_dataparallel_keys = False
    if 'compression_sched' in checkpoint:
        compression_scheduler = distiller.CompressionScheduler(model)
        try:
            compression_scheduler.load_state_dict(checkpoint['compression_sched'],
                                                  normalize_dataparallel_keys)
        except KeyError as e:
            # A very common source of this KeyError is loading a GPU model on the CPU.
            # We rename all of the DataParallel keys because DataParallel does not execute on the CPU.
            normalize_dataparallel_keys = True
            compression_scheduler.load_state_dict(checkpoint['compression_sched'],
                                                  normalize_dataparallel_keys)
        msglogger.info("Loaded compression schedule from checkpoint (epoch {})".format(checkpoint_epoch))
    else:
        msglogger.info("Warning: compression schedule data does not exist in the checkpoint")

    if 'thinning_recipes' in checkpoint:
        if 'compression_sched' not in checkpoint:
            raise KeyError("Found thinning_recipes key, but missing mandatory key compression_sched")
        msglogger.info("Loaded a thinning recipe from the checkpoint")
        # Cache the recipes in case we need them later
        model.thinning_recipes = checkpoint['thinning_recipes']
        if normalize_dataparallel_keys:
            model.thinning_recipes = [distiller.get_normalized_recipe(recipe)
                                      for recipe in model.thinning_recipes]
        distiller.execute_thinning_recipes_list(model,
                                                compression_scheduler.zeros_mask_dict,
                                                model.thinning_recipes)

    if 'quantizer_metadata' in checkpoint:
        msglogger.info('Loaded quantizer metadata from the checkpoint')
        qmd = checkpoint['quantizer_metadata']
        quantizer = qmd['type'](model, **qmd['params'])
        quantizer.prepare_model()

    msglogger.info("=> loaded checkpoint '{f}' (epoch {e})".format(f=str(chkpt_file), e=checkpoint_epoch))

    if normalize_dataparallel_keys:
        checkpoint['state_dict'] = {normalize_module_name(k): v
                                    for k, v in checkpoint['state_dict'].items()}
    model.load_state_dict(checkpoint['state_dict'])
    return (model, compression_scheduler, start_epoch)
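# normalize_module_name() comes from distiller's utilities; a minimal sketch of
# the renaming it is used for above (assumed behavior, shown here only to make
# the DataParallel issue concrete): strip the 'module.' prefix that
# nn.DataParallel adds, so GPU-saved state dicts load into plain CPU models.
def sketch_normalize_module_name(layer_name):
    if layer_name.startswith('module.'):
        return layer_name[len('module.'):]
    return layer_name

# e.g. sketch_normalize_module_name('module.layer1.0.conv1.weight') -> 'layer1.0.conv1.weight'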
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if utils.is_main_process():
        msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                             args.name, args.output_dir, args.verbose)

        # Log various details about the execution environment.  It is sometimes useful
        # to refer to past experiment executions and this information may be useful.
        apputils.log_execution_env_state(
            filter(None, [args.compress, args.qe_stats_file]),  # remove both None and empty strings
            msglogger.logdir)
        msglogger.debug("Distiller: %s", distiller.__version__)
    else:
        msglogger = logging.getLogger()
        msglogger.disabled = True

    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(train_sampler, args.batch_size,
                                                            drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    model = detection.__dict__[args.model](num_classes=num_classes, pretrained=args.pretrained)
    patch_fastrcnn(model)
    model.to(device)

    if args.summary:
        if utils.is_main_process():
            for summary in args.summary:
                distiller.model_summary(model, summary, args.dataset)
        return

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps,
                                                        gamma=args.lr_gamma)

    compression_scheduler = None
    if utils.is_main_process():
        # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
        # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
        tflogger = TensorBoardLogger(msglogger.logdir)
        pylogger = PythonLogger(msglogger)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer, args.compress,
                                                      compression_scheduler, None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.qe_calibration:
        def test_fn(model):
            return evaluate(model, data_loader_test, device=device)
        collect_quant_stats(model_without_ddp, test_fn, save_dir=args.output_dir,
                            modules_to_collect=['backbone', 'rpn', 'roi_heads'])
        # We skip `.transform` because it is a pre-processing unit.
        return

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        if compression_scheduler and 'compression_scheduler' in checkpoint:
            compression_scheduler.load_state_dict(checkpoint['compression_scheduler'])

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    activations_collectors = create_activation_stats_collectors(model, *args.activation_stats)

    print("Start training")
    start_time = time.time()
    # if not isinstance(model, nn.DataParallel) and torch.cuda.is_available() \
    #         and torch.cuda.device_count() > 1:
    #     msglogger.info("Using %d GPUs on DataParallel." % torch.cuda.device_count())
    #     model = nn.DataParallel(model)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
            dist.barrier()
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        with collectors_context(activations_collectors["train"]) as collectors:
            train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq,
                            compression_scheduler)
            if utils.is_main_process():
                distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
                distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                    collector=collectors["sparsity"])
            if args.masks_sparsity and utils.is_main_process():
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))

        lr_scheduler.step()
        if args.output_dir:
            save_dict = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args}
            if compression_scheduler:
                save_dict['compression_scheduler'] = compression_scheduler.state_dict()
            utils.save_on_master(save_dict,
                                 os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
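# A minimal sketch of why train_sampler.set_epoch(epoch) is called at each
# epoch boundary above: DistributedSampler seeds its shuffle with its stored
# epoch, so skipping the call would replay the epoch-0 permutation every epoch.
# num_replicas/rank are given explicitly here so no process group is needed;
# the stand-in dataset is an assumption for illustration only.
import torch
from torch.utils.data.distributed import DistributedSampler

sketch_dataset = torch.arange(8)  # any sized sequence works as a stand-in
sketch_sampler = DistributedSampler(sketch_dataset, num_replicas=2, rank=0, shuffle=True)
for sketch_epoch in range(2):
    sketch_sampler.set_epoch(sketch_epoch)
    print(sketch_epoch, list(sketch_sampler))  # this rank's shard of a per-epoch permutation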
def dict_config(model, optimizer, sched_dict, scheduler=None):
    app_cfg_logger.debug('Schedule contents:\n' + json.dumps(sched_dict, indent=2))

    if scheduler is None:
        scheduler = distiller.CompressionScheduler(model)

    pruners = __factory('pruners', model, sched_dict)
    regularizers = __factory('regularizers', model, sched_dict)
    quantizers = __factory('quantizers', model, sched_dict, optimizer=optimizer)
    if len(quantizers) > 1:
        raise ValueError("\nError: Multiple Quantizers not supported")
    extensions = __factory('extensions', model, sched_dict)

    try:
        lr_policies = []
        for policy_def in sched_dict['policies']:
            policy = None
            if 'pruner' in policy_def:
                try:
                    instance_name, args = __policy_params(policy_def, 'pruner')
                except TypeError as e:
                    print('\n\nFatal Error: a policy is defined with a null pruner')
                    print('Here\'s the policy definition for your reference:\n{}'
                          .format(json.dumps(policy_def, indent=1)))
                    raise
                assert instance_name in pruners, \
                    "Pruner {} was not defined in the list of pruners".format(instance_name)
                pruner = pruners[instance_name]
                policy = distiller.PruningPolicy(pruner, args)

            elif 'regularizer' in policy_def:
                instance_name, args = __policy_params(policy_def, 'regularizer')
                assert instance_name in regularizers, \
                    "Regularizer {} was not defined in the list of regularizers".format(instance_name)
                regularizer = regularizers[instance_name]
                if args is None:
                    policy = distiller.RegularizationPolicy(regularizer)
                else:
                    policy = distiller.RegularizationPolicy(regularizer, **args)

            elif 'quantizer' in policy_def:
                instance_name, args = __policy_params(policy_def, 'quantizer')
                assert instance_name in quantizers, \
                    "Quantizer {} was not defined in the list of quantizers".format(instance_name)
                quantizer = quantizers[instance_name]
                policy = distiller.QuantizationPolicy(quantizer)

            elif 'lr_scheduler' in policy_def:
                # LR schedulers take an optimizer in their CTOR, so postpone handling until we're certain
                # a quantization policy was initialized (if exists)
                lr_policies.append(policy_def)
                continue

            elif 'extension' in policy_def:
                instance_name, args = __policy_params(policy_def, 'extension')
                assert instance_name in extensions, \
                    "Extension {} was not defined in the list of extensions".format(instance_name)
                extension = extensions[instance_name]
                policy = extension

            else:
                raise ValueError("\nFATAL Parsing error while parsing the pruning schedule - "
                                 "unknown policy [{}]".format(policy_def))

            add_policy_to_scheduler(policy, policy_def, scheduler)

        # Any changes to the optimizer caused by a quantizer have occurred by now,
        # so it's safe to create the LR schedulers
        lr_schedulers = __factory('lr_schedulers', model, sched_dict, optimizer=optimizer)
        for policy_def in lr_policies:
            instance_name, args = __policy_params(policy_def, 'lr_scheduler')
            assert instance_name in lr_schedulers, \
                "LR-scheduler {} was not defined in the list of lr-schedulers".format(instance_name)
            lr_scheduler = lr_schedulers[instance_name]
            policy = distiller.LRPolicy(lr_scheduler)
            add_policy_to_scheduler(policy, policy_def, scheduler)

    except AssertionError:
        # propagate the assertion information
        raise
    except Exception as exception:
        print("\nFATAL Parsing error!\n%s" % json.dumps(policy_def, indent=1))
        print("Exception: %s %s" % (type(exception), exception))
        raise
    return scheduler
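# A minimal sketch of a schedule dict that dict_config() accepts -- the
# Python-side equivalent of a YAML schedule file.  The pruner name, target
# level and epochs are illustrative values, not taken from a real schedule.
sketch_sched_dict = {
    'pruners': {
        'my_pruner': {
            'class': 'SparsityLevelParameterPruner',
            'levels': {'module.conv1.weight': 0.5},
        }
    },
    'policies': [
        {
            'pruner': {'instance_name': 'my_pruner'},
            'starting_epoch': 0,
            'ending_epoch': 10,
            'frequency': 1,
        }
    ],
}
# scheduler = dict_config(model, optimizer, sketch_sched_dict)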
def main():
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                         args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(args.compress, msglogger.logdir, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        distiller.set_deterministic()  # Use a well-known seed, for repeatability of experiments
    else:
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError('ERROR: Argument --gpus must be a comma-separated list of integers only')
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                     .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset == 'cifar10' else 1000

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained, args.dataset, args.arch,
                         parallel=not args.load_serialized, device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s', args.earlyexit_thresholds)

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning('The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.')
        if not args.reset_optimizer:
            msglogger.warning('If you wish to also reset the optimizer, call with: --reset-optimizer')
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model, args.load_model_path, model_device=args.device)

    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info('\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0')

    # Define loss function (criterion)
    criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer, pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    activations_collectors = create_activation_stats_collectors(model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info('\tStats will be collected for {:.1%} of test dataset'.format(args.qe_calibration))
        msglogger.info('\tSetting constant seeds and converting model to serialized execution')
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(create_quantization_stats_collector(model))
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size, args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0], args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger, args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args, compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch-1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        # zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model, compression_scheduler.zeros_mask_dict,
                                 args.arch, args.dataset, optimizer=None)
        apputils.save_checkpoint(0, args.arch, model, optimizer=None, scheduler=compression_scheduler,
                                 name="{}_thinned".format(args.resumed_checkpoint_path.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print("Note: your model may have collapsed to random inference, so you may want to fine-tune")
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_lean_checkpoint(teacher, args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs, frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if start_epoch >= ending_epoch:
        msglogger.error('epoch count is too low, starting epoch is {} but total epochs set to {}'.format(
            start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(
                epoch, metrics=(vloss if (epoch != start_epoch) else 10**6))

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
                  loggers=[tflogger, pylogger], args=args)
            distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch)
            distiller.log_activation_statsitics(epoch, "valid", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1), ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1,
                                        log_freq=1, loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, top1, top5, epoch,
                                       args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {'current_top1': top1,
                             'best_top1': perf_scores_history[0].top1,
                             'best_epoch': perf_scores_history[0].epoch}
        apputils.save_checkpoint(epoch, args.arch, model, optimizer=optimizer,
                                 scheduler=compression_scheduler, extras=checkpoint_extras,
                                 is_best=is_best, name=args.name, dir=msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], activations_collectors, args=args)
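# update_training_scores_history() is defined elsewhere in this application; a
# minimal sketch of the bookkeeping it is used for above (this reconstruction
# is an assumption, not the author's implementation): keep the history sorted
# best-first so index 0 is the best epoch, which is what the is_best test and
# checkpoint_extras rely on.
import operator

def sketch_update_training_scores_history(perf_scores_history, top1, top5, epoch, num_best_scores):
    perf_scores_history.append(distiller.MutableNamedTuple(
        {'top1': top1, 'top5': top5, 'epoch': epoch}))
    # Sort by top1, break ties by top5 and then by epoch
    perf_scores_history.sort(key=operator.attrgetter('top1', 'top5', 'epoch'), reverse=True)
    for score in perf_scores_history[:num_best_scores]:
        msglogger.info('==> Best Top1: %.3f  Top5: %.3f  on epoch: %d',
                       score.top1, score.top5, score.epoch)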
def main():
    global msglogger
    check_pytorch_version()
    args = parser.parse_args()
    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'), args.name)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(sys.argv, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_top1 = 0

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error('ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1')
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        torch.manual_seed(0)
        random.seed(0)
        np.random.seed(0)
        cudnn.deterministic = True
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error('ERROR: Argument --gpus must be a comma-separated list of integers only')
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'

    # Create the model
    png_summary = args.summary is not None and args.summary.startswith('png')
    is_parallel = not png_summary  # For PNG summary, parallel graphs are illegible
    model = create_model(args.pretrained, args.dataset, args.arch,
                         parallel=is_parallel, device_ids=args.gpus)

    compression_scheduler = None
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # We can optionally resume from a checkpoint
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        if 'resnet' in args.arch and 'cifar' in args.arch:
            distiller.resnet_cifar_remove_layers(model)
            # model = distiller.resnet_cifar_remove_channels(model, compression_scheduler.zeros_mask_dict)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)
    msglogger.info("Optimizer (%s): momentum=%s decay=%s",
                   type(optimizer), args.momentum, args.weight_decay)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        which_summary = args.summary
        if which_summary.startswith('png'):
            apputils.draw_img_classifier_to_file(model, 'model.png', args.dataset,
                                                 which_summary == 'png_w_params')
        else:
            distiller.model_summary(model, optimizer, which_summary, args.dataset)
        exit()

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.deterministic)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))

    activations_sparsity = None
    if args.activation_stats:
        # If your model has ReLU layers, then those layers have sparse activations.
        # ActivationSparsityCollector will collect information about this sparsity.
        # WARNING! Enabling activation sparsity collection will significantly slow down training!
        activations_sparsity = ActivationSparsityCollector(model)

    if args.sensitivity is not None:
        # This sample application can be invoked to execute Sensitivity Analysis on your
        # model.  The output is saved to CSV and PNG.
        msglogger.info("Running sensitivity tests")
        test_fnc = partial(test, test_loader=test_loader, criterion=criterion,
                           loggers=[pylogger], print_freq=args.print_freq)
        which_params = [param_name for param_name, _ in model.named_parameters()]
        sensitivity = distiller.perform_sensitivity_analysis(
            model,
            net_params=which_params,
            sparsities=np.arange(0.0, 0.50, 0.05) if args.sensitivity == 'filter'
                       else np.arange(0.0, 0.95, 0.05),
            test_func=test_fnc,
            group=args.sensitivity)
        distiller.sensitivities_to_png(sensitivity, 'sensitivity.png')
        distiller.sensitivities_to_csv(sensitivity, 'sensitivity.csv')
        exit()

    if args.evaluate:
        # This sample application can be invoked to evaluate the accuracy of your model on
        # the test dataset.
        # You can optionally quantize the model to 8-bit integer before evaluation.
        # For example:
        # python3 compress_classifier.py --arch resnet20_cifar ../data.cifar10 -p=50 --resume=checkpoint.pth.tar --evaluate
        if args.quantize:
            model.cpu()
            quantizer = quantization.SymmetricLinearQuantizer(model, 8, 8)
            quantizer.prepare_model()
            model.cuda()
        top1, _, _ = test(test_loader, model, criterion, [pylogger], args.print_freq)
        if args.quantize:
            checkpoint_name = 'quantized'
            apputils.save_checkpoint(0, args.arch, model, optimizer, best_top1=top1,
                                     name='_'.join([args.name, checkpoint_name])
                                          if args.name else checkpoint_name)
        exit()

    if args.compress:
        # The main use-case for this sample application is CNN compression.  Compression
        # requires a compression schedule configuration file in YAML.
        source = args.compress
        msglogger.info("Compression schedule (source=%s)", source)
        compression_scheduler = distiller.CompressionScheduler(model)
        distiller.config.file_config(model, optimizer, args.compress, compression_scheduler)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
              loggers=[tflogger, pylogger], print_freq=args.print_freq,
              log_params_hist=args.log_params_histograms)
        distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
        if args.activation_stats:
            distiller.log_activation_sparsity(epoch, loggers=[tflogger, pylogger],
                                              collector=activations_sparsity)

        # evaluate on validation set
        top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args.print_freq, epoch)
        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1), ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1,
                                        log_freq=1, loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch)

        # remember best top1 and save checkpoint
        is_best = top1 > best_top1
        best_top1 = max(top1, best_top1)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer, compression_scheduler,
                                 best_top1, is_best, args.name)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], args.print_freq)
def load_checkpoint(model, chkpt_file, optimizer=None, model_device=None, *,
                    lean_checkpoint=False, strict=False):
    """Load a pytorch training checkpoint.

    Args:
        model: the pytorch model to which we will load the parameters
        chkpt_file: the checkpoint file
        lean_checkpoint: if set, read into model only 'state_dict' field
        optimizer: [deprecated argument]
        model_device [str]: if set, call model.to($model_device)
                This should be set to either 'cpu' or 'cuda'.
    :returns: updated model, compression_scheduler, optimizer, start_epoch
    """
    if not os.path.isfile(chkpt_file):
        raise IOError(ENOENT, 'Could not find a checkpoint file at', chkpt_file)

    msglogger.info("=> loading checkpoint %s", chkpt_file)
    checkpoint = torch.load(chkpt_file, map_location=lambda storage, loc: storage)

    msglogger.info('=> Checkpoint contents:\n%s\n' % get_contents_table(checkpoint))
    if 'extras' in checkpoint:
        msglogger.info("=> Checkpoint['extras'] contents:\n{}\n".format(
            get_contents_table(checkpoint['extras'])))

    if 'state_dict' not in checkpoint:
        raise ValueError("Checkpoint must contain the model parameters under the key 'state_dict'")

    checkpoint_epoch = checkpoint.get('epoch', None)
    start_epoch = checkpoint_epoch + 1 if checkpoint_epoch is not None else 0

    compression_scheduler = None
    normalize_dataparallel_keys = False
    if 'compression_sched' in checkpoint:
        compression_scheduler = distiller.CompressionScheduler(model)
        try:
            compression_scheduler.load_state_dict(checkpoint['compression_sched'],
                                                  normalize_dataparallel_keys)
        except KeyError as e:
            # A very common source of this KeyError is loading a GPU model on the CPU.
            # We rename all of the DataParallel keys because DataParallel does not execute on the CPU.
            normalize_dataparallel_keys = True
            compression_scheduler.load_state_dict(checkpoint['compression_sched'],
                                                  normalize_dataparallel_keys)
        msglogger.info("Loaded compression schedule from checkpoint (epoch {})".format(checkpoint_epoch))
    else:
        msglogger.info("Warning: compression schedule data does not exist in the checkpoint")

    if 'thinning_recipes' in checkpoint:
        if 'compression_sched' not in checkpoint:
            raise KeyError("Found thinning_recipes key, but missing mandatory key compression_sched")
        msglogger.info("Loaded a thinning recipe from the checkpoint")
        # Cache the recipes in case we need them later
        model.thinning_recipes = checkpoint['thinning_recipes']
        if normalize_dataparallel_keys:
            model.thinning_recipes = [distiller.get_normalized_recipe(recipe)
                                      for recipe in model.thinning_recipes]
        distiller.execute_thinning_recipes_list(model,
                                                compression_scheduler.zeros_mask_dict,
                                                model.thinning_recipes)

    if 'quantizer_metadata' in checkpoint:
        msglogger.info('Loaded quantizer metadata from the checkpoint')
        qmd = checkpoint['quantizer_metadata']
        quantizer = qmd['type'](model, **qmd['params'])
        quantizer.prepare_model(qmd['dummy_input'])

    if normalize_dataparallel_keys:
        checkpoint['state_dict'] = {normalize_module_name(k): v
                                    for k, v in checkpoint['state_dict'].items()}
    anomalous_keys = model.load_state_dict(checkpoint['state_dict'], strict)
    if anomalous_keys:
        # This is pytorch 1.1+
        missing_keys, unexpected_keys = anomalous_keys
        if unexpected_keys:
            msglogger.warning("Warning: the loaded checkpoint (%s) contains %d unexpected state keys"
                              % (chkpt_file, len(unexpected_keys)))
        if missing_keys:
            raise ValueError("The loaded checkpoint (%s) is missing %d state keys"
                             % (chkpt_file, len(missing_keys)))

    if model_device is not None:
        model.to(model_device)

    if lean_checkpoint:
        msglogger.info("=> loaded 'state_dict' from checkpoint '{}'".format(str(chkpt_file)))
        return (model, None, None, 0)

    def _load_optimizer(cls, src_state_dict, model):
        """Initiate optimizer with model parameters and load src_state_dict"""
        # initiate the dest_optimizer with a dummy learning rate,
        # this is required to support SGD.__init__()
        dest_optimizer = cls(model.parameters(), lr=1)
        dest_optimizer.load_state_dict(src_state_dict)
        return dest_optimizer

    try:
        optimizer = _load_optimizer(checkpoint['optimizer_type'],
                                    checkpoint['optimizer_state_dict'], model)
    except KeyError:
        # Older checkpoints do not support optimizer loading: they either had an 'optimizer' field
        # (different name) which was not used during the load, or they didn't even checkpoint
        # the optimizer.
        optimizer = None

    if optimizer is not None:
        msglogger.info('Optimizer of type {type} was loaded from checkpoint'.format(type=type(optimizer)))
        msglogger.info('Optimizer Args: {}'.format(
            dict((k, v) for k, v in optimizer.state_dict()['param_groups'][0].items()
                 if k != 'params')))
    else:
        msglogger.warning('Optimizer could not be loaded from checkpoint.')

    msglogger.info("=> loaded checkpoint '{f}' (epoch {e})".format(
        f=str(chkpt_file), e=checkpoint_epoch))
    return (model, compression_scheduler, optimizer, start_epoch)
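# Usage sketch for the loader above (the file name and architecture are
# placeholders): a full resume returns the scheduler, optimizer and start
# epoch, while lean_checkpoint=True restores the weights only.
# model = create_model(False, 'cifar10', 'resnet20_cifar')
# model, scheduler, optimizer, start_epoch = load_checkpoint(model, 'checkpoint.pth.tar')
# model, _, _, _ = load_checkpoint(model, 'checkpoint.pth.tar', lean_checkpoint=True)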
def objective(space):
    global model
    global count
    global best_dict

    # Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    if args.resume:
        model, _, _ = apputils.load_checkpoint(model, chkpt_file=args.resume)

    count += 1
    print('{} trial starting...'.format(count))

    # Objective function: F(Acc, Lat) = (1 - Acc.) + (alpha * Sparsity)
    accuracy = 0
    # alpha = 0.2  # Hyperparameter: the weight of the sparsity term (a proxy for inference time)
    alpha = 1.0  # Hyperparameter: the weight of the sparsity term (a proxy for inference time)
    sparsity = 0.0

    # Training hyperparameters
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum, weight_decay=args.weight_decay)

    """
    distiller/distiller/config.py
        # Element-wise sparsity
        sparsity_levels = {net_param: sparsity_level}
        pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
        policy = distiller.PruningPolicy(pruner, pruner_args=None)
        scheduler = distiller.CompressionScheduler(model)
        scheduler.add_policy(policy, epochs=[0, 2, 4])
    # Local search: add one pruner per layer
    """
    sparsity_levels = {}
    for key, value in space.items():
        sparsity_levels[key] = value

    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    # compression_scheduler.add_policy(policy, epochs=[90])
    compression_scheduler.add_policy(policy, epochs=[0])
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=90, frequency=1)

    """
    distiller/example/classifier_compression/compress_classifier.py
    For each epoch:
        compression_scheduler.on_epoch_begin(epoch)
        train()
        save_checkpoint()
        compression_scheduler.on_epoch_end(epoch)

    train():
        For each training step:
            compression_scheduler.on_minibatch_begin(epoch)
            output = model(input)
            loss = criterion(output, target)
            compression_scheduler.before_backward_pass(epoch)
            loss.backward()
            optimizer.step()
            compression_scheduler.on_minibatch_end(epoch)
    """
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i, criterion, optimizer, compression_scheduler)
        val_accuracy = validate()  # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                 train_accuracy, False, 'hyperopt', './')
        print('{} epochs => train acc: {:.2f}%, val acc: {:.2f}%'.format(i, train_accuracy, val_accuracy))

    test_accuracy = validate(test_loader)  # Validate hyperparameter setting

    # Objective function here.
    # score = (1 - (val_accuracy / 100.)) + (alpha * (1 - sparsity / 100.))
    # Typical ranges -- accuracy: 98~90%, sparsity: 80%~50%
    score = -((val_accuracy / 100.) ** 2 - 0.9 ** 2 + alpha * ((sparsity / 100.) ** 2 - 0.5 ** 2))
    print('{} trials: score: {:.2f}\ttrain acc: {:.2f}%\tval acc: {:.2f}%\ttest acc: {:.2f}%\tsparsity: {:.2f}%'
          .format(count, score, train_accuracy, val_accuracy, test_accuracy, sparsity))

    if score < best_dict['score']:
        best_dict['trial'] = count
        best_dict['score'] = score
        best_dict['tr_acc'] = train_accuracy
        best_dict['v_acc'] = val_accuracy
        best_dict['te_acc'] = test_accuracy
        best_dict['sparsity'] = sparsity
        best_dict['model_best'] = copy.deepcopy(model)

    return score
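# A minimal sketch of how an objective like the one above is typically driven
# with hyperopt.  The parameter name 'module.conv1.weight' and the search
# bounds are illustrative assumptions, not values taken from this file.
from hyperopt import fmin, tpe, hp

sketch_space = {
    'module.conv1.weight': hp.uniform('module.conv1.weight', 0.0, 0.9),
}
# fmin() minimizes the returned score, so lower scores mean better trials:
# best = fmin(fn=objective, space=sketch_space, algo=tpe.suggest, max_evals=50)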