def model_summary(model, what, dataset=None):
    """Print a summary of `model`.

    Args:
        model: the model to summarize (a torch.nn.Module for the
            'sparsity'/'compute'/'modules' summaries; anything printable
            for 'model').
        what (str): summary type — one of 'sparsity', 'compute',
            'model', 'modules'.
        dataset (str): required only for the 'compute' summary; selects
            the dummy-input shape ('imagenet', 'cifar10', 'cinic10').

    Raises:
        ValueError: if `what` is not a supported summary type.
    """
    if what == 'sparsity':
        t, total = distiller.weights_sparsity_tbl_summary(
            model, return_total_sparsity=True)
        print("\nParameters:\n", str(t))
        print('Total sparsity: {:0.2f}\n'.format(total))
        # Also log the per-layer sparsity to the Python logger and weights.csv.
        pylogger = PythonLogger(msglogger)
        csvlogger = CsvLogger('weights.csv')
        distiller.log_weights_sparsity(model, -1, loggers=[pylogger, csvlogger])
    elif what == 'compute':
        if dataset == 'imagenet':
            dummy_input = Variable(torch.randn(1, 3, 224, 224))
        # BUG FIX: was `dataset == 'cifar10' or 'cinic10'`, which is always
        # truthy ('cinic10' is a non-empty string), so the unsupported-dataset
        # branch below was unreachable.
        elif dataset in ('cifar10', 'cinic10'):
            dummy_input = Variable(torch.randn(1, 3, 32, 32))
        else:
            print("Unsupported dataset (%s) - aborting compute operation" % dataset)
            return
        df = model_performance_summary(model, dummy_input, 1)
        t = tabulate(df, headers='keys', tablefmt='psql', floatfmt=".5f")
        total_macs = df['MACs'].sum()
        total_weights = df['Weights volume'].sum()
        print(t)
        print("Total MACs : " + "{:,}".format(total_macs))
        print("Total Weights: " + "{:,}".format(total_weights))
    elif what == 'model':
        # print the simple form of the model
        print(model)
    elif what == 'modules':
        # Print the names of leaf modules.
        # Remember that in PyTorch not every node is a module (e.g. F.relu).
        # Also remember that parameterless modules, like nn.MaxPool2d, can be used multiple
        # times in the same model, but they will only appear once in the modules list.
        nodes = []
        for name, module in model.named_modules():
            # Only print leaf modules
            if len(module._modules) == 0:
                nodes.append([name, module.__class__.__name__])
        print(tabulate(nodes, headers=['Name', 'Type']))
    else:
        raise ValueError("%s is not a supported summary type" % what)
def train(self, dataset, k=20, n_epochs=20, save_dir='./models', save=True, model_name='GRU4REC'):
    """Run several training epochs of the GRU4REC model over `dataset`,
    logging per-epoch metrics and weight sparsity, and optionally writing
    a checkpoint of the GRU weights after every epoch.

    Args:
        dataset: the pandas dataframe of training sessions
        k (int): cutoff forwarded to run_epoch (top-k evaluation)
        n_epochs (int): the number of training epochs to run
        save_dir (str): the path to save the intermediate trained models
        save (bool): whether to checkpoint after each epoch
        model_name (str): name of the model, used in checkpoint filenames
    """
    print(f'Training {model_name}...')

    for ep in range(n_epochs):
        if self.compression_scheduler:
            self.compression_scheduler.on_epoch_begin(ep)

        metrics = self.run_epoch(dataset,
                                 k=k,
                                 training=True,
                                 compression_scheduler=self.compression_scheduler,
                                 epoch=ep)
        formatted = [f'{k}:{v:.3f}' for k, v in metrics.items()]
        print(f'epoch:{ep+1:2d}/{"/".join(formatted)}')

        # Report current weight sparsity of the GRU.
        tbl, total = distiller.weights_sparsity_tbl_summary(
            self.gru, return_total_sparsity=True)
        print("\nParameters:\n" + str(tbl))
        print('Total sparsity: {:0.2f}\n'.format(total))

        if self.compression_scheduler:
            self.compression_scheduler.on_epoch_end(ep)

        # Store the intermediate model.
        if save:
            save_dir = Path(save_dir)
            if not save_dir.exists():
                save_dir.mkdir()
            fname = f'{model_name}_{self.loss_type}_{self.optimizer_type}_{self.lr}_epoch{ep+1:d}'
            torch.save(self.gru.state_dict(), save_dir / fname)
def objective(space):
    """Hyperopt objective function.

    Trains and prunes the (global) `model` using the per-parameter sparsity
    levels supplied in `space`, and returns a score combining validation
    accuracy and total sparsity (lower is better).

    NOTE(review): this function relies on module-level globals
    (`model`, `count`, `global_min_score`, `args`, `PrunerEpoch`,
    `PrunerConstraint`, `Expected_Sparsity_Level_Low/High`) — confirm they
    are initialized before hyperopt calls this.
    """
    global model
    global count
    global global_min_score

    # Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    count += 1

    # Objective function: F(Acc, Lat) = (1 - Acc.) + (alpha * Sparsity)
    accuracy = 0
    alpha = 0.3  # Super-parameter: the importance of inference time
    latency = 0.0
    sparsity = 0.0

    # Training hyperparameter
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        print('resume mode: {}'.format(args.resume))

    print(global_min_score)

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    """
    distiller/distiller/config.py
    # Element-wise sparsity
    sparsity_levels = {net_param: sparsity_level}
    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    scheduler = distiller.CompressionScheduler(model)
    scheduler.add_policy(policy, epochs=[0, 2, 4])
    # Local search add multiple pruner for each layer
    """
    # Copy the hyperopt search point into the per-parameter sparsity dict.
    sparsity_levels = {}
    for key, value in space.items():
        sparsity_levels[key] = value
    #print(sparsity_levels)

    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)  # for SparsityLevelParameterPruner
    # pruner = distiller.pruning.SensitivityPruner(name='sensitivity', sensitivities=sparsity_levels)  # for SensitivityPruner
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    # Prune once, at epoch PrunerEpoch.
    compression_scheduler.add_policy(policy, epochs=[PrunerEpoch])
    # compression_scheduler.add_policy(policy, starting_epoch=0, ending_epoch=38, frequency=2)
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=50, frequency=1)

    """
    distiller/example/classifier_compression/compress_classifier.py
    For each epoch:
        compression_scheduler.on_epoch_begin(epoch)
        train()
        save_checkpoint()
        compression_scheduler.on_epoch_end(epoch)

    train():
        For each training step:
            compression_scheduler.on_minibatch_begin(epoch)
            output = model(input)
            loss = criterion(output, target)
            compression_scheduler.before_backward_pass(epoch)
            loss.backward()
            optimizer.step()
            compression_scheduler.on_minibatch_end(epoch)
    """
    local_min_score = 2.
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i,criterion, optimizer, compression_scheduler)
        val_accuracy = validate()  # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                 train_accuracy, False, 'hyperopt', './')
        print('Epoch: {}, train_acc: {:.4f}, val_acc: {:.4f}, sparsity: {:.4f}'.format(i, train_accuracy, val_accuracy, sparsity))

        score = (1-(val_accuracy/100.)) + (alpha * (1-sparsity/100.))  # objective funtion here
        # Track the best score across all trials (checkpointed as 'best').
        if(score < global_min_score):
            global_min_score = score
            apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                     train_accuracy, True, 'best', './')
        # Track the best score within this trial (the value returned to hyperopt).
        if(score < local_min_score):
            local_min_score = score
        # Early-exit the trial if sparsity lands outside the expected band
        # after the pruning epoch.
        if (PrunerConstraint == True and i >= PrunerEpoch and
                (sparsity < Expected_Sparsity_Level_Low or sparsity > Expected_Sparsity_Level_High)):
            break

    test_accuracy = test()  # Validate hyperparameter setting
    print('{} trials: score: {:.4f}, train_acc:{:.4f}, val_acc:{:.4f}, test_acc:{:.4f}, sparsity:{:.4f}'.format(count, local_min_score, train_accuracy, val_accuracy, test_accuracy, sparsity))

    return local_min_score
def log_weights_sparsity(self, model, epoch):
    """Log the per-parameter sparsity table and total sparsity of `model`
    to this object's Python logger. (`epoch` is accepted for interface
    compatibility but not used.)"""
    table, total_sparsity = distiller.weights_sparsity_tbl_summary(
        model, return_total_sparsity=True)
    log = self.pylogger.info
    log("\nParameters:\n" + str(table))
    log('Total sparsity: {:0.2f}\n'.format(total_sparsity))
def train(c):
    """Train `net` under a distiller quantization/pruning schedule.

    Builds the model and optimizer from config `c`, evaluates before and
    after applying the compression config (`c.compress`), then runs the
    training loop with the scheduler's per-minibatch/per-epoch callbacks,
    re-zeroing pruned weights (`npm` masks) after every optimizer step.
    Periodically (every `c.step_eval` steps) evaluates on the validation
    and test splits and records sparsity.
    """
    net = get_net(c)
    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)
    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')
    data_test = SequentialIterator(c, c.eval_batch, split='test')

    # Baseline evaluation before any quantization is applied.
    print('Before quantization')
    tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
    )
    step_result['sparsity'] = sparsity
    print(step_result)

    compression_scheduler = distiller.config.file_config(net, opt, c.compress)

    print('After initial quantization')
    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)
    tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
    step_result = pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
    step_result = step_result.append(
        pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
    )
    step_result['sparsity'] = sparsity
    print(step_result)

    # Collect (name, param, zero-mask) for weight/bias matrices so weights
    # already pruned to zero stay zero after each optimizer step.
    npm = []
    for name, param in net.named_parameters():
        if param.dim() in [2, 4] and any(t in name for t in ['weight', 'bias']):
            npm.append((name, param, param.abs() == 0))

    best_val_loss = np.inf
    # NOTE(review): takes .max() of past val_loss as "best" and never updates
    # it inside the loop — confirm this matches how record_step is consumed.
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().max()
    try:
        steps_per_epoch = c.step_eval
        while step < s.step_max:
            epoch = step // steps_per_epoch
            batch = step % steps_per_epoch
            if batch == 0:
                compression_scheduler.on_epoch_begin(epoch)
            compression_scheduler.on_minibatch_begin(epoch, batch, steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()
            t_s = time()
            inputs, labels = x[:-1], x[1:]
            loss, _, lam, theta = net(inputs, labels)
            compression_scheduler.before_backward_pass(epoch, batch, steps_per_epoch, loss, False)

            opt.zero_grad()
            if torch.isnan(loss):
                # BUG FIX: was `import q; q.d()` — an interactive debugger
                # drop-in left in production code. Fail loudly instead,
                # consistent with the sibling train() in this file.
                raise RuntimeError('Encountered nan loss during training')
            loss.backward()

            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))
            compression_scheduler.before_parameter_optimization(epoch, batch, steps_per_epoch, opt)
            opt.step()
            # Keep pruned weights at exactly zero.
            for name, param, mask in npm:
                param.data[mask] = 0
            compression_scheduler.on_minibatch_end(epoch, batch, steps_per_epoch)
            if (batch + 1) == steps_per_epoch:
                compression_scheduler.on_epoch_end(epoch)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model,
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            step_result['theta'] = from_torch(theta)
            step_result['lambda'] = from_torch(lam)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                # Periodic evaluation on validation and test splits.
                tbl, sparsity = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
                )
                step_result = step_result.append(
                    pd.Series(evaluate(c, data_test, net)).add_prefix('test_')
                )
                step_result['sparsity'] = sparsity
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception:
        # Log the traceback through the configured channel rather than crash.
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
def get_step_logs(self):
    """Return the scheduler model's total weight sparsity as a 0-1 fraction."""
    _, total_sparsity = distiller.weights_sparsity_tbl_summary(
        self.compression_scheduler.model, return_total_sparsity=True)
    return {"sparsity": total_sparsity / 100}
def main():
    """Train and/or evaluate a BERT-based NLU model, optionally under a
    distiller compression schedule (`args.distiller`) and/or iterative
    pruning (`args.prune`), with fp16/apex and distributed support.

    Reads configuration from the module-level `args`; writes checkpoints,
    TensorBoard summaries, and evaluation results to `args.output_dir`.

    NOTE(review): this body was reconstructed from a collapsed source line —
    confirm block boundaries (especially the end-of-epoch sparsity logging
    and the final save/evaluation placement) against the original file.
    """
    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The effective per-step batch is train_batch_size / accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    processor = NLUDataProcessor(args.data_dir, args.max_seq_length, tokenizer)
    label_list = processor.get_labels()
    num_labels = len(label_list)
    num_intents = len(processor.intents)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        # Optionally cap the number of examples per intent:
        # args.limit_data is a flat [intent, size, intent, size, ...] list.
        restrict = {}
        if args.limit_data:
            for i in range(len(args.limit_data) // 2):
                intent = args.limit_data[i * 2]
                size = args.limit_data[i * 2 + 1]
                assert intent in processor.intents
                restrict[intent] = tuple(int(x) for x in size.split('*'))
        train_examples = processor.get_train_examples(restrict)
        num_train_optimization_steps = \
            int(len(train_examples) / args.train_batch_size /
                args.gradient_accumulation_steps) * \
            args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = \
                num_train_optimization_steps // torch.distributed.get_world_size()

    if args.eval_on_test:
        eval_examples = processor.get_test_examples()
    else:
        eval_examples = processor.get_dev_examples()

    # Prepare model
    cache_dir = args.cache_dir or os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(
            args.local_rank))
    model = BertForNLU.from_pretrained(args.bert_model,
                                       cache_dir=cache_dir,
                                       num_labels=num_labels,
                                       num_intents=num_intents,
                                       from_tf=args.from_tf,
                                       layers=args.layers,
                                       prune=args.prune,
                                       dropout=args.dropout)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        from apex.parallel import DistributedDataParallel as DDP
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # distiller
    compression_scheduler = None
    if args.distiller is not None:
        import distiller
        distiller_pylogger = distiller.data_loggers.PythonLogger(logger)
        compression_scheduler = distiller.config.file_config(
            model, None, args.distiller)

    # Tensorboard
    swriter = SummaryWriter(args.output_dir)
    logger.info("Writing summary to %s", args.output_dir)

    # Prepare optimizer
    if args.do_train:
        if args.train_layers_from is not None:
            # Freeze all encoder layers below train_layers_from; the
            # classifier and pooler always remain trainable.
            param_optimizer = []
            for n, p in model.named_parameters():
                if "classifier" in n or "pooler" in n:
                    param_optimizer.append((n, p))
                elif any(
                        int(s) >= args.train_layers_from
                        for s in re.findall(r'layer\.(\d+)\.', n)):
                    param_optimizer.append((n, p))
                else:
                    print("Not considered trainable:", n)
                    p.requires_grad_(False)
        else:
            param_optimizer = list(model.named_parameters())
        # No weight decay for biases and LayerNorm parameters.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in
                param_optimizer if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        if args.fp16:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(
                optimizer_grouped_parameters,
                lr=args.learning_rate,
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps,
                schedule=None if args.const_lr else 'warmup_linear')

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_tokenids = torch.tensor(
            [f.input_tokenids for f in train_examples], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_examples],
                                      dtype=torch.long)
        all_input_segmentids = torch.tensor(
            [f.input_segmentids for f in train_examples], dtype=torch.long)
        all_input_labelids = torch.tensor(
            [f.input_labelids for f in train_examples], dtype=torch.long)
        train_data = TensorDataset(all_input_tokenids, all_input_mask,
                                   all_input_segmentids, all_input_labelids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        batches_per_epoch = int(len(train_examples) / args.train_batch_size)

        model.train()
        for epoch_id in trange(int(args.num_train_epochs), desc="Epoch"):
            if compression_scheduler:
                compression_scheduler.on_epoch_begin(epoch_id)
            nb_tr_examples = 0
            global_step_tr_loss = 0.0
            tqdm_bar = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(tqdm_bar):
                # One "minibatch" for the scheduler == one optimizer step,
                # i.e. gradient_accumulation_steps dataloader batches.
                if compression_scheduler and step % args.gradient_accumulation_steps == 0:
                    compression_scheduler.on_minibatch_begin(
                        epoch_id,
                        minibatch_id=step / args.gradient_accumulation_steps,
                        minibatches_per_epoch=batches_per_epoch)
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _, _ = model(input_ids,
                                   segment_ids,
                                   input_mask,
                                   labels=label_ids)

                if compression_scheduler:
                    # Before running the backward phase, we allow the scheduler to modify the loss
                    # (e.g. add regularization loss)
                    loss = compression_scheduler.before_backward_pass(
                        epoch_id,
                        minibatch_id=step / args.gradient_accumulation_steps,
                        minibatches_per_epoch=batches_per_epoch,
                        loss=loss,
                        return_loss_components=False)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                global_step_tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                # Take an optimizer step once per accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    tqdm_bar.set_postfix(train_loss=global_step_tr_loss)
                    swriter.add_scalar('train_loss',
                                       global_step_tr_loss,
                                       global_step=global_step)
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = \
                            args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    if args.fp16:
                        # Frozen params may have no grad; give them zeros so
                        # the fp16 optimizer does not choke.
                        for _, p in param_optimizer:
                            if p.grad is None:
                                p.grad = torch.zeros(p.size(),
                                                     dtype=p.dtype,
                                                     device=p.device)
                    optimizer.step()
                    global_step += 1
                    global_step_tr_loss = 0.0
                    if compression_scheduler:
                        compression_scheduler.on_minibatch_end(
                            epoch_id,
                            minibatch_id=step / args.gradient_accumulation_steps,
                            minibatches_per_epoch=batches_per_epoch)
                    optimizer.zero_grad()

                    if not args.fp16 and optimizer.get_lr():
                        # get_lr returns a list, however all the elements of the list should be the
                        # same
                        swriter.add_scalar('learning_rate',
                                           np.random.choice(
                                               optimizer.get_lr()),
                                           global_step=global_step)
                    if global_step % args.eval_steps == 0:
                        perform_evaluation(eval_examples=eval_examples,
                                           model=model,
                                           processor=processor,
                                           swriter=swriter,
                                           device=device,
                                           global_step=global_step)
                        model.train()
                    if global_step % args.save_checkpoints_steps == 0:
                        save_model(model=model,
                                   tokenizer=tokenizer,
                                   global_step=global_step)
                    if args.prune and global_step % args.eval_steps == 1:
                        prune_model(model=model,
                                    swriter=swriter,
                                    global_step=global_step,
                                    count=args.prune_count)

            # End of epoch: log sparsity and notify the scheduler.
            if compression_scheduler:
                sparsity_table, total_sparsity = \
                    distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
                logger.info("\nParameters:\n" + str(sparsity_table))
                logger.info('Total sparsity: {:0.2f}\n'.format(total_sparsity))
                swriter.add_scalar('sparsity',
                                   total_sparsity,
                                   global_step=global_step)
                compression_scheduler.on_epoch_end(epoch_id)

        save_model(model=model,
                   tokenizer=tokenizer,
                   global_step=global_step,
                   tag='final')

    # Final evaluation (runs for do_eval-only invocations as well).
    perform_evaluation(eval_examples=eval_examples,
                       model=model,
                       processor=processor,
                       swriter=swriter,
                       device=device,
                       global_step=global_step)
    swriter.close()
def train(c, net, compression_scheduler=None):
    """Train language model `net` under an optional distiller compression
    schedule, logging sparsity and evaluating on the validation split every
    `c.step_eval` steps.

    Args:
        c: config/context object (provides hyperparameters, logging,
           init_model, on_* callbacks).
        net: the model to train.
        compression_scheduler: optional distiller CompressionScheduler; when
           None, training runs without pruning/quantization callbacks.
    """
    import distiller.apputils as apputils
    from distiller.data_loggers import TensorBoardLogger, PythonLogger
    msglogger = apputils.config_pylogger('logging.conf', None)
    tflogger = TensorBoardLogger(msglogger.logdir)
    tflogger.log_gradients = True
    pylogger = PythonLogger(msglogger)

    c.setdefault(hebbian=False)

    emb_params = count_params(net.embed) + count_params(net.loss.projections) + count_params(net.loss.clusters)
    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)
    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')

    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    c.log('Embedding has %s parameters' % emb_params)

    # Steps per epoch: explicit config override, else derived from the
    # training-corpus size.
    if c.get("steps_per_epoch"):
        steps_per_epoch = c.steps_per_epoch
    else:
        steps_per_epoch = len(data_tr.tokens) // data_tr.bs // c.train_chunk
    print("#### steps per epoch %d ####" % steps_per_epoch)

    if c.hebbian:
        # Per-cluster token counters used by the hebbian update below.
        counters = [torch.ones(end - start, dtype=torch.long, device=c.device)
                    for start, end in zip([0] + c.cutoffs, c.cutoffs + [c.n_vocab])]
        temp_counters = [torch.zeros_like(x) for x in counters]

    best_val_loss = np.inf
    # NOTE(review): uses .max() of past val_loss as the baseline and never
    # updates it inside the loop — confirm this matches how record_step is
    # consumed downstream.
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().max()
    try:
        while step < s.step_max:
            batch = step % steps_per_epoch
            epoch = step // steps_per_epoch
            if step % steps_per_epoch == 0:
                c.log("====> batch=%d, epoch=%d, step=%d" % (batch, epoch, step))
                if compression_scheduler:
                    compression_scheduler.on_epoch_begin(epoch)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()
            t_s = time()
            inputs, labels = x[:-1], x[1:]
            preds = net(inputs, labels)
            loss = preds['loss']
            if compression_scheduler:
                # Allow the scheduler to adjust the loss (e.g. add a
                # regularization term); the adjusted value is discarded here.
                _ = compression_scheduler.before_backward_pass(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch, loss=loss, return_loss_components=False)
            opt.zero_grad()
            if torch.isnan(loss):
                raise RuntimeError('Encountered nan loss during training')
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))
            opt.step()

            if c.hebbian:
                hebbian_weight_update(c, net, preds['hiddens'], counters, temp_counters)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            if c.use_cache:
                step_result['theta'] = preds['theta']
                step_result['lambda'] = preds['lambda'].item()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, minibatch_id=batch, minibatches_per_epoch=steps_per_epoch)

            if step % steps_per_epoch == 0:
                if compression_scheduler:
                    compression_scheduler.on_epoch_end(epoch)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                # Periodic sparsity logging and validation.
                distiller.log_weights_sparsity(net, epoch, loggers=[tflogger, pylogger])
                t, total = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
                c.log("total sparsity: %.3lf" % total)

                step_result = step_result.append(
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_')
                )
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception as e:
        # Report the traceback through the configured logging channel.
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
def train(args):
    """Train a Transformer German→English translation model (IWSLT or
    Multi30k), optionally under a distiller pruning schedule
    (`args.compress`), saving the best-BLEU and latest checkpoints.
    """
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD, lower=args.lower)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD, eos_token=EOS_WORD,
                     pad_token=BLANK_WORD, lower=args.lower)

    # Load IWSLT Data ---> German to English Translation
    if args.dataset == 'IWSLT':
        train, val, test = datasets.IWSLT.splits(
            exts=('.de', '.en'), fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= args.max_length and
            len(vars(x)['trg']) <= args.max_length)
    else:
        train, val, test = datasets.Multi30k.splits(
            exts=('.de', '.en'), fields=(SRC, TGT),
            filter_pred=lambda x: len(vars(x)['src']) <= args.max_length and
            len(vars(x)['trg']) <= args.max_length)

    # Frequency of words in the vocabulary
    SRC.build_vocab(train.src, min_freq=args.min_freq)
    TGT.build_vocab(train.trg, min_freq=args.min_freq)
    print("Size of source vocabulary:", len(SRC.vocab))
    print("Size of target vocabulary:", len(TGT.vocab))

    pad_idx = TGT.vocab.stoi[BLANK_WORD]
    model = make_model(len(SRC.vocab), len(TGT.vocab), n=args.num_blocks,
                       d_model=args.hidden_dim, d_ff=args.ff_dim,
                       h=args.num_heads, dropout=args.dropout)
    print("Model made with n:", args.num_blocks, "hidden_dim:", args.hidden_dim,
          "feed forward dim:", args.ff_dim, "heads:", args.num_heads,
          "dropout:", args.dropout)

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print("Number of parameters: ", params)

    if args.load_model:
        print("Loading model from [%s]" % args.load_model)
        model.load_state_dict(torch.load(args.load_model))

    # UNCOMMENT WHEN RUNNING ON RESEARCH MACHINES - run on GPU
    # model.cuda()

    # Used by original authors, hurts perplexity but improves BLEU score
    criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    # UNCOMMENT WHEN RUNNING ON RESEARCH MACHINES - run on GPU
    # criterion.cuda()

    train_iter = MyIterator(train, batch_size=args.batch_size, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(val, batch_size=args.batch_size, device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False, sort=False)
    model_par = nn.DataParallel(model, device_ids=devices)

    # model_opt = NoamOpt(model.src_embed[0].d_model, 1, 2000,
    #                     torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    # Use standard optimizer -- As used in the paper
    model_opt = get_std_opt(model)

    # PRUNING CODE
    if args.summary:
        df = distiller.weights_sparsity_tbl_summary(model, False)
        print(df)
        exit(0)

    msglogger = apputils.config_pylogger('logging.conf', None)
    tflogger = TensorBoardLogger(msglogger.logdir)
    tflogger.log_gradients = True
    pylogger = PythonLogger(msglogger)

    # BUG FIX: compression_scheduler was only assigned inside the
    # `if args.compress:` branch but is referenced unconditionally in the
    # epoch loop below, raising a NameError when --compress is not given.
    compression_scheduler = None
    if args.compress:
        compression_scheduler = distiller.config.file_config(
            model_par.module, None, args.compress)
    print(model_par.module)

    best_bleu = 0
    best_epoch = 0
    steps_per_epoch = math.ceil(len(train_iter.data()) / 60)
    for epoch in range(args.epoch):
        print("=" * 80)
        print("Epoch ", epoch + 1)
        print("=" * 80)
        print("Training...")
        model_par.train()
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)
        # IF PRUNING
        run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par,
                  MultiGPULossCompute(model.generator, criterion,
                                      devices=devices, opt=model_opt),
                  args, epoch, steps_per_epoch, compression_scheduler,
                  SRC, TGT, valid_iter, is_valid=False)

        print("Validation...")
        model_par.eval()
        # IF PRUNING
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par,
                         MultiGPULossCompute(model.generator, criterion,
                                             devices=devices, opt=None),
                         args, epoch, steps_per_epoch, compression_scheduler,
                         SRC, TGT, valid_iter, is_valid=True)

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch)

        print('Validation loss:', loss)
        print('Validation perplexity: ', np.exp(loss))
        bleu_score = run_validation_bleu_score(model, SRC, TGT, valid_iter)

        # Keep the checkpoint with the best validation BLEU.
        if best_bleu < bleu_score:
            best_bleu = bleu_score
            model_file = args.save_to + args.exp_name + 'validation.bin'
            print('Saving model without optimizer [%s]' % model_file)
            torch.save(model_par.module.state_dict(), model_file)
            best_epoch = epoch

        # Always save the latest checkpoint as well.
        model_file = args.save_to + args.exp_name + 'latest.bin'
        print('Saving latest model without optimizer [%s]' % model_file)
        torch.save(model_par.module.state_dict(), model_file)

    print('The best epoch was:', best_epoch)
def objective(space):
    """Hyperopt objective function (accuracy/sparsity trade-off variant).

    Trains and prunes the (global) `model` using the per-parameter sparsity
    levels in `space`, tracks the best trial seen so far in the (global)
    `best_dict`, and returns a score where lower is better.

    NOTE(review): relies on module-level globals (`model`, `count`,
    `best_dict`, `args`, `test_loader`) — confirm they are initialized
    before hyperopt calls this.
    """
    global model
    global count
    global best_dict

    # Explore new model
    model = create_model(False, args.dataset, args.arch, device_ids=args.gpus)
    if args.resume:
        model, _, _ = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
    count += 1
    print('{} trial starting...'.format(count))

    # Objective function: F(Acc, Lat) = (1 - Acc.) + (alpha * Sparsity)
    accuracy = 0
    #alpha = 0.2 # Super-parameter: the importance of inference time
    alpha = 1.0  # Super-parameter: the importance of inference time
    sparsity = 0.0

    # Training hyperparameter
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    """
    distiller/distiller/config.py
    # Element-wise sparsity
    sparsity_levels = {net_param: sparsity_level}
    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    scheduler = distiller.CompressionScheduler(model)
    scheduler.add_policy(policy, epochs=[0, 2, 4])
    # Local search add multiple pruner for each layer
    """
    # Copy the hyperopt search point into the per-parameter sparsity dict.
    sparsity_levels = {}
    for key, value in space.items():
        sparsity_levels[key] = value

    pruner = distiller.pruning.SparsityLevelParameterPruner(name='sensitivity', levels=sparsity_levels)
    policy = distiller.PruningPolicy(pruner, pruner_args=None)
    lrpolicy = distiller.LRPolicy(torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1))
    compression_scheduler = distiller.CompressionScheduler(model)
    #compression_scheduler.add_policy(policy, epochs=[90])
    # Prune once, at the first epoch.
    compression_scheduler.add_policy(policy, epochs=[0])
    compression_scheduler.add_policy(lrpolicy, starting_epoch=0, ending_epoch=90, frequency=1)

    """
    distiller/example/classifier_compression/compress_classifier.py
    For each epoch:
        compression_scheduler.on_epoch_begin(epoch)
        train()
        save_checkpoint()
        compression_scheduler.on_epoch_end(epoch)

    train():
        For each training step:
            compression_scheduler.on_minibatch_begin(epoch)
            output = model(input)
            loss = criterion(output, target)
            compression_scheduler.before_backward_pass(epoch)
            loss.backward()
            optimizer.step()
            compression_scheduler.on_minibatch_end(epoch)
    """
    for i in range(args.epochs):
        compression_scheduler.on_epoch_begin(i)
        train_accuracy = train(i,criterion, optimizer, compression_scheduler)
        val_accuracy = validate()  # Validate hyperparameter setting
        t, sparsity = distiller.weights_sparsity_tbl_summary(model, return_total_sparsity=True)
        compression_scheduler.on_epoch_end(i, optimizer)
        apputils.save_checkpoint(i, args.arch, model, optimizer, compression_scheduler,
                                 train_accuracy, False, 'hyperopt', './')
        print('{} epochs => train acc:{:.2f}%, val acc:{:.2f}%'.format(i, train_accuracy, val_accuracy))

    test_accuracy = validate(test_loader)  # Validate hyperparameter setting
    #score = (1-(val_accuracy/100.)) + (alpha * (1-sparsity/100.)) # objective funtion here
    # objective funtion here
    # accuracy: 98~90%, sparsity: 80%~50%
    score = -((val_accuracy/100.)**2-0.9**2 + alpha * ((sparsity/100.)**2-0.5**2))
    print('{} trials: score: {:.2f}\ttrain acc:{:.2f}%\tval acc:{:.2f}%\ttest acc:{:.2f}%\tsparsity:{:.2f}%'.format(count, score, train_accuracy, val_accuracy, test_accuracy, sparsity))

    # Track the best trial seen so far (including a deep copy of the model).
    if score < best_dict['score']:
        best_dict['trial'] = count
        best_dict['score'] = score
        best_dict['tr_acc'] = train_accuracy
        best_dict['v_acc'] = val_accuracy
        best_dict['te_acc'] = test_accuracy
        best_dict['sparsity'] = sparsity
        best_dict['model_best'] = copy.deepcopy(model)

    return score