def build_model(corpus, model_name, emsize, nhid, nlayers, dropout, dropouth,
                dropouti, dropoute, wdrop, lr, tied, resume, cuda):
    criterion = None
    ntokens = len(corpus.dictionary)
    model = model_module.RNNModel(model_name, ntokens, emsize, nhid, nlayers,
                                  dropout, dropouth, dropouti, dropoute,
                                  wdrop, tied)
    ###
    if resume:
        print('Resuming model ...')
        model, criterion, optimizer = model_load(resume)
        optimizer.param_groups[0]['lr'] = lr
        model.dropouti, model.dropouth, model.dropout, model.dropoute = \
            dropouti, dropouth, dropout, dropoute
        if wdrop:
            from weight_drop import WeightDrop
            for rnn in model.rnns:
                if type(rnn) == WeightDrop:
                    rnn.dropout = wdrop
                elif rnn.zoneout > 0:
                    rnn.zoneout = wdrop
    ###
    if not criterion:
        splits = []
        if ntokens > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif ntokens > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        print('Using', splits)
        criterion = SplitCrossEntropyLoss(emsize, splits=splits, verbose=False)
    ###
    if cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    ###
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                       else x.size()[0] for x in params if x.size())
    print('Model total parameters:', total_params)
    return model, criterion, None
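
# A hedged sketch of how the criterion returned above is consumed:
# SplitCrossEntropyLoss takes the decoder's weight and bias plus the raw
# RNN outputs instead of precomputed logits, so the full softmax matrix
# multiply only happens for the head bucket. All sizes below are
# illustrative; only SplitCrossEntropyLoss itself comes from this codebase.
import torch
from splitcross import SplitCrossEntropyLoss

emsize, ntokens, nexamples = 8, 100, 4
decoder = torch.nn.Linear(emsize, ntokens)         # stand-in tied decoder
criterion = SplitCrossEntropyLoss(emsize, splits=[50], verbose=False)
hiddens = torch.randn(nexamples, emsize)           # flattened RNN output
targets = torch.randint(0, ntokens, (nexamples,))  # next-token ids
loss = criterion(decoder.weight, decoder.bias, hiddens, targets)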
def build_model(args, corpus):
    criterion = None
    ntokens = len(corpus.dictionary)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers,
                     args.dropout, args.dropouth, args.dropouti, args.dropoute,
                     args.wdrop, args.tied)
    ###
    if args.resume:
        logging.info('Resuming model ...')
        model, criterion, optimizer = model_load(args.resume_path)
        optimizer.param_groups[0]['lr'] = args.lr
        model.dropouti, model.dropouth, model.dropout, model.dropoute = \
            args.dropouti, args.dropouth, args.dropout, args.dropoute
        if args.wdrop:
            from weight_drop import WeightDrop
            for rnn in model.rnns:
                if type(rnn) == WeightDrop:
                    rnn.dropout = args.wdrop
                elif rnn.zoneout > 0:
                    rnn.zoneout = args.wdrop
    ###
    if not criterion:
        splits = []
        if ntokens > 500000:
            # One Billion
            # This produces fairly even matrix mults for the buckets:
            # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
            splits = [4200, 35000, 180000]
        elif ntokens > 75000:
            # WikiText-103
            splits = [2800, 20000, 76000]
        logging.info(f'Using {splits}')
        criterion = SplitCrossEntropyLoss(args.emsize, splits=splits,
                                          verbose=False)
    ###
    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                       else x.size()[0] for x in params if x.size())
    logging.info(f'Args: {args}')
    logging.info(f'Model total parameters: {total_params}')
    if args.cuda:
        model = model.cuda()
        criterion = criterion.cuda()
    return model, criterion
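
# The size()-based parameter count above silently miscounts any tensor with
# more than two dimensions. A more general equivalent, and the shape of the
# count_parameters() helper some later snippets call (a sketch; the original
# repositories may define it differently):
def count_parameters(module):
    return sum(p.nelement() for p in module.parameters())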
def build_criterion(self):
    splits = []
    # NOTE: the thresholds below describe vocabulary sizes, so this likely
    # means to compare the token count rather than self.ninp (the embedding
    # size), which will essentially never exceed 75,000.
    if self.ninp > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif self.ninp > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    logging.info('Using splits: {}'.format(' '.join(map(str, splits))))
    self.criterion = SplitCrossEntropyLoss(self.ninp, splits=splits,
                                           verbose=False)
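
# What the split points mean concretely: the head bucket is always scored,
# while each tail bucket is only scored when a target falls inside it. A
# self-contained illustration for a WikiText-103-sized vocabulary (about
# 267,735 types; the exact count depends on preprocessing):
ntokens = 267735
splits = [2800, 20000, 76000]
bounds = [0] + splits + [ntokens]
for lo, hi in zip(bounds, bounds[1:]):
    print('bucket [{:6d}, {:6d}): {} tokens'.format(lo, hi, hi - lo))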
# for rnn in model.rnns:
#     if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
#     elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()

if False:  # or args.jit:
    print('Jitting ...')
    model.eval()
    model.lmr = torch.jit.trace(
        model.lmr,
        (torch.rand([args.bptt, args.batch_size, args.emsize]).cuda(),
         torch.rand([1, args.batch_size, args.emsize]).cuda()))
    #model = torch.jit.trace_module(model, torch.zeros((args.bptt, args.batch_size), dtype=torch.long))
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                   else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)
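
# torch.jit.trace, as used in the disabled block above, records the ops a
# module performs on example inputs. A minimal, self-contained sketch
# (TinyBlock is illustrative and unrelated to the SHA-RNN 'lmr' submodule):
import torch

class TinyBlock(torch.nn.Module):
    def forward(self, x, h):
        return torch.tanh(x + h)

traced = torch.jit.trace(TinyBlock(),
                         (torch.rand(70, 80, 400), torch.rand(1, 80, 400)))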
criterion = None
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)


def train():
    # Turn on training mode which enables dropout.
    if args.model == 'QRNN':
        model.reset()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    batch, i = 0, 0
    while i < train_data.size(0) - 1 - 1:
        bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
        # Prevent excessively small or negative sequence lengths
        seq_len = max(5, int(np.random.normal(bptt, 5)))
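
# The randomized BPTT window above (95% full length, 5% half length, then
# Gaussian jitter) shifts batch boundaries between epochs. A self-contained
# look at the sequence lengths it actually produces:
import numpy as np

np.random.seed(0)
bptt, samples = 70, []
for _ in range(100000):
    base = bptt if np.random.random() < 0.95 else bptt / 2.
    samples.append(max(5, int(np.random.normal(base, 5))))
print('min', min(samples), 'mean', round(float(np.mean(samples)), 1),
      'max', max(samples))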
#         rnn.zoneout = args.whhdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    elif ntokens > 20000:
        splits = [5000, 20000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.nhid, splits=splits,
                                      tied_weights=args.tied, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
print('{:-^60}'.format(''))
print('Args:', args)
print('{:-^60}'.format(''))
print('Model parameters:', count_parameters(model))
print('Criterion parameters:', count_parameters(criterion))

###############################################################################
# Training code
if args.wdrop:
    for rnn in model.rnn.cells:
        rnn.hh.dropout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    tools.print_log(args.save, splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
    if args.mode == 'GPT':
        criterion_gpt = CrossEntropyLoss(ignore_index=-1)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(filter(lambda x: x.requires_grad, model.parameters())) + \
    list(criterion.parameters())
total_params = sum(p.data.nelement() for p in params if p.requires_grad)
if args.mode == 'GPT':
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'ln_']  # Add 'ln_1' to test if it's better
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and 'transformer' in n],
for rnn in model.rnns:
    if type(rnn) == WeightDrop:
        rnn.dropout = args.wdrop
    elif rnn.zoneout > 0:
        rnn.zoneout = args.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                   else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)

###############################################################################
# Training code
###############################################################################


def evaluate(data_source, batch_size=10):
def main():
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='data/penn/',
                        help='location of the data corpus')
    parser.add_argument('--model', type=str, default='LSTM',
                        help='type of recurrent net (LSTM, QRNN, GRU)')
    parser.add_argument('--emsize', type=int, default=400,
                        help='size of word embeddings')
    parser.add_argument('--nhid', type=int, default=1150,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers', type=int, default=3,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=30,
                        help='initial learning rate')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=8000,
                        help='upper epoch limit')
    parser.add_argument('--max-steps-per-epoch', type=int, default=-1,
                        help='upper limit on steps per epoch')
    parser.add_argument('--batch-size', type=int, default=80, metavar='N',
                        help='batch size')
    parser.add_argument('--bptt', type=int, default=70,
                        help='sequence length')
    parser.add_argument('--warmup', type=int, default=4000,
                        help='warmup for learning rate')
    parser.add_argument('--cooldown', type=int, default=None,
                        help='cooldown for learning rate')
    parser.add_argument('--accumulate', type=int, default=1,
                        help='number of batches to accumulate before gradient update')
    parser.add_argument('--dropout', type=float, default=0.4,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--dropouth', type=float, default=0.3,
                        help='dropout for rnn layers (0 = no dropout)')
    parser.add_argument('--dropouti', type=float, default=0.65,
                        help='dropout for input embedding layers (0 = no dropout)')
    parser.add_argument('--dropoute', type=float, default=0.1,
                        help='dropout to remove words from embedding layer (0 = no dropout)')
    parser.add_argument('--wdrop', type=float, default=0.0,
                        help='amount of weight dropout to apply to the RNN hidden to hidden matrix')
    parser.add_argument('--seed', type=int, default=1111,
                        help='random seed')
    parser.add_argument('--nonmono', type=int, default=5,
                        help='non-monotonic interval before switching to ASGD')
    parser.add_argument('--cuda', action='store_false',
                        help='use CUDA')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='report interval')
    randomhash = ''.join(str(time.time()).split('.'))
    parser.add_argument('--save', type=str, default=randomhash + '.pt',
                        help='path to save the final model')
    parser.add_argument('--alpha', type=float, default=2,
                        help='alpha L2 regularization on RNN activation (alpha = 0 means no regularization)')
    parser.add_argument('--beta', type=float, default=1,
                        help='beta slowness regularization applied on RNN activation (beta = 0 means no regularization)')
    parser.add_argument('--wdecay', type=float, default=1.2e-6,
                        help='weight decay applied to all weights')
    parser.add_argument('--resume', type=str, default='',
                        help='path of model to resume')
    parser.add_argument('--optimizer', type=str, default='sgd',
                        help='optimizer to use (sgd, adam)')
    parser.add_argument('--when', nargs="+", type=int, default=[-1],
                        help='When (which epochs) to divide the learning rate by 10 - accepts multiple')
    parser.add_argument('--discard-highest-losses', type=float, default=0.0,
                        help='discard highest percentage of prediction losses before executing an optimizer step')
    parser.add_argument('--enlarge-model-every-n-epochs', type=int, default=-1,
                        help='enlarge model (hidden and embedding dims) after every n epochs')
    args = parser.parse_args()
    args.tied = True
    # Set the random seed manually for reproducibility.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    ###########################################################################
    # Load data
    ###########################################################################

    import os
    import hashlib
    fn = 'corpus.{}.data'.format(hashlib.md5(args.data.encode()).hexdigest())
    if os.path.exists(fn):
        print('Loading cached dataset...')
        corpus = torch.load(fn)
    else:
        print('Producing dataset...')
        corpus = data.Corpus(args.data)
        torch.save(corpus, fn)

    eval_batch_size = min(100, args.batch_size)
    print('Eval batch size of', eval_batch_size)
    test_batch_size = 8
    train_data = batchify(corpus.train, args.batch_size, args)
    val_data = batchify(corpus.valid, eval_batch_size, args)
    test_data = batchify(corpus.test, test_batch_size, args)

    ###########################################################################
    # Build the model
    ###########################################################################

    from splitcross import SplitCrossEntropyLoss
    criterion = None

    ntokens = len(corpus.dictionary)
    print('Total number of tokens:', ntokens)
    #model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.BoomRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    if args.enlarge_model_every_n_epochs <= 0:
        model = SHARNN(args.model, ntokens, args.emsize, args.nhid,
                       args.nlayers, args.dropout, args.dropouth,
                       args.dropouti, args.dropoute, args.wdrop, args.tied)
    else:
        model = None
    #model = model.AttnRNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.RecAttn(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.LNRNN(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    #model = model.LNRR(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.dropouth, args.dropouti, args.dropoute, args.wdrop, args.tied)
    ###
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)

    if model is not None:
        if args.resume and args.epochs > 0:
            print('Resuming model ...')
            criterion = model_load(args.resume, model)
            #optimizer.param_groups[0]['lr'] = args.lr
            model.dropouti, model.dropouth, model.dropout, model.dropoute = \
                args.dropouti, args.dropouth, args.dropout, args.dropoute
            #if args.wdrop:
            #    from weight_drop import WeightDrop
            #    for rnn in model.rnns:
            #        if type(rnn) == WeightDrop: rnn.dropout = args.wdrop
            #        elif rnn.zoneout > 0: rnn.zoneout = args.wdrop
        ###
        if not criterion:
            criterion = SplitCrossEntropyLoss(args.emsize, splits=splits,
                                              verbose=False)
        ###
        if args.cuda:
            model = model.cuda()
            criterion = criterion.cuda()

        if False:  # or args.jit:
            print('Jitting ...')
            model.eval()
            model.lmr = torch.jit.trace(
                model.lmr,
                (torch.rand([args.bptt, args.batch_size, args.emsize]).cuda(),
                 torch.rand([1, args.batch_size, args.emsize]).cuda()))
            #model = torch.jit.trace_module(model, torch.zeros((args.bptt, args.batch_size), dtype=torch.long))
    ###

    ###########################################################################
    # Training code
    ###########################################################################

    # Loop over epochs.
    #lr = args.lr
    best_val_loss = []
    stored_loss = 100000000

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        if model is not None:
            model, optimizer, params = init_optimizer(args, model, criterion)
        for epoch in range(1, args.epochs + 1):
            epoch_start_time = time.time()
            discard_highest_losses = args.discard_highest_losses * \
                (args.epochs - epoch + 1) / args.epochs
            if args.enlarge_model_every_n_epochs > 0 and \
                    (epoch - 1) % args.enlarge_model_every_n_epochs == 0:
                prev_model = model
                current_factor = (args.enlarge_model_every_n_epochs + epoch - 1) / \
                    (args.enlarge_model_every_n_epochs + args.epochs)
                emsize = int(args.emsize * current_factor)
                nhid = int(args.nhid * current_factor)
                print(f'enlarge model: emsize={emsize}, nhid={nhid} '
                      f'(discard_highest_losses={discard_highest_losses})')
                model = SHARNN(args.model, ntokens, emsize, nhid, args.nlayers,
                               args.dropout, args.dropouth, args.dropouti,
                               args.dropoute, args.wdrop, args.tied)
                criterion = SplitCrossEntropyLoss(emsize, splits=splits,
                                                  verbose=False)
                if args.cuda:
                    model = model.cuda()
                    criterion = criterion.cuda()
                if prev_model is not None:
                    model.load_from_smaller_and_freeze(prev_model)
                model, optimizer, params = init_optimizer(args, model, criterion)
            train(model, optimizer, criterion, args, train_data, params,
                  epoch=epoch - 1, max_steps=args.max_steps_per_epoch,
                  discard_highest_losses=discard_highest_losses)
            if 't0' in optimizer.param_groups[0]:
                tmp = {}
                for prm in model.parameters():
                    tmp[prm] = prm.data.clone()
                    prm.data = optimizer.state[prm]['ax'].clone()

                val_loss2 = evaluate(model, criterion, args, val_data)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                          epoch, (time.time() - epoch_start_time), val_loss2,
                          math.exp(val_loss2), val_loss2 / math.log(2)))
                print('-' * 89)

                if val_loss2 < stored_loss:
                    model_save(args.save, model, criterion)
                    print('Saving Averaged!')
                    stored_loss = val_loss2

                for prm in model.parameters():
                    prm.data = tmp[prm].clone()
            else:
                val_loss = evaluate(model, criterion, args, val_data,
                                    eval_batch_size)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                          epoch, (time.time() - epoch_start_time), val_loss,
                          math.exp(val_loss), val_loss / math.log(2)))
                print('-' * 89)

                if val_loss < stored_loss:
                    model_save(args.save, model, criterion)
                    print('Saving model (new best validation)')
                    stored_loss = val_loss

                if args.optimizer == 'sgd' and \
                        't0' not in optimizer.param_groups[0] and \
                        (len(best_val_loss) > args.nonmono and
                         val_loss > min(best_val_loss[:-args.nonmono])):
                    print('Switching to ASGD')
                    optimizer = torch.optim.ASGD(model.parameters(), lr=args.lr,
                                                 t0=0, lambd=0.,
                                                 weight_decay=args.wdecay)

                if epoch in args.when:
                    print('Saving model before learning rate decreased')
                    model_save('{}.e{}'.format(args.save, epoch), model, criterion)
                    print('Dividing learning rate by 10')
                    optimizer.param_groups[0]['lr'] /= 10.

                best_val_loss.append(val_loss)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    criterion = model_load(args.save, model)

    params = list(model.parameters()) + list(criterion.parameters())
    total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                       else x.size()[0] for x in params if x.size())
    print('Model total parameters:', total_params)

    # Run on test data.
    test_loss = evaluate(model, criterion, args, test_data, test_batch_size)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f} | '
          'test bpc {:8.3f}'.format(test_loss, math.exp(test_loss),
                                    test_loss / math.log(2)))
    print('=' * 89)
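
# main() above depends on batchify() to fold the corpus into batch-size
# columns. A sketch matching the usual AWD-LSTM definition (hedged; the
# actual helper lives in a separate module):
def batchify(data, bsz, args):
    # Drop tokens that will not divide evenly into bsz columns.
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    # Each column becomes one contiguous stream of the corpus.
    data = data.view(bsz, -1).t().contiguous()
    if args.cuda:
        data = data.cuda()
    return data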
if args.resume:
    print('Resuming model ...')
    model_load(args.resume)
    optimizer.param_groups[0]['lr'] = args.lr
    model.dropouti, model.dropouth, model.dropout, model.dropoute = \
        args.dropouti, args.dropouth, args.dropout, args.dropoute
    if args.wdrop:
        from weight_drop import WeightDrop
        for rnn in model.rnns:
            if type(rnn) == WeightDrop:
                rnn.dropout = args.wdrop
            elif rnn.zoneout > 0:
                rnn.zoneout = args.wdrop
###
if not criterion:
    if args.sampling_loss:
        criterion = SamplingLoss(args.emsize, k=args.k, obj=args.obj, Z=args.Z,
                                 noise=unigram, q=args.q, b=args.b, g=args.g)
    else:
        criterion = SplitCrossEntropyLoss(args.emsize, q=args.q, b=args.b,
                                          g=args.g)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params = list(model.parameters()) + list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                   else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)

###############################################################################
# Training code
###############################################################################
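
# WeightDrop (DropConnect on the recurrent weight matrix), as re-applied on
# resume above, shown standalone. This follows the usage documented in the
# AWD-LSTM weight_drop module; the dimensions are illustrative.
import torch
from weight_drop import WeightDrop

wd_lstm = WeightDrop(torch.nn.LSTM(400, 1150), ['weight_hh_l0'], dropout=0.5)
x = torch.randn(70, 80, 400)  # (seq_len, batch, emsize)
out, hidden = wd_lstm(x)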
                    default=False,
                    help='Calculate model performance on the test set')
parser.add_argument('--significance_testing', action='store_true',
                    default=False,
                    help='Performance significance testing on baseline and MTS model')
parser.add_argument('--unit_ablation', action='store_true', default=False,
                    help='Evaluate model performance with unit ablation for layer 2')
args = parser.parse_args()

criterion = SplitCrossEntropyLoss(400, splits=[], verbose=False)
#entropy_calc = Entropy_calculation(400, splits=[], verbose=False)

seed = 141
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)


def model_load(fn):
    #global model
    with open(fn, 'rb') as f:
        model, criterion_m, optim = torch.load(f)  #, map_location=torch.device('cpu'))
    return model
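
# If the checkpoint was written on a GPU machine but must be read on CPU,
# the commented-out map_location argument above is the standard fix; a
# hedged variant of the same loader:
def model_load_cpu(fn):
    with open(fn, 'rb') as f:
        model, criterion_m, optim = torch.load(
            f, map_location=torch.device('cpu'))
    return model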
    elif rnn.zoneout > 0:
        rnn.zoneout = model.wdrop
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
###
params, parser_params = [], []
for n, p in model.named_parameters():
    if "_att_" in n:
        parser_params.append(p)
    else:
        params.append(p)
for n, p in criterion.named_parameters():
    if "_att_" in n:
                                  verbose=False)

if args.kd:
    model_t = load_teacher(main_args.logs_path, args, ntokens)
    if model_t is None:
        raise Exception("Teacher model not found")
    model_t.eval()

log_stats = vars(args)
log_stats['experiment_id'] = main_args.experiment_id
log_stats['init_time'] = init_time
log_stats['num_params'] = sum(
    x.size()[0] * x.size()[1] if len(x.size()) > 1 else x.size()[0]
    for x in custom_model.parameters() if x.size())

criterion = SplitCrossEntropyLoss(args.emsize, splits=[], verbose=False)
# criterion = torch.nn.CrossEntropyLoss()
criterion_kd = DistillKL(args.kd_tau)

if args.cuda:
    custom_model = custom_model.to(cuda)
    criterion = criterion.to(cuda)
    criterion_kd = criterion_kd.to(cuda)
    if args.kd:
        model_t = model_t.to(cuda)

params = list(custom_model.parameters()) + list(criterion.parameters()) + \
    list(criterion_kd.parameters())
optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
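
# DistillKL is not defined in this snippet; one common formulation is a
# temperature-scaled KL divergence between student and teacher logits
# (a sketch, not necessarily this repository's exact class):
import torch.nn as nn
import torch.nn.functional as F

class DistillKL(nn.Module):
    """KL divergence between softened student and teacher distributions."""
    def __init__(self, T):
        super().__init__()
        self.T = T

    def forward(self, y_s, y_t):
        p_s = F.log_softmax(y_s / self.T, dim=1)
        p_t = F.softmax(y_t / self.T, dim=1)
        # T^2 keeps gradient magnitudes comparable across temperatures.
        return F.kl_div(p_s, p_t, reduction='batchmean') * (self.T ** 2)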
###
if not criterion:
    splits = []
    if ntokens > 500000:
        # One Billion
        # This produces fairly even matrix mults for the buckets:
        # 0: 11723136, 1: 10854630, 2: 11270961, 3: 11219422
        splits = [4200, 35000, 180000]
    elif ntokens > 75000:
        # WikiText-103
        splits = [2800, 20000, 76000]
    else:
        splits = [1400, 10000, 32000]
    print('Using', splits)
    criterion = SplitCrossEntropyLoss(args.emsize, splits=splits, verbose=False)
###
if args.cuda:
    model = model.cuda()
    criterion = criterion.cuda()
    # model_r = model_r.cuda()
    model_mlp = model_mlp.cuda()
###
params = list(model.parameters()) + list(model_mlp.parameters()) + \
    list(criterion.parameters())
total_params = sum(x.size()[0] * x.size()[1] if len(x.size()) > 1
                   else x.size()[0] for x in params if x.size())
print('Args:', args)
print('Model total parameters:', total_params)
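
# The ntokens -> splits ladder recurs in nearly every snippet above; if
# these files were consolidated, a small helper would remove the
# duplication (a suggestion, not code from any of the original sources):
def default_splits(ntokens):
    if ntokens > 500000:  # One Billion Word
        return [4200, 35000, 180000]
    if ntokens > 75000:   # WikiText-103
        return [2800, 20000, 76000]
    return []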