def __init__(self, optimizer, lr, model_type, vocsize, emsize, buffer_len,
             nhead, nhid, nlayers, dropout, learn_iterations, warmup,
             after_warmup):
    criterion = nn.CrossEntropyLoss()
    super(TransformerLearner, self).__init__(criterion, vocsize, learn_iterations)
    self.model = model.TransformerModel(vocsize, emsize, nhead, nhid, nlayers, dropout)
    self.dmodel = emsize
    # lr == 42 is used as a sentinel meaning "use the d_model**-0.5 base rate".
    if lr == 42:
        self.lr = self.dmodel ** -0.5
    else:
        self.lr = lr
    self.step = 1
    self.warmup = warmup
    self.after_warmup = after_warmup
    self.buffer_len = buffer_len
    self.buffer = None
    kwargs = {}
    if optimizer == 'Adam':
        kwargs['betas'] = (0.9, 0.98)
        kwargs['eps'] = 1e-9
    lr = self.compute_lr()
    # Pass the Adam-specific kwargs through to the optimizer constructor.
    self.optimizer = getattr(optim, optimizer)(self.model.parameters(), lr=lr, **kwargs)

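# NOTE: __init__ above calls self.compute_lr(), which is not part of this excerpt.
# A minimal sketch of what it might look like, assuming the standard
# inverse-square-root ("Noam") warmup schedule; treating `after_warmup` as a
# post-warmup scale factor is an assumption, not taken from the original code.
def compute_lr(self):
    # lr = base_lr * min(step^-0.5, step * warmup^-1.5)
    scale = min(self.step ** -0.5, self.step * self.warmup ** -1.5)
    lr = self.lr * scale
    if self.step > self.warmup:
        lr *= self.after_warmup
    return lr
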
def model_func(wrapped_import, inputs):
    ###############################################################################
    # Build the model
    ###############################################################################
    if wrapped_import:
        nn = wrapped_import("torch.nn")
        model = wrapped_import("model")
    else:
        from torch import nn
        import model
    if args.model == 'Transformer':
        net = model.TransformerModel(ntokens, args.emsize, args.nhead,
                                     args.nhid, args.nlayers, args.dropout)
    else:
        net = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                             args.nlayers, args.dropout, args.tied)
    net.eval()  # for verification, need no random elements (e.g. dropout)
    # criterion = nn.NLLLoss()
    if args.model != 'Transformer':
        hidden = net.init_hidden(args.batch_size)
    else:
        hidden = None
    with torch.no_grad():
        if args.model == 'Transformer':
            output = net(inputs)
            output = output.view(-1, ntokens)
        else:
            output, hidden = net(inputs, hidden)
        return output

def main_func(datasets, context_len, epochs):
    for dataset in datasets:
        for bptt in context_len:
            train_d, valid_d, test_d, data = data_generator(dataset)
            train_data = batchify(train_d, bptt)
            val_data = batchify(valid_d, bptt)
            test_data = batchify(test_d, bptt)
            ntokens = len(set(data))
            best_val_loss = None
            lr = args.lr
            if args.model == 'Transformer':
                model = model1.TransformerModel(ntokens, args.emsize, args.nhead,
                                                args.nhid, args.nlayers,
                                                args.dropout).to(device)
            else:
                model = model1.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                                        args.nlayers, args.dropout,
                                        args.tied).to(device)
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(model.parameters(), args.lr)
            for epoch in range(1, epochs + 1):
                epoch_start_time = time.time()
                train(train_data, bptt, ntokens, model, criterion, optimizer, epoch)
                val_loss = evaluate(val_data, bptt, ntokens, model, criterion)
                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                      'valid ppl {:8.2f}'.format(
                          epoch, (time.time() - epoch_start_time),
                          val_loss, math.exp(val_loss)))
                print('-' * 89)
                # Save the model if the validation loss is the best we've seen so far.
                if not best_val_loss or val_loss < best_val_loss:
                    with open(args.save, 'wb') as f:
                        torch.save(model, f)
                    best_val_loss = val_loss
                else:
                    # Anneal the learning rate if no improvement has been seen
                    # on the validation set.
                    lr /= 4.0
            # Load the best saved model.
            with open(args.save, 'rb') as f:
                model = torch.load(f)
                if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
                    model.rnn.flatten_parameters()
            # Run on test data.
            test_loss = evaluate(test_data, bptt, ntokens, model, criterion)
            print('=' * 89)
            print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
                test_loss, math.exp(test_loss)))
            print('=' * 89)
            fr.write("test loss for len %d and dataset %s is %f\n" % (bptt, dataset, test_loss))
    # fr.close()
    return

    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.model == 'Transformer':
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                                   args.nlayers, args.dropout).to(device)
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):

def main():
    parser = argparse.ArgumentParser(description="Compute sentence scores of "
                                     "nbest lists with a PyTorch trained "
                                     "neural language model.")
    parser.add_argument("--nbest-list", type=str, required=True,
                        help="N-best hypotheses for rescoring")
    parser.add_argument("--outfile", type=str, required=True,
                        help="Output file with language model scores associated "
                             "with each hypothesis")
    parser.add_argument("--vocabulary", type=str, required=True,
                        help="Vocabulary used for training")
    parser.add_argument("--model-path", type=str, required=True,
                        help="Path to a pretrained neural model.")
    parser.add_argument("--model", type=str, default="LSTM",
                        help="Network type. Can be RNN, LSTM or Transformer.")
    parser.add_argument("--emsize", type=int, default=200,
                        help="size of word embeddings")
    parser.add_argument("--nhid", type=int, default=200,
                        help="number of hidden units per layer")
    parser.add_argument("--nlayers", type=int, default=2,
                        help="number of layers")
    parser.add_argument("--nhead", type=int, default=2,
                        help="the number of heads in the encoder/decoder of the "
                             "transformer model")
    args = parser.parse_args()

    assert os.path.exists(args.nbest_list), "Nbest list path does not exist."
    assert os.path.exists(args.vocabulary), "Vocabulary path does not exist."
    assert os.path.exists(args.model_path), "Model path does not exist."

    print("Load vocabulary")
    vocab = read_vocab(args.vocabulary)
    ntokens = len(vocab)

    print("Load model and criterion")
    import model
    if args.model == "Transformer":
        model = model.TransformerModel(ntokens, args.emsize, args.nhead,
                                       args.nhid, args.nlayers,
                                       activation="gelu", tie_weights=True)
    else:
        model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                               args.nlayers, tie_weights=True)
    with open(args.model_path, "rb") as f:
        model.load_state_dict(
            torch.load(f, map_location=lambda storage, loc: storage))
        if args.model in ["RNN_TANH", "RNN_RELU", "LSTM", "GRU"]:
            model.rnn.flatten_parameters()
    criterion = nn.CrossEntropyLoss()

    print("Load nbest list")
    nbest = load_nbest(args.nbest_list)

    print("Compute sentence scores with a", args.model, "model")
    nbest_and_scores = compute_scores(nbest, model, criterion, ntokens, vocab,
                                      model_type=args.model)
    print("Write sentence scores out")
    write_scores(nbest_and_scores, args.outfile)

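# NOTE: the rescoring scripts in this collection rely on helpers such as
# read_vocab() that are not shown here. A minimal sketch, assuming the
# vocabulary file holds one "word index" pair per line (the exact file format
# is an assumption, not taken from the original code):
def read_vocab(path):
    """Read a vocabulary file into a word -> integer id mapping."""
    vocab = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word, idx = line.split()
            vocab[word] = int(idx)
    return vocab
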
def setup_model_and_optim(args, train_data, tokenizer):
    ntokens = args.data_size
    if args.model.lower() == 'transformer':
        embed_tokens = m.Embedding(ntokens, args.decoder_embed_dim,
                                   padding_idx=tokenizer.command_name_map['pad'].Id)
        model = m.TransformerModel(m.DecoderPreprocessor(args, embed_tokens),
                                   m.TransformerDecoder(args, embed_tokens))
    else:
        model = m.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied)
        global rnn_model
        rnn_model = model
    LR_Warmer = None
    print('* number of parameters: %d' %
          sum([p.nelement() for p in model.parameters()]))
    if args.cuda:
        model.cuda()

    optim = None
    if args.load is not None and args.load != '':
        sd = torch.load(args.load, map_location='cpu')
        if args.load_optim:
            # optim_sd = torch.load(os.path.join(os.path.dirname(args.load), 'optim.pt'), map_location='cpu')
            rng = torch.load(os.path.join(os.path.dirname(args.load), 'rng.pt'))
            torch.cuda.set_rng_state(rng[0])
            torch.set_rng_state(rng[1])
        try:
            model.load_state_dict(sd)
        except:
            # Retry with weight norm applied, in case the checkpoint was saved
            # with weight-normalized parameters.
            if hasattr(model, 'rnn'):
                apply_weight_norm(model.rnn, hook_child=False)
            else:
                apply_weight_norm(model, hook_child=False)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    if not args.no_weight_norm:
        if hasattr(model, 'rnn'):
            apply_weight_norm(model.rnn, hook_child=False)
        else:
            apply_weight_norm(model, hook_child=False)

    if optim is None:
        optim_choice = 'Adam' if args.stlr_cut_frac else args.optim
        if args.fp16:
            model = FP16_Module(model)
            optim = getattr(torch.optim, args.optim)(model.parameters(), lr=args.lr)
            optim = FP16_Optimizer(optim,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale)
        else:
            optim = getattr(torch.optim, args.optim)(model.parameters(), lr=args.lr)
        if args.load_optim:
            optim.load_state_dict(optim_sd)

    # add linear learning rate scheduler
    if train_data is not None:
        if args.constant_decay:
            num_iters = args.constant_decay
        else:
            num_iters = args.train_iters * args.epochs
        init_step = -1
        if args.load_optim:
            # TODO: this no longer makes sense given the new data loaders
            init_step = optim_sd['iter'] - optim_sd['skipped_iter']
            train_data.batch_sampler.start_iter = (optim_sd['iter'] % len(train_data)) + 1
        warmup_iter = args.warmup * num_iters
        if args.stlr_cut_frac is not None:
            LR = SlantedTriangularLR(optim, cut_frac=args.stlr_cut_frac, num_iters=num_iters)
        else:
            LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=warmup_iter,
                             num_iters=num_iters, decay_style=args.decay_style)
        if args.warmup != 0:
            LR_Warmer = WarmupLR(optim, warmup_iter, last_iter=init_step)

    # wrap model for distributed training
    if args.world_size > 1:
        model = DDP(model)

    criterion = nn.CrossEntropyLoss(reduction='none')
    return model, optim, LR, LR_Warmer, criterion

def main(args):
    # print(os.getcwd())
    # ss('s')
    # args.wandb uses None as the trigger; if not None, it should be the project name.
    if args.wandb:
        import wandb
        wandb.init(project=args.wandb, reinit=True)
        wandb.config.update(args)

    torch.manual_seed(args.seed)
    # if torch.cuda.is_available():
    #     if not args.cuda:
    #         print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    device = torch.device("cuda" if args.is_cuda else "cpu")

    ###############################################################################
    # Load data
    ###############################################################################
    corpus = data.Corpus(args.data_root)

    # Starting from sequential data, batchify arranges the dataset into columns.
    # For instance, with the alphabet as the sequence and batch size 4, we'd get
    # ┌ a g m s ┐
    # │ b h n t │
    # │ c i o u │
    # │ d j p v │
    # │ e k q w │
    # └ f l r x ┘.
    # These columns are treated as independent by the model, which means that the
    # dependence of e.g. 'g' on 'f' cannot be learned, but allows more efficient
    # batch processing.
    def batchify(data, bsz):
        # Work out how cleanly we can divide the dataset into bsz parts.
        nbatch = data.size(0) // bsz
        # Trim off any extra elements that wouldn't cleanly fit (remainders).
        data = data.narrow(0, 0, nbatch * bsz)
        # Evenly divide the data across the bsz batches.
        data = data.view(bsz, -1).t().contiguous()
        return data.to(device)

    eval_batch_size = 10
    train_data = batchify(corpus.train, args.batch_size)
    val_data = batchify(corpus.valid, eval_batch_size)
    test_data = batchify(corpus.test, eval_batch_size)

    ###############################################################################
    # Build the model
    ###############################################################################
    ntokens = len(corpus.dictionary)
    if args.model == 'Transformer':
        model = Model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                                       args.nlayers, args.dropout).to(device)
    else:
        model = Model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                               args.nlayers, args.dropout, args.tied).to(device)
    criterion = nn.CrossEntropyLoss()
    if args.wandb:
        wandb.watch(model)

    ###############################################################################
    # Training code
    ###############################################################################
    optimizer = optim.Adam(model.parameters(), lr=args.lr_adam)
    lmbda = lambda epoch: 0.95 ** epoch
    # scheduler = optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lmbda)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=[lmbda])
    # scheduler.step()
    # scheduler.step()
    # scheduler.step()
    # args.is_manual_update = True
    lr = args.lr

    def get_lr():
        if args.is_manual_update:
            output = 'M{:02.5f}'.format(lr)
        else:
            for p in optimizer.param_groups:
                output = 'A{:02.5f}'.format(p['lr'])
        return output

    print(get_lr())
    # for p in optimizer.param_groups:
    #     print(p['lr'])
    #     # break
    ss('-in main')  # debug pause; `ss` is not defined in this excerpt

    def repackage_hidden(h):
        """Wraps hidden states in new Tensors, to detach them from their history."""
        if isinstance(h, torch.Tensor):
            return h.detach()
        else:
            return tuple(repackage_hidden(v) for v in h)

    # get_batch subdivides the source data into chunks of length args.bptt.
    # If source is equal to the example output of the batchify function, with
    # a bptt-limit of 2, we'd get the following two Variables for i = 0:
    # ┌ a g m s ┐ ┌ b h n t ┐
    # └ b h n t ┘ └ c i o u ┘
    # Note that despite the name of the function, the subdivision of data is not
    # done along the batch dimension (i.e. dimension 1), since that was handled
    # by the batchify function.
    # The chunks are along dimension 0, corresponding to the seq_len dimension
    # in the LSTM.
    def get_batch(source, i):
        seq_len = min(args.bptt, len(source) - 1 - i)
        data = source[i:i + seq_len]
        target = source[i + 1:i + 1 + seq_len].view(-1)
        return data, target

    def evaluate(data_source):
        # Turn on evaluation mode which disables dropout.
        model.eval()
        total_loss = 0.
        ntokens = len(corpus.dictionary)
        if args.model != 'Transformer':
            hidden = model.init_hidden(eval_batch_size)
        with torch.no_grad():
            for i in range(0, data_source.size(0) - 1, args.bptt):
                data, targets = get_batch(data_source, i)
                if args.model == 'Transformer':
                    output = model(data)
                else:
                    output, hidden = model(data, hidden)
                    hidden = repackage_hidden(hidden)
                output_flat = output.view(-1, ntokens)
                total_loss += len(data) * criterion(output_flat, targets).item()
        return total_loss / (len(data_source) - 1)

    def train():
        # Turn on training mode which enables dropout.
        model.train()
        total_loss = 0.
        log_loss = 0.
        start_time = time.time()
        ntokens = len(corpus.dictionary)
        if args.model != 'Transformer':
            hidden = model.init_hidden(args.batch_size)
        for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
            data, targets = get_batch(train_data, i)
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to the start of the dataset.
            model.zero_grad()
            if args.model == 'Transformer':
                output = model(data)
            else:
                hidden = repackage_hidden(hidden)
                output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, ntokens), targets)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            if args.is_manual_update:
                for p in model.parameters():
                    p.data.add_(p.grad, alpha=-lr)
            else:
                optimizer.step()

            total_loss += loss.item()
            log_loss += len(data) * loss.item()

            if batch % args.log_interval == 0 and batch > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, batch, len(train_data) // args.bptt, get_lr(),
                          elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()
            if args.is_quickrun:
                break
        return log_loss / (train_data.size(0) - 1)
        # break

    # def export_onnx(path, batch_size, seq_len):
    #     print('The model is also exported in ONNX format at {}'.
    #           format(os.path.realpath(args.onnx_export)))
    #     model.eval()
    #     dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device)
    #     hidden = model.init_hidden(batch_size)
    #     torch.onnx.export(model, (dummy_input, hidden), path)

    # Loop over epochs.
    best_val_loss = None
    early_stop_count = 0
    early_stop_when = 10

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(1, args.epoch + 1):
            epoch_start_time = time.time()
            log_loss = train()
            # ss('-in main')
            val_loss = evaluate(val_data)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                  'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                             val_loss, math.exp(val_loss)))
            print('-' * 89)
            if args.wandb:
                wandb.log({'train loss': log_loss, 'valid loss': val_loss})
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                # with open(args.save, 'wb') as f:
                #     torch.save(model, f)
                best_val_loss = val_loss
                early_stop_count = 0
            else:
                if args.is_manual_update:
                    # Anneal the learning rate if no improvement has been seen
                    # in the validation dataset.
                    lr /= 4.0
                else:
                    scheduler.step()
                early_stop_count += 1
                if args.early_stop is not None:
                    print('early stop monitor [{}/{}]'.format(early_stop_count, args.early_stop))
                    if early_stop_count > args.early_stop:
                        print('trigger early stop')
                        break
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    # with open(args.save, 'rb') as f:
    #     model = torch.load(f)
    #     # after load the rnn params are not a continuous chunk of memory
    #     # this makes them a continuous chunk, and will speed up forward pass
    #     # Currently, only rnn model supports flatten_parameters function.
    #     if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
    #         model.rnn.flatten_parameters()

    # Run on test data.
    test_loss = evaluate(test_data)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
    if args.wandb:
        wandb.log({'test loss': test_loss})
        wandb.join()

    return data.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.model == 'Transformer':
    model = model.TransformerModel(ntoken=ntokens, ninp=args.emsize, nhead=args.nhead,
                                   nhid=args.nhid, nlayers=args.nlayers,
                                   dropout=args.dropout).to(device)
else:
    model = model.RNNModel(args.model, ntoken=ntokens, ninp=args.emsize,
                           nhid=args.nhid, nlayers=args.nlayers,
                           dropout=args.dropout).to(device)

criterion = nn.NLLLoss()

###############################################################################
# Training code

# Load data
corpus = data.Corpus(args.input_path)
eval_batch_size = 10
train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)
print(f'Train data shape: {train_data.shape}')
print(f'Val data shape: {val_data.shape}')
print(f'Test data shape: {test_data.shape}')

# build model
ntokens = len(corpus.dictionary)
if model_type == 'Transformer':
    model = model.TransformerModel(ntokens, emsize, nhead, nhid, nlayers,
                                   dropout).to(device)
else:
    model = model.RNNModel(model_type, ntokens, emsize, nhid, nlayers, dropout,
                           args.tied).to(device)
print(f'model: {model}')
criterion = nn.NLLLoss()

# Training code
best_val_loss = None
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model_type, model, corpus, train_data, batch_size, args.bptt, clip,
          args.log_interval, args.dry_run, epoch)
    val_loss = evaluate(model_type, model, corpus, val_data, args.bptt)
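    # NOTE: the snippet above is cut off right after evaluate(). A sketch of how
    # the per-epoch loop typically continues in this family of scripts; the
    # checkpoint path `args.save` is an assumption, mirroring the other training
    # loops in this collection.
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)
    # Save the model if the validation loss is the best we've seen so far.
    if not best_val_loss or val_loss < best_val_loss:
        with open(args.save, 'wb') as f:
            torch.save(model, f)
        best_val_loss = val_loss
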
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.model == 'Transformer':
    model = model_class.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                                         args.nlayers, args.dropout).to(device)
else:
    model = model_class.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                                 args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.model == 'Transformer':
    model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                                   args.nlayers, args.dropout, args.norm, 0.0,
                                   activation='relu').to(device)
else:
    model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()

if args.optim == 'sgd':
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
elif args.optim == 'adam':

os.environ['CUDA_VISIBLE_DEVICES'] = cmd[:-1]
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c_to_i = pickle.load(open(args.c_to_i, 'rb'))
i_to_c = pickle.load(open(args.i_to_c, 'rb'))
n_char = len(c_to_i)
dataloaders = []

with open('data/vs_chemist.txt') as f:
    lines = f.readlines()
    lines = [l.strip().split() for l in lines]
    s_to_human_score = {l[1]: float(l[3]) for l in lines}

if args.model == 'Trans':
    model = model.TransformerModel(args, n_char, i_to_c)
else:
    model = model.RNN(args, n_char, i_to_c)
model = utils.initialize_model(model, device, args.save_files)
print("number of parameters :",
      sum(p.numel() for p in model.parameters() if p.requires_grad))

softmax = nn.Softmax(dim=-1)
model.eval()

log_likelihoods = []
humanscores = []
sascores = []
with torch.no_grad():
    for s in s_to_human_score.keys():

test_primer = dataset[test_indices[0]][0].type(torch.LongTensor).view(1, -1)

train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, num_workers=8,
                                               sampler=train_sampler)
test_dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, num_workers=8,
                                              sampler=test_sampler)
print("> Done.")
print(f"> Loaded {dataset.length} MIDI sequences.")

transformer = model.TransformerModel(336, 128, 8, 256, 8, dropout=0.0,
                                     device=device).to(device)
print("> Model Summary:")
print(transformer, '\n')

if len(sys.argv) == 2:
    print("> Loading existing model from file\n")
    transformer = torch.load(sys.argv[1])
    # generate(transformer, "load-test")

train(transformer, train_dataloader, test_dataloader)

def main():
    parser = argparse.ArgumentParser(
        description="Compute word scores of "
                    "hypotheses for each utterance in parallel "
                    "with a PyTorch-trained neural language model.")
    parser.add_argument('--infile', type=str, required=True,
                        help="Word hypotheses generated from a lattice.")
    parser.add_argument('--outfile', type=str, required=True,
                        help="Output file with neural language model scores "
                             "for input word hypotheses.")
    parser.add_argument('--vocabulary', type=str, required=True,
                        help="Vocabulary used for neural language model training.")
    parser.add_argument('--model-path', type=str, required=True,
                        help="Path to a pretrained neural language model.")
    parser.add_argument('--model', type=str, default='LSTM',
                        help='Network type. Can be RNN, LSTM or Transformer.')
    parser.add_argument('--emsize', type=int, default=200,
                        help='Size of word embeddings.')
    parser.add_argument('--nhid', type=int, default=200,
                        help='Number of hidden units per layer.')
    parser.add_argument('--nlayers', type=int, default=2,
                        help='Number of layers.')
    parser.add_argument('--nhead', type=int, default=2,
                        help='Number of heads in a Transformer model.')
    parser.add_argument('--oov', type=str, default='<unk>',
                        help='Out of vocabulary word.')
    parser.add_argument('--sent-boundary', type=str, default='<s>',
                        help='Sentence boundary symbol.')
    args = parser.parse_args()

    assert os.path.exists(args.infile), "Path for input word sequences does not exist."
    assert os.path.exists(args.vocabulary), "Vocabulary path does not exist."
    assert os.path.exists(args.model_path), "Model path does not exist."

    print("Load vocabulary.")
    vocab = read_vocab(args.vocabulary)
    ntokens = len(vocab)

    print("Load model and criterion.")
    import model
    if args.model == 'Transformer':
        model = model.TransformerModel(ntokens, args.emsize, args.nhead, args.nhid,
                                       args.nlayers, activation="gelu",
                                       tie_weights=True)
    else:
        model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                               args.nlayers, tie_weights=True)
    with open(args.model_path, 'rb') as f:
        model.load_state_dict(
            torch.load(f, map_location=lambda storage, loc: storage))
        if args.model in ['RNN_TANH', 'RNN_RELU', 'LSTM', 'GRU']:
            model.rnn.flatten_parameters()
    criterion = nn.CrossEntropyLoss(reduction='none')

    print("Load input word hypotheses.")
    sents = load_sents(args.infile)

    print("Compute word scores with a", args.model, "model.")
    sents_and_scores = compute_scores(args, sents, model, criterion, ntokens, vocab,
                                      model_type=args.model)
    print("Write out word scores.")
    write_scores(sents_and_scores, args.outfile)

print("torch.cuda.is_available(): ", torch.cuda.is_available()) # load data train_iter, val_iter, VOCAB_SIZE = data.get_data("../data/ptb/", batch_size, bptt_len, device) print("VOCAB_SIZE: ", VOCAB_SIZE) # WRITE CODE HERE within two '#' bar ######################################## # Build LMModel best_model (build your language best_model here) emsize = 256 nhid = 256 nlayers = 2 nhead = 2 dropout = 0.2 MyModel = model.TransformerModel(VOCAB_SIZE, emsize, nhead, nhid, nlayers, dropout) print(MyModel) MyModel.to(device) ######################################## criterion = nn.CrossEntropyLoss() learning_rate = 0.001 step_size = 10 optimizer = torch.optim.Adam(MyModel.parameters(), lr=learning_rate) scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=0.3) GRAD_CLIP = 0.5 ########################################