def train(**kwargs):
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")
    train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)

    if kwargs["tokenizer"] == "space_tokenizer":
        tokenizer = SpaceTokenizer(
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_nepali.json",
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_english.json")
    else:
        tokenizer = BertTokenizer(
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_newa.json",
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_eng.json")

    if kwargs['model'] == 'transformer':
        model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab),
                                 embed_size, n_heads, dropout=dropout_rate)
    else:
        model = Seq2Seq(embed_size, hidden_size, tokenizer,
                        dropout_rate=dropout_rate, n_layers=n_layers)

    # criterion = nn.CrossEntropyLoss()
    # Apply weight decay to all parameters except biases and normalization weights.
    # AdamW reads the 'weight_decay' key from each param group.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=0.001)
    model.to(device)
    model = trainer(model, optimizer, train_dl, valid_dl, BATCH_SIZE, epoch, device,
                    LOG_EVERY, kwargs["checkpoint_path"], kwargs["best_model"],
                    beam_size, max_decoding_time_step)
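# Hedged usage sketch (not part of the original source): the kwargs read inside
# train() above imply a call of roughly this shape. All file paths and the
# "tokenizer"/"model" selections below are illustrative placeholders only.
train(
    src_train="data/train.src",        # hypothetical path
    tgt_train="data/train.tgt",        # hypothetical path
    src_valid="data/valid.src",        # hypothetical path
    tgt_valid="data/valid.tgt",        # hypothetical path
    tokenizer="space_tokenizer",       # any other value falls back to BertTokenizer
    model="transformer",               # any other value falls back to Seq2Seq
    checkpoint_path="checkpoints/",    # hypothetical directory
    best_model="checkpoints/best.pt",  # hypothetical file
)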
def evaluate(input_sentences, output_sentences, input_vocab, output_vocab,
             input_reverse, output_reverse, hy, writer):
    dataset = NMTDataset(input_sentences, output_sentences, input_vocab,
                         output_vocab, input_reverse, output_reverse)
    loader = DataLoader(dataset, batch_size=hy.batch_size, shuffle=True, drop_last=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_vocab_size = len(input_vocab.keys())
    output_vocab_size = len(output_vocab.keys())
    encoder = EncoderRNN(input_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)
    decoder = DecoderRNN(output_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)
    accuracies = []
    for epoch in range(1, hy.num_epochs + 1):
        # Evaluate the checkpoint saved at the end of each training epoch.
        encoder.load_state_dict(
            torch.load("saved_runs/encoder_{}_weights.pt".format(epoch)))
        decoder.load_state_dict(
            torch.load("saved_runs/decoder_{}_weights.pt".format(epoch)))
        accuracy = compute_model_accuracy(encoder, decoder, loader, device,
                                          epoch, writer)
        accuracies.append(accuracy)
    print("=" * 80)
    print("Final Accuracy = {:.1f}".format(100. * np.max(accuracies)))
    print("=" * 80)
    return accuracies
def test(**kwargs):
    test_dataset = NMTDataset(kwargs["src_test"], kwargs["tgt_test"])
    print("Dataset loaded successfully.")
    test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True,
                         collate_fn=collate_fn)
    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab),
                             tokenizer, embed_size, n_heads, dropout=dropout_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    model.to(device)
    model.eval()

    bleu_score = 0
    test_loss = 0
    test_start_time = time.time()
    with torch.no_grad():
        for batch in test_dl:
            src_tensor, tgt_tensor, _, _ = model.tokenizer.encode(
                batch, device, return_tensor=True)
            src_tensor = src_tensor.transpose(0, 1)
            tgt_tensor = tgt_tensor.transpose(0, 1)
            # Teacher-forced decoder input drops the last token; targets drop the first.
            trg_input = tgt_tensor[:, :-1]
            targets = tgt_tensor[:, 1:].contiguous().view(-1)
            preds = model(src_tensor, trg_input.to(device), device)
            loss = criterion(preds, targets)
            test_loss += loss.item() / BATCH_SIZE

            # Decode each source sentence with beam search and score against references.
            output = []
            for src in src_tensor:
                hyps = beam_search_transformer(
                    model, src.view(1, -1), beam_size, max_decoding_time_step,
                    model.tokenizer.src_vocab['[PAD]'],
                    model.tokenizer.tgt_vocab['[EOS]'], device)
                top_hyp = hyps[0]
                hyp_sent = ' '.join(top_hyp.value)
                output.append(hyp_sent)
            score = compute_bleu_score(output, batch[1])
            bleu_score += score
    print(
        f'Avg. test loss: {test_loss/len(test_dl):.5f} | BLEU Score: {bleu_score/len(test_dl)} | time elapsed: {time.time() - test_start_time}'
    )
def train(**kwargs):
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")
    train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          collate_fn=collate_fn)
    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab),
                             tokenizer, embed_size, n_heads, dropout=dropout_rate)
    model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
    optimizer = torch.optim.Adam(model.parameters(), lr=0.6,
                                 betas=(0.9, 0.98), eps=1e-9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    torch.autograd.set_detect_anomaly(True)
    train_model(model, optimizer, criterion, scheduler, train_dl, valid_dl,
                BATCH_SIZE, epoch, device, kwargs["checkpoint_path"],
                kwargs["best_model"], beam_size, max_decoding_time_step)
def test(**kwargs):
    print("loading dataset")
    test_dataset = NMTDataset(kwargs["src_test"], kwargs["tgt_test"])
    print("Dataset loaded successfully.")
    test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True,
                         collate_fn=collate_fn)

    if kwargs["tokenizer"] == "space_tokenizer":
        tokenizer = SpaceTokenizer(
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_nepali.json",
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_english.json")
    else:
        tokenizer = BertTokenizer(
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_newa.json",
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_eng.json")

    model = Seq2Seq(embed_size, hidden_size, tokenizer,
                    dropout_rate=dropout_rate, n_layers=n_layers)
    model.to(device)
    model, _, _, _ = load_checkpt(model, kwargs['best_model'], device)

    eval_start_time = time.time()
    test_loss, bleu_score = evaluate(model, test_dl, 0, device, BATCH_SIZE,
                                     beam_size, max_decoding_time_step)
    print(f'Avg. test loss: {test_loss:.5f} | BLEU Score: {bleu_score} | time elapsed: {time.time() - eval_start_time}')
        tokens += batch['ntokens']        # tokens processed since the last progress report
        total_tokens += batch['ntokens']
        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f." %
                  (i, loss / batch['ntokens'], tokens / elapsed))
            start = time.time()
            tokens = 0
    return total_loss / total_tokens


if __name__ == "__main__":
    dataset = NMTDataset.load_dataset_and_make_vectorizer(
        # "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_100000.csv"
        "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_70w.csv"
    )
    src_vocab_size = len(dataset.get_vectorizer().source_vocab)
    tgt_vocab_size = len(dataset.get_vectorizer().target_vocab)
    padding_idx = dataset.get_vectorizer().target_vocab.lookup_token('<MASK>')
    criterion = LabelSmoothing(size=tgt_vocab_size, padding_idx=0, smoothing=0.1)
    criterion.cuda()
    model = make_model(src_vocab_size, tgt_vocab_size, 6)
    model.cuda()
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 8000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
    loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)
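# For reference: NoamOpt above is assumed to implement the Annotated Transformer's
# warmup schedule, lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5).
# Minimal standalone sketch of that schedule (illustrative only, not the repo's class):
def noam_rate(step, d_model, factor=1, warmup=8000):
    """Learning rate after `step` optimizer updates under the Noam schedule."""
    step = max(step, 1)  # guard against division by zero at step 0
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

# e.g. with d_model=512, factor=1, warmup=8000 the rate rises linearly, peaks
# around step 8000 (noam_rate(8000, 512) ~= 4.9e-4), then decays like step^-0.5.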
                    default=1000)
parser.add_argument('-n', '--num_workers', type=int,
                    help='Number of DataLoader workers', default=16)
parser.add_argument('-g', '--gpu', type=int,
                    help='GPU device id', default=0)
parser.add_argument('--btec', type=str,
                    help='BTEC subset key', default="ALL")
args = parser.parse_args()

src_key, trg_key = args.src_segment, args.trg_segment
dataset = NMTDataset(args.src, args.trg, key=args.btec)
dataset.sort(trg_key)
set_reverse_vocab(dataset.trg_vocab[trg_key])
model = nmt_model(len(dataset.src_vocab[src_key]),
                  len(dataset.trg_vocab[trg_key]))

model_path = os.path.join('./NMT/LOG', args.src + "2" + args.trg, args.btec)
if not os.path.exists(model_path):
    os.makedirs(model_path)
model_path = os.path.join(model_path, src_key + "2" + trg_key)

min_loss = 100
try:
    # Resume from an existing checkpoint if one is present.
    model.load(model_path)
except Exception:
    pass
def train(input_sentences, output_sentences, input_vocab, output_vocab,
          input_reverse, output_reverse, hy, writer):
    dataset = NMTDataset(input_sentences, output_sentences, input_vocab,
                         output_vocab, input_reverse, output_reverse)
    loader = DataLoader(dataset, batch_size=hy.batch_size, shuffle=True, drop_last=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_vocab_size = len(input_vocab.keys())
    output_vocab_size = len(output_vocab.keys())
    encoder = EncoderRNN(input_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)
    decoder = DecoderRNN(output_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)
    loss_function = nn.CrossEntropyLoss().to(device)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=hy.lr)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=hy.lr)
    n_iterations = 0
    loss_history = []
    training_accuracy = 0.
    encoder.train()
    decoder.train()
    for epoch in range(1, hy.num_epochs + 1):
        for encoder_input, decoder_input, decoder_output in tqdm(
                loader, desc="{}/{}".format(epoch, hy.num_epochs)):
            encoder_input = encoder_input.to(device)
            decoder_input = decoder_input.to(device)
            decoder_output = decoder_output.to(device)
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()
            _, encoder_hidden = encoder(encoder_input)
            logits = decoder(decoder_input, encoder_hidden)
            loss = loss_function(
                logits.view(hy.batch_size * decoder_output.shape[1], -1),
                decoder_output.view(-1))
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()
            writer.add_scalar("TrainingLoss", loss.item(), n_iterations)
            n_iterations = n_iterations + 1
            loss_history.append(loss.item())
        # Per-epoch accuracy and checkpoints, matching what evaluate() loads back.
        training_accuracy = compute_model_accuracy(encoder, decoder, loader,
                                                   device, epoch, writer)
        torch.save(encoder.state_dict(),
                   "saved_runs/encoder_{}_weights.pt".format(epoch))
        torch.save(decoder.state_dict(),
                   "saved_runs/decoder_{}_weights.pt".format(epoch))
    return loss_history, training_accuracy
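# Hedged usage sketch (not part of the original source): the signature of train()
# above implies wiring like the following. The hyperparameter namespace `hy`, the
# TensorBoard log directory, and all values are illustrative placeholders; the
# sentence lists and vocab mappings are assumed to come from the project's own
# preprocessing step.
from types import SimpleNamespace
from torch.utils.tensorboard import SummaryWriter

hy = SimpleNamespace(batch_size=64, embedding_size=256, hidden_size=512,
                     rnn_layers=2, bidirectional=True, lr=1e-3, num_epochs=10)
writer = SummaryWriter("saved_runs/tb")  # hypothetical log directory
loss_history, train_acc = train(input_sentences, output_sentences,
                                input_vocab, output_vocab,
                                input_reverse, output_reverse, hy, writer)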
    return NoamOpt(
        model.src_embed[0].d_model, 2, 4000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))


if __name__ == '__main__':
    args = get_argparse().parse_args()
    src_sp = spm.SentencePieceProcessor()
    src_sp.load(args.src_spm)
    trg_sp = spm.SentencePieceProcessor()
    trg_sp.load(args.trg_spm)
    train_dataset = NMTDataset(os.path.join(args.data, 'train.en'),
                               os.path.join(args.data, 'train.zh'),
                               src_sp, trg_sp)
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
                                  shuffle=True, collate_fn=collate_fn)
    dev_dataset = NMTDataset(os.path.join(args.data, 'dev.en'),
                             os.path.join(args.data, 'dev.zh'),
                             src_sp, trg_sp)
    dev_dataloader = DataLoader(dev_dataset, batch_size=args.test_batch_size,
                                shuffle=False, collate_fn=collate_fn)
    test_dataset = NMTDataset(os.path.join(args.data, 'test.en'),
                              os.path.join(args.data, 'test.zh'),
                              src_sp, trg_sp)
    test_dataloader = DataLoader(test_dataset,
                                 # remaining arguments assumed to mirror dev_dataloader
                                 batch_size=args.test_batch_size,
                                 shuffle=False, collate_fn=collate_fn)