Example #1
def train(**kwargs):
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")

    train_dl = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    if kwargs["tokenizer"] == "space_tokenizer":
        tokenizer = SpaceTokenizer(
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_nepali.json",
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_english.json")
    else:
        tokenizer = BertTokenizer(
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_newa.json",
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_eng.json")
    if kwargs['model'] == 'transformer':
        model = TransformerModel(len(tokenizer.src_vocab), len(tokenizer.tgt_vocab), embed_size, 
                n_heads, dropout=dropout_rate)
    else:
        model = Seq2Seq(embed_size, hidden_size, tokenizer, dropout_rate=dropout_rate, n_layers=n_layers)
    # criterion = nn.CrossEntropyLoss()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=0.001)
    model.to(device)
    model = trainer(model, optimizer, train_dl, valid_dl, BATCH_SIZE, epoch,
                            device, LOG_EVERY, kwargs["checkpoint_path"], kwargs["best_model"], 
                            beam_size, max_decoding_time_step)
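
collate_fn is passed to every DataLoader in these examples but never defined in the snippets. Below is a minimal sketch of what it could look like, assuming each dataset item is a raw (source sentence, target sentence) string pair and that tokenization and padding happen later inside the tokenizer; Example #3 feeds the whole batch to model.tokenizer.encode(...) and reads references from batch[1], which is consistent with this shape.

# Hypothetical collate_fn: groups a list of (src, tgt) string pairs into two
# parallel lists, leaving tokenization and padding to the tokenizer.
def collate_fn(batch):
    src_sentences = [src for src, _ in batch]
    tgt_sentences = [tgt for _, tgt in batch]
    return src_sentences, tgt_sentences
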
Example #2
def evaluate(input_sentences, output_sentences, input_vocab, output_vocab,
             input_reverse, output_reverse, hy, writer):
    dataset = NMTDataset(input_sentences, output_sentences, input_vocab,
                         output_vocab, input_reverse, output_reverse)
    loader = DataLoader(dataset,
                        batch_size=hy.batch_size,
                        shuffle=True,
                        drop_last=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    input_vocab_size = len(input_vocab.keys())
    output_vocab_size = len(output_vocab.keys())

    encoder = EncoderRNN(input_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)
    decoder = DecoderRNN(output_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)

    accuracies = []

    for epoch in range(1, hy.num_epochs + 1):
        encoder.load_state_dict(
            torch.load("saved_runs/encoder_{}_weights.pt".format(epoch)))
        decoder.load_state_dict(
            torch.load("saved_runs/decoder_{}_weights.pt".format(epoch)))
        accuracy = compute_model_accuracy(encoder, decoder, loader, device,
                                          epoch, writer)
        accuracies.append(accuracy)

    print("=" * 80)
    print("Final Accuracy = {:.1f}".format(100. * np.max(accuracies)))
    print("=" * 80)

    return accuracies
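
compute_model_accuracy, used above and again in Example #8, is not included in these snippets. The following is a sketch of one way it could be implemented, assuming it measures token-level accuracy under teacher forcing with the same encoder/decoder call pattern as the training loop in Example #8; this is a guess at the helper, not the original code.

import torch

def compute_model_accuracy(encoder, decoder, loader, device, epoch, writer):
    # Hypothetical helper: token-level accuracy under teacher forcing.
    encoder.eval()
    decoder.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for encoder_input, decoder_input, decoder_output in loader:
            encoder_input = encoder_input.to(device)
            decoder_input = decoder_input.to(device)
            decoder_output = decoder_output.to(device)
            _, encoder_hidden = encoder(encoder_input)
            logits = decoder(decoder_input, encoder_hidden)
            predictions = logits.argmax(dim=-1)
            correct += (predictions == decoder_output).sum().item()
            total += decoder_output.numel()
    encoder.train()
    decoder.train()
    accuracy = correct / max(total, 1)
    writer.add_scalar("Accuracy/eval", accuracy, epoch)
    return accuracy
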
Example #3
def test(**kwargs):
    test_dataset = NMTDataset(kwargs["src_test"], kwargs["tgt_test"])
    print("Dataset loaded successfully.")

    test_dl = DataLoader(test_dataset,
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn=collate_fn)

    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)
    model = TransformerModel(len(tokenizer.src_vocab),
                             len(tokenizer.tgt_vocab),
                             tokenizer,
                             embed_size,
                             n_heads,
                             dropout=dropout_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

    model.to(device)
    model.eval()
    bleu_score = 0
    test_loss = 0
    test_start_time = time.time()
    with torch.no_grad():
        for batch in test_dl:
            src_tensor, tgt_tensor, _, _ = model.tokenizer.encode(
                batch, device, return_tensor=True)
            src_tensor = src_tensor.transpose(0, 1)
            tgt_tensor = tgt_tensor.transpose(0, 1)
            trg_input = tgt_tensor[:, :-1]
            targets = tgt_tensor[:, 1:].contiguous().view(-1)
            preds = model(src_tensor, trg_input.to(device), device)

            loss = criterion(preds, targets)
            test_loss += loss.item() / BATCH_SIZE

            output = []
            for src in src_tensor:
                hyps = beam_search_transformer(
                    model, src.view(1, -1), beam_size, max_decoding_time_step,
                    model.tokenizer.src_vocab['[PAD]'],
                    model.tokenizer.tgt_vocab['[EOS]'], device)
                top_hyp = hyps[0]
                hyp_sent = ' '.join(top_hyp.value)
                output.append(hyp_sent)

            score = compute_bleu_score(output, batch[1])
            bleu_score += score
    print(
        f'Avg. test loss: {test_loss/len(test_dl):.5f} | BLEU Score: {bleu_score/len(test_dl)} | time elapsed: {time.time() - test_start_time}'
    )
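
compute_bleu_score(output, batch[1]) is another helper that is not shown. Assuming the hypotheses and references are whitespace-tokenized strings, a corpus-BLEU implementation along these lines would fit the call site; NLTK is used purely as an illustration, and the original repository may compute BLEU differently.

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def compute_bleu_score(hypotheses, references):
    # Hypothetical helper: corpus BLEU on a 0-100 scale.
    # hypotheses: list of predicted sentences (strings)
    # references: list of reference sentences (strings), one per hypothesis
    hyp_tokens = [hyp.split() for hyp in hypotheses]
    ref_tokens = [[ref.split()] for ref in references]  # corpus_bleu expects a list of reference lists
    smooth = SmoothingFunction().method1
    return corpus_bleu(ref_tokens, hyp_tokens, smoothing_function=smooth) * 100
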
Example #4
def train(**kwargs):
    print("loading dataset")
    train_dataset = NMTDataset(kwargs["src_train"], kwargs["tgt_train"])
    valid_dataset = NMTDataset(kwargs["src_valid"], kwargs["tgt_valid"])
    print("Dataset loaded successfully.")

    train_dl = DataLoader(train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn=collate_fn)
    valid_dl = DataLoader(valid_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn=collate_fn)
    tokenizer = SpaceTokenizer(
        src_vocab_path, tgt_vocab_path
    ) if kwargs["tokenizer"] == "space_tokenizer" else BertTokenizer(
        src_vocab_path, tgt_vocab_path)

    model = TransformerModel(len(tokenizer.src_vocab),
                             len(tokenizer.tgt_vocab),
                             tokenizer,
                             embed_size,
                             n_heads,
                             dropout=dropout_rate)
    model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=0.6,
                                 betas=(0.9, 0.98),
                                 eps=1e-9)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    torch.autograd.set_detect_anomaly(True)

    train_model(model, optimizer, criterion, scheduler, train_dl, valid_dl,
                BATCH_SIZE, epoch, device, kwargs["checkpoint_path"],
                kwargs["best_model"], beam_size, max_decoding_time_step)
Example #5
def test(**kwargs):
    print("loading dataset")
    test_dataset = NMTDataset(kwargs["src_test"], kwargs["tgt_test"])
    print("Dataset loaded successfully.")
    test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    if kwargs["tokenizer"] == "space_tokenizer":
        tokenizer = SpaceTokenizer(
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_nepali.json",
            base_path + "NMTtokenizers/spacetoken_vocab_files/vocab_english.json")
    else:
        tokenizer = BertTokenizer(
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_newa.json",
            base_path + "NMTtokenizers/wordpiece_vocab_files/vocab_eng.json")
    model = Seq2Seq(embed_size, hidden_size, tokenizer, dropout_rate=dropout_rate, n_layers=n_layers)
    model.to(device)
    model, _, _, _ = load_checkpt(model, kwargs['best_model'], device)
    eval_start_time = time.time()
    test_loss, bleu_score = evaluate(model, test_dl, 0, device, BATCH_SIZE, beam_size, max_decoding_time_step)
    print(f'Avg. test loss: {test_loss:.5f} | BLEU Score: {bleu_score} | time elapsed: {time.time() - eval_start_time}')
Example #6
        tokens += loss
        total_tokens += batch['ntokens']

        if i % 50 == 1:
            elapsed = time.time() - start
            print("Epoch Step: %d Loss: %f Tokens per Sec: %f." %
                  (i, loss / batch['ntokens'], total_tokens / elapsed))
            start = time.time()
            tokens = 0

    return total_loss / total_tokens


if __name__ == "__main__":
    dataset = NMTDataset.load_dataset_and_make_vectorizer(
        # "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_100000.csv"
        "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_70w.csv"
    )
    src_vocab_size = len(dataset.get_vectorizer().source_vocab)
    tgt_vocab_size = len(dataset.get_vectorizer().target_vocab)
    padding_idx = dataset.get_vectorizer().target_vocab.lookup_token('<MASK>')
    criterion = LabelSmoothing(size=tgt_vocab_size,
                               padding_idx=0,
                               smoothing=0.1)
    criterion.cuda()
    model = make_model(src_vocab_size, tgt_vocab_size, 6)
    model.cuda()
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 8000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))
    loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)
Example #7
                        default=1000)
    parser.add_argument('-n',
                        '--num_workers',
                        type=int,
                        help='Number of data loader workers',
                        default=16)
    parser.add_argument('-g',
                        '--gpu',
                        type=int,
                        help='GPU id',
                        default=0)
    parser.add_argument('--btec', type=str, help='Dataset key', default="ALL")
    args = parser.parse_args()
    src_key, trg_key = args.src_segment, args.trg_segment

    dataset = NMTDataset(args.src, args.trg, key=args.btec)
    dataset.sort(trg_key)
    set_reverse_vocab(dataset.trg_vocab[trg_key])
    model = nmt_model(len(dataset.src_vocab[src_key]),
                      len(dataset.trg_vocab[trg_key]))
    model_path = os.path.join('./NMT/LOG', args.src + "2" + args.trg,
                              args.btec)
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    model_path = os.path.join(model_path, src_key + "2" + trg_key)
    min_loss = 100
    try:
        model.load(model_path)
    except Exception:
        pass
Example #8
def train(input_sentences, output_sentences, input_vocab, output_vocab,
          input_reverse, output_reverse, hy, writer):
    dataset = NMTDataset(input_sentences, output_sentences, input_vocab,
                         output_vocab, input_reverse, output_reverse)
    loader = DataLoader(dataset,
                        batch_size=hy.batch_size,
                        shuffle=True,
                        drop_last=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    input_vocab_size = len(input_vocab.keys())
    output_vocab_size = len(output_vocab.keys())

    encoder = EncoderRNN(input_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)
    decoder = DecoderRNN(output_vocab_size, hy.embedding_size, hy.hidden_size,
                         hy.rnn_layers, hy.bidirectional, device)

    loss_function = nn.CrossEntropyLoss().to(device)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=hy.lr)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=hy.lr)

    n_iterations = 0
    loss_history = []
    training_accuracy = 0.

    encoder.train()
    decoder.train()

    for epoch in range(1, hy.num_epochs + 1):
        for encoder_input, decoder_input, decoder_output in tqdm(
                loader, desc="{}/{}".format(epoch, hy.num_epochs)):
            encoder_input = encoder_input.to(device)
            decoder_input = decoder_input.to(device)
            decoder_output = decoder_output.to(device)

            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            _, encoder_hidden = encoder(encoder_input)
            logits = decoder(decoder_input, encoder_hidden)

            loss = loss_function(
                logits.view(hy.batch_size * decoder_output.shape[1], -1),
                decoder_output.view(-1))

            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()

            writer.add_scalar("TrainingLoss", loss.item(), n_iterations)
            n_iterations = n_iterations + 1
            loss_history.append(loss.item())

        training_accuracy = compute_model_accuracy(encoder, decoder, loader,
                                                   device, epoch, writer)
        torch.save(encoder.state_dict(),
                   "saved_runs/encoder_{}_weights.pt".format(epoch))
        torch.save(decoder.state_dict(),
                   "saved_runs/decoder_{}_weights.pt".format(epoch))

    return loss_history, training_accuracy
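
Examples #8 and #2 share the same hy hyperparameter object and TensorBoard writer, and communicate through the per-epoch checkpoints written to saved_runs/. Below is a rough sketch of how the two could be wired together; the hyperparameter values and the vocab/sentence variables are placeholders, not values from the original repository.

from types import SimpleNamespace
from torch.utils.tensorboard import SummaryWriter

# Placeholder hyperparameters covering the attributes both functions read.
hy = SimpleNamespace(batch_size=64, embedding_size=256, hidden_size=512,
                     rnn_layers=2, bidirectional=True, lr=1e-3, num_epochs=10)
writer = SummaryWriter(log_dir="saved_runs/tensorboard")

# train() saves encoder_{epoch}_weights.pt / decoder_{epoch}_weights.pt,
# which evaluate() reloads epoch by epoch to report the best accuracy.
loss_history, train_acc = train(input_sentences, output_sentences, input_vocab,
                                output_vocab, input_reverse, output_reverse, hy, writer)
accuracies = evaluate(input_sentences, output_sentences, input_vocab, output_vocab,
                      input_reverse, output_reverse, hy, writer)
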
Example #9
    return NoamOpt(
        model.src_embed[0].d_model, 2, 4000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))


if __name__ == '__main__':
    args = get_argparse().parse_args()

    src_sp = spm.SentencePieceProcessor()
    src_sp.load(args.src_spm)
    trg_sp = spm.SentencePieceProcessor()
    trg_sp.load(args.trg_spm)

    train_dataset = NMTDataset(os.path.join(args.data, 'train.en'),
                               os.path.join(args.data, 'train.zh'), src_sp,
                               trg_sp)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)
    dev_dataset = NMTDataset(os.path.join(args.data, 'dev.en'),
                             os.path.join(args.data, 'dev.zh'), src_sp, trg_sp)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=args.test_batch_size,
                                shuffle=False,
                                collate_fn=collate_fn)
    test_dataset = NMTDataset(os.path.join(args.data, 'test.en'),
                              os.path.join(args.data, 'test.zh'), src_sp,
                              trg_sp)
    test_dataloader = DataLoader(test_dataset,