Example #1
0
def evaluate(lang, embedding, encoder, decoder, dev_pairs, max_length,
             infer_batch_size):
    """Run batched inference over *dev_pairs* and return the BLEU score.

    Args:
        lang: vocabulary/language object used by `indexesFromPair`.
        embedding, encoder, decoder: model components forwarded to `inference`.
        dev_pairs: list of (source_tokens, target_tokens) pairs.
        max_length: maximum decoding length.
        infer_batch_size: number of pairs decoded per batch.

    Returns:
        BLEU score of the decoded sentences against the reference targets.

    Side effects:
        Writes references to 'bleu/gold' and hypotheses to 'bleu/predict'.
    """
    target_words = []
    decoded_words = []
    for i in range(0, len(dev_pairs), infer_batch_size):
        dev_batch = dev_pairs[i:i + infer_batch_size]
        # Sort by source length (descending) so per-batch padding is minimal
        # and length-sorted encoders (pack_padded_sequence style) work.
        dev_batch = sorted(dev_batch, key=lambda pair: len(pair[0]),
                           reverse=True)
        target_words.extend(pair[1] for pair in dev_batch)
        dev_batch = [indexesFromPair(lang, pair) for pair in dev_batch]
        enc_lens = [len(pair[0]) for pair in dev_batch]
        enc_max_len = max(enc_lens)
        # Right-pad every source index sequence with 0 up to the batch max.
        enc = [pair[0] + [0] * (enc_max_len - len(pair[0]))
               for pair in dev_batch]
        enc = torch.tensor(enc, dtype=torch.long, device=device_cuda)

        decoded_words.extend(
            inference(lang, enc, enc_lens, embedding, encoder, decoder,
                      max_length))

    target_sentences = [''.join(words) for words in target_words]
    output_sentences = [''.join(words) for words in decoded_words]

    # `with` already closes the files — the original's explicit close() calls
    # inside the with-blocks were redundant and have been dropped.
    with open('bleu/gold', 'w', encoding='utf-8') as gw:
        gw.write('\n'.join(target_sentences))
    with open('bleu/predict', 'w', encoding='utf-8') as pr:
        pr.write('\n'.join(output_sentences))

    return bleu(target_sentences, output_sentences)
Example #2
0
    def compute_scores(self, matchs):
        """Aggregate BLEU and database-match statistics over *matchs*.

        An entry with more than 3 fields counts as "matched"; its rendered
        match is BLEU-scored against the test target's source text.
        Returns a dict of totals and per-total / per-matched averages.
        """
        bleu_matched = []
        nb_matched_all = []

        for match in matchs:
            if len(match) <= 3:
                continue
            test_src, test_tgt, direction, nb_matched, matched, matched_rendered = match
            bleu_matched.append(
                utils.bleu([test_tgt.source], [matched_rendered]))
            nb_matched_all.append(nb_matched)

        total = len(matchs)
        _matched = [len(entry) > 3 for entry in matchs]
        multi_matched = sum(1 for entry in matchs
                            if len(entry) > 3 and entry[3] > 1)
        nb_matched_entries = sum(_matched)
        nb_unmatched = sum(1 for flag in _matched if not flag)
        nb_perfect = sum(1 for score in bleu_matched if score == 1.0)
        nb_high = sum(1 for score in bleu_matched if 0.7 < score < 1.0)
        nb_low = sum(1 for score in bleu_matched if score < 0.7)

        return {
            'bleu_total': sum(bleu_matched) / total,
            'bleu_matched': sum(bleu_matched) / len(bleu_matched),
            'nb_entry_matched': nb_matched_entries,
            'avg_nb_entry_matched': nb_matched_entries / total,
            'nb_entry_multi_matched': multi_matched,
            'avg_nb_entry_multi_matched': multi_matched / total,
            'nb_database_match': sum(nb_matched_all),
            'avg-matched_nb_database_match':
                sum(nb_matched_all) / nb_matched_entries,
            'nb_no_matched': nb_unmatched,
            'avg_no_matched': nb_unmatched / total,
            'nb_bleu_1': nb_perfect,
            'avg-matched_nb_bleu_1': nb_perfect / nb_matched_entries,
            'nb_bleu_between_1_0.7': nb_high,
            'avg-matched_nb_bleu_between_1_0.7':
                nb_high / nb_matched_entries,
            'nb_bleu_below_0.7': nb_low,
            'avg-matched_nb_bleu_below_0.7': nb_low / nb_matched_entries,
            'total': total,
        }
Example #3
0
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

# sentence = "ein pferd geht unter einer brücke neben einem boot."
#
# translated_sentence = translate_sentence(
#     model, sentence, german, english, device, max_length=50
# )
# Pre-tokenised German example sentence ("a horse walks under a bridge
# next to a boat.").
sentence1 = [
    'ein', 'pferd', 'geht', 'unter', 'einer', 'brücke', 'neben', 'einem',
    'boot', '.'
]
# Translate the example with the current (just-loaded) model weights.
translated_sentence = translate_sentence(model,
                                         sentence1,
                                         german,
                                         english,
                                         device,
                                         max_length=50)
# exit()
# print(f"Translated1 example sentence: \n {sentence}")
# print(f"Translated1 example sentence: \n {translated_sentence}")

# exit()

# Train, then report BLEU on 99-example slices of the train and test sets.
train(model, device, load_model, save_model, german, english, train_data,
      valid_data, test_data, batch_size)
# running on entire test data takes a while
score = bleu(train_data[1:100], model, german, english, device)
print(f"Final Train Bleu score {score * 100:.2f}")

score = bleu(test_data[1:100], model, german, english, device)
print(f"Final Test Bleu score {score * 100:.2f}")
Example #4
0
def train(model, device, load_model, save_model, german_vocab, english_vocab,
          train_data, valid_data, test_data, batch_size):
    """Train *model* for `num_epochs` epochs over *train_data*.

    Per epoch: optionally checkpoints, translates one fixed German example
    sentence, reports BLEU on small train/test slices, then runs one
    optimisation pass over the training iterator. Relies on the module-level
    globals `learning_rate` and `num_epochs`.
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

    # Example sentence translated once per epoch as a qualitative check.
    sentence = "ein pferd geht unter einer brücke neben einem boot."
    # sentence = 'a little girl climbing into a wooden playhouse.'
    # sentence = "man stuffed smiling lion"
    #6 1 4 7 3 2 5 0
    # sentence = ['ein', 'pferd', 'geht', 'unter', 'einer', 'brücke', 'neben', 'einem', 'boot', '.']
    #sentence = ['The', 'study’s', 'questions', 'are', 'carefully', 'worded', 'and', 'chosen', '.']
    # sentence = 'The study questions are carefully worded and chosen.'

    # sentence = ['a', 'little', 'girl', 'climbing', 'into', 'a', 'wooden', 'playhouse', '.']
    # Drop the LR 10x when the mean epoch loss plateaus for 10 epochs.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10,
                                                           verbose=True)

    # Padding positions in the target contribute no loss.
    pad_idx = english_vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    # train_iterator, valid_iterator, test_iterator = Batcher(train_data, valid_data, test_data)
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device,
    )

    step = 0

    for epoch in range(num_epochs):
        print(f"[Epoch {epoch} / {num_epochs}]")

        # Checkpoint at the START of the epoch (i.e. before this epoch's
        # updates are applied).
        if save_model:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint)

        model.eval()
        # sentence = "Das wird sehr seltsam"
        # sentence = "Frankreich wird wohl Deutschland angreifen"

        translated_sentence = translate_sentence(model,
                                                 sentence,
                                                 german_vocab,
                                                 english_vocab,
                                                 device,
                                                 max_length=50)

        print(f"Translated example sentence: \n {sentence}")
        print(f"Translated example sentence: \n {translated_sentence}")
        # exit()

        # running on entire test data takes a while
        print("here1")
        score = bleu(train_data[1:10], model, german_vocab, english_vocab,
                     device)
        print(f"Train Bleu score {score * 100:.2f}")

        print("here2")
        score = bleu(test_data[1:50], model, german_vocab, english_vocab,
                     device)
        print(f"Test Bleu score {score * 100:.2f}")

        model.train()
        losses = []

        for batch_idx, batch in enumerate(train_iterator):
            # Get input and targets and get to cuda
            # print(batch_idx)
            inp_data = batch.src
            inp_data = inp_data.to(device)
            target = batch.trg
            target = target.to(device)

            # inp_data = batch[0].to(device)
            # target = batch[1].to(device)
            # Forward prop
            # print(target)
            # printSentences(inp_data, german_vocab)
            # printSentences2(target, english_vocab, inp_data, german_vocab)
            # Teacher forcing: decoder input is the target minus its last
            # token; the loss target below is the target minus its first.
            trg = target[:-1, :]
            # print(trg.shape)
            output = model(inp_data, trg)

            # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
            # doesn't take input in that form. For example if we have MNIST we want to have
            # output to be: (N, 10) and targets just (N). Here we can view it in a similar
            # way that we have output_words * batch_size that we want to send in into
            # our cost function, so we need to do some reshapin.
            # Let's also remove the start token while we're at it
            output = output.reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()

            loss = criterion(output, target)
            losses.append(loss.item())

            # Back prop
            loss.backward()
            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            # plot to tensorboard
            # writer.add_scalar("Training loss", loss, global_step=step)
            step += 1

        mean_loss = sum(losses) / len(losses)
        scheduler.step(mean_loss)
Example #5
0
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        #         GPUtil.showUtilization()
        loss = criterion(output, target)

        del output, inp_data, target
        torch.cuda.empty_cache()
        #         for i in range(100000000):
        #             continue
        # Back prop
        loss.mean().backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1
        del loss
        torch.cuda.empty_cache()
#         for i in range(100000000):
#             continue
# running on entire test data takes a while
# NOTE(review): source and target vocab are both `english` here — presumably
# a paraphrase/autoencoding task; verify against how `val` was built.
score = bleu(val[1:100], model, english, english, device)
print("Bleu score {}".format(score * 100))
Example #6
0
#     print(spe_dec.decode(src))
#     print("     ", trg)
#     print(spe_dec.decode(trg))

# `spe_dec` looks like a SentencePiece-style tokenizer (has pad_id/decode);
# the same vocabulary is shared by the source and target side.
src_vocab_size = len(spe_dec)
trg_vocab_size = len(spe_dec)
print("src vocabulary size: ", src_vocab_size)
print("trg vocabulary size: ", trg_vocab_size)
embedding_size = 256
src_pad_idx = spe_dec.pad_id()
print("pad_index = ", src_pad_idx)
print("===============================after loading")

model = Transformer(device, embedding_size, src_vocab_size, trg_vocab_size,
                    src_pad_idx).to(device)

load_model = True
save_model = True
learning_rate = 3e-4

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

if load_model:
    load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

# Report BLEU on 49-example slices of the train and test sets.
# with torch.no_grad():
score1 = bleu(train_data[1:50], model, spe_dec, spe_dec, device)

score2 = bleu(test_data[1:50], model, spe_dec, spe_dec, device)
print(f"Train Bleu score1 {score1 * 100:.2f}")
print(f"Test Bleu score2 {score2 * 100:.2f}")
Example #7
0
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it

        # (N, 10) and targets would be (N)
        output = output[1:].reshape(
            -1, output.shape[2]
        )  # output dimension which would be the size of the vocabulary and just gonna put everyting else together
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # To avoid exploding problems deprecated errors happen
        #clipping_value = 1  # arbitrary value of your choosing
        #torch.nn.utils.clip_grad_norm(model.parameters(), clipping_value)
        clipping_value = 1  # arbitrary value of your choosing
        torch.nn.utils.clip_grad_norm_(model.parameters(), clipping_value)

        # Gradient descent step
        optimizer.step()

        writer.add_scalar('Training loss', loss, global_step=step)
        step += 1

# Final BLEU over the FULL test set (slow compared to the sliced reports).
score = bleu(test_data, model, german, english, device)
print(f'Bleu score {score*100:.2f}')
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(seq2seq.state_dict(), 'tut2-model.pt')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

    
    seq2seq.load_state_dict(torch.load('tut2-model.pt'))

    test_loss = evaluate(seq2seq, test_iterator, criterion)
    print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')
    print("Bleu Score is :")
    print(bleu(test_data, seq2seq, config.source, config.target, config.device))
    example_idx = 0
    example = train_data.examples[example_idx]
    print('source sentence: ', ' '.join(example.src))
    print('target sentence: ', ' '.join(example.trg))

    src_tensor = config.source.process([example.src]).to(config.device)
    trg_tensor = config.target.process([example.trg]).to(config.device)

    seq2seq.eval()
    with torch.no_grad():
        outputs = seq2seq(src_tensor, trg_tensor, teacher_forcing_ratio=0)

    output_idx = outputs[1:].squeeze(1).argmax(1)
    result = ' '.join([config.target.vocab.itos[idx] for idx in output_idx])
Example #9
0
def train(model, device, load_model, save_model, german_vocab, english_vocab,
          train_data, valid_data, test_data, batch_size, LOAD_NEW_METHOD):
    """Train *model* for `num_epochs` epochs over *train_data*.

    Args:
        model: seq2seq model mapping (src, trg) -> logits shaped
            (trg_len, batch, vocab).
        device: torch device batches are moved to.
        load_model: load "my_checkpoint.pth.tar" before training.
        save_model: save a checkpoint at the start of every epoch.
        german_vocab, english_vocab: source/target vocabularies
            (torchtext-style objects with `.stoi`).
        train_data, valid_data, test_data: datasets for the iterators and
            the per-epoch BLEU reports.
        batch_size: batch size for the BucketIterator path.
        LOAD_NEW_METHOD: when truthy, batches come from the custom `Batcher`
            as (src, trg) tuples; otherwise from torchtext `BucketIterator`
            with `.src` / `.trg` attributes.

    Relies on the module-level globals `learning_rate` and `num_epochs`.
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    if load_model:
        load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)

    # Drop the LR 10x when the mean epoch loss plateaus for 10 epochs.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           factor=0.1,
                                                           patience=10,
                                                           verbose=True)

    # Padding positions in the target contribute no loss.
    pad_idx = english_vocab.stoi["<pad>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

    if LOAD_NEW_METHOD:
        train_iterator, valid_iterator, test_iterator = Batcher(
            train_data, valid_data, test_data)
    else:
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, valid_data, test_data),
            batch_size=batch_size,
            sort_key=lambda x: len(x.src),
            device=device,
        )

    step = 0

    for epoch in range(num_epochs):

        print(f"[Epoch {epoch} / {num_epochs}]")

        # Checkpoint at the START of the epoch, before this epoch's updates.
        if save_model:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint)

        model.eval()

        # running on entire test data takes a while
        print("here1")
        score = bleu(train_data[1:10], model, german_vocab, english_vocab,
                     device, LOAD_NEW_METHOD)
        print(f"Train Bleu score {score * 100:.2f}")

        print("here2")
        score = bleu(test_data[1:10], model, german_vocab, english_vocab,
                     device, LOAD_NEW_METHOD)
        print(f"Test Bleu score {score * 100:.2f}")

        model.train()
        losses = []
        t1 = time.time()
        for batch_idx, batch in enumerate(train_iterator):
            # The two iterator types expose the batch differently.
            if LOAD_NEW_METHOD:
                inp_data = batch[0].to(device)
                target = batch[1].to(device)
            else:
                inp_data = batch.src.to(device)
                target = batch.trg.to(device)

            # Teacher forcing: decoder input is the target minus its last
            # token; the loss target below is the target minus its first.
            trg = target[:-1, :]
            output = model(inp_data, trg)

            # CrossEntropyLoss wants (N, vocab) logits and (N,) targets, so
            # flatten the time and batch dimensions together and drop the
            # start token from the target.
            output = output.reshape(-1, output.shape[2])
            target = target[1:].reshape(-1)

            optimizer.zero_grad()

            loss = criterion(output, target)
            losses.append(loss.item())

            # Back prop
            loss.backward()
            # Clip to avoid exploding gradient issues, makes sure grads are
            # within a healthy range
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

            # Gradient descent step
            optimizer.step()

            step += 1
            if batch_idx % 100 == 0:
                print("batch " + str(batch_idx))
        t2 = time.time()
        print("epoch time = ", t2 - t1)
        mean_loss = sum(losses) / len(losses)
        scheduler.step(mean_loss)
Example #10
0
print("src vocabulary size: ", src_vocab_size)
print("trg vocabulary size: ", trg_vocab_size)
embedding_size = 512
# Padding index comes from the (English) target vocabulary.
src_pad_idx = english_vocab.stoi["<pad>"]
print(src_pad_idx)
print(english_vocab.itos[src_pad_idx])
print("===============================after loading ")

model = Transformer(device, embedding_size, src_vocab_size, trg_vocab_size,
                    src_pad_idx).to(device)

load_model = True
save_model = True
learning_rate = 3e-4

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

if load_model:
    load_checkpoint_zaid("my_checkpoint.pth.tar", model, optimizer)

# Report BLEU on small slices of the train and test sets.
print("here1")
score = bleu(train_data[1:10], model, german_vocab, english_vocab, device)
print(f"Train Bleu score {score * 100:.2f}")

print("here2")
score = bleu(test_data[1:50], model, german_vocab, english_vocab, device)
print(f"Test Bleu score {score * 100:.2f}")

# NOTE(review): loads from "my_checkpoint.pth.tar" but saves to
# "my_checkpointx.pth.tar" — confirm the differing filenames are intended.
if save_model:
    save_checkpoint_zaid("my_checkpointx.pth.tar", model, optimizer)
       # Back prop
       loss.backward()
       
       # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
       torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

       optimizer.step()

       training_loss.append(loss.item())
   
   avg_train_loss = np.average(training_loss)
   avg_trainloss_plot.append(avg_train_loss)
   
   model.eval()
   
   BleuScore = bleu(test_data[:3000], model, source_lang, target_lang, device,max_length=50,generate_outputs=False)
   Bleu_scores_plot.append(BleuScore)
   #print(f"Bleu score {score*100:.2f}")
   print(f'\n[epoch:{epoch}] -->>'+f'training loss is: {avg_train_loss:.5f}' + f' Bleu Score is: {BleuScore*100:.5f}')
   #reset losses for next epoch
   training_loss = []
   
   #Earlystop check
   if Max_BleuScore == None:
       Max_BleuScore = BleuScore
       #print("First min loss",min_loss)
       print("saving first model")
       torch.save(model,checkpoint_path)
   elif BleuScore > Max_BleuScore:
       Max_BleuScore = BleuScore
       print("New max bleu score found , saving model")
Example #12
0
def oracle(matched_many, target):
    """Return the index of the candidate in *matched_many* with the highest
    BLEU score against *target* (smallest index wins ties)."""
    scores = [utils.bleu([candidate], [target]) for candidate in matched_many]
    # Stable sort on negated score: best-scoring, earliest index comes first.
    ranked = sorted(range(len(scores)), key=lambda i: -scores[i])
    return ranked[0]
Example #13
0
def main():
    """Train and validate a seq2seq (optionally attentional) sentence model.

    Builds file paths and hyper-parameters, loads datasets and (optionally)
    pretrained embeddings and an older checkpoint, then alternates train /
    validation phases for `params["nb_epochs"]` epochs, logging losses,
    accuracy, perplexity and BLEU to stdout and to `file["output_file"]`.
    """
    # Identifiers used to derive model/checkpoint/output file paths.
    file = {
        "model_group": "/seq_len_exp",
        "model_name": "/generation_6",
        "model_version": 0,
        "project_file": "/home/mattd/PycharmProjects/reddit/generation"
    }

    file["dataset_path"] = "{}/data/".format(file["project_file"])

    file["model_filename"] = '{}{}s{}_{}'.format(file["project_file"],
                                                 file["model_group"],
                                                 file["model_name"],
                                                 file["model_version"])

    file["output_file"] = '{}{}_outputs{}_{}'.format(file["project_file"],
                                                     file["model_group"],
                                                     file["model_name"],
                                                     file["model_version"])

    #check_files(file)

    # Version 0 means "fresh run"; otherwise try to resume from version - 1.
    use_old_model = file["model_version"] != 0
    params = {}

    if use_old_model:
        file["old_model_filename"] = '{}{}s{}_{}'.format(
            file["project_file"], file["model_group"], file["model_name"],
            file["model_version"] - 1)
        params, old_files = load_params(file["old_model_filename"])
        # Fall back to a fresh run if no old files were found.
        use_old_model = old_files != {}

    if not use_old_model:
        # Default hyper-parameters for a fresh model.
        params = {
            "attention": True,
            "batch_size": 325,
            "hidden_size": 256,
            "embedding_dim": 300,
            "pretrained_embeddings": True,
            "max_grad_norm": 5,
            "max_len": 30,
            "min_count": 2,
            "weight_decay": 0.00001,
            "learning_rate": 0.005,
        }

    # -1 means "use all examples" (see the prune_examples guards below).
    params["num_training_examples"] = 78260
    params["num_val_examples"] = -1
    params["nb_epochs"] = 40

    if params["pretrained_embeddings"]:
        file["pretrained_embeddings_file"] = \
            "/embeddings/embeddings_min{}_max{}.npy".format(
            params["min_count"], params["max_len"])

    # Log the full configuration to stdout and to the output buffer.
    string = ""
    for k, v in file.items():
        string += "{}: {}\n".format(k, v)
    for k, v in params.items():
        string += "{}: {}\n".format(k, v)

    print(string)
    output = string + '\n'

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(file["dataset_path"])
    dataset_val_filename = "{}validation.csv".format(file["dataset_path"])

    # Validation reuses the training vocabulary.
    dataset_train = SentenceDataset(dataset_train_filename, params["max_len"],
                                    params["min_count"])
    dataset_val = SentenceDataset(dataset_val_filename, params["max_len"],
                                  params["min_count"], dataset_train.vocab)

    string = 'Vocab size {}\n'.format(len(dataset_train.vocab))
    string += 'Train {} '.format(len(dataset_train))

    if params["num_training_examples"] != -1:
        dataset_train.prune_examples(params["num_training_examples"])
        string += '-> {}'.format(len(dataset_train))

    string += '\nVal: {}'.format(len(dataset_val))

    if params["num_val_examples"] != -1:
        dataset_val.prune_examples(params["num_val_examples"])
        string += '-> {}'.format(len(dataset_val))

    print(string)
    output += string + '\n'

    if params["pretrained_embeddings"]:
        embeddings_dir = '{}{}'.format(file["project_file"],
                                       file["pretrained_embeddings_file"])
        pretrained_embeddings = cuda(get_pretrained_embeddings(embeddings_dir))
        # The configured embedding_dim is overridden by the loaded matrix.
        params["embedding_dim"] = pretrained_embeddings.shape[1]
    else:
        pretrained_embeddings = None

    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    params["batch_size"],
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  params["batch_size"],
                                                  shuffle=False)

    vocab_size = len(dataset_train.vocab)
    padding_idx = dataset_train.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset_train.vocab[SentenceDataset.INIT_TOKEN]

    if params["attention"] is True:
        model = Seq2SeqModelAttention(params["hidden_size"], padding_idx,
                                      init_idx, params["max_len"], vocab_size,
                                      params["embedding_dim"],
                                      pretrained_embeddings)
    else:
        model = Seq2SeqModel(params["hidden_size"], padding_idx, init_idx,
                             params["max_len"], vocab_size,
                             params["embedding_dim"], pretrained_embeddings)

    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=params["weight_decay"],
                                 lr=params["learning_rate"])
    criterion = torch.nn.CrossEntropyLoss()

    if use_old_model:
        model, optimizer = load_checkpoint(file["old_model_filename"], model,
                                           optimizer)

    # Sentinel: presumably real losses always start below 100 — verify.
    lowest_loss = 100
    train_loss = []
    val_loss = []
    # NOTE(review): best_model/best_optimizer alias the live objects (no
    # deepcopy), so they track the current weights, not a frozen snapshot.
    best_model = model
    best_optimizer = optimizer
    average_epoch_loss = 0

    metrics = {
        "token_accuracy": [],
        "sentence_accuracy": [],
        "perplexity": [],
        "bleu": {
            'bleu_1': [],
            'bleu_2': [],
            'bleu_3': [],
            'bleu_4': []
        }
    }

    # Write the configuration header; the file is rewritten in full at the
    # end of every epoch.
    outfile = open(file["output_file"], 'w')
    outfile.write(output)
    outfile.close()

    phases = [
        'train',
        'val',
    ]
    data_loaders = [
        data_loader_train,
        data_loader_val,
    ]

    # Number of progress log lines printed per training epoch.
    intervals = 2
    highest_acc = 0

    # NOTE(review): time.clock() was removed in Python 3.8 — this code
    # presumably targets an older interpreter; confirm before upgrading.
    for epoch in range(0, params["nb_epochs"]):
        start = time.clock()
        string = 'Epoch: {}\n'.format(epoch)
        print(string, end='')
        output = output + '\n' + string

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
                string = 'Train: \n'
            else:
                model.eval()
                string = 'Validation \n'

            print(string, end='')
            output = output + '\n' + string

            epoch_bleu = {
                'bleu_1': [],
                'bleu_2': [],
                'bleu_3': [],
                'bleu_4': []
            }
            epoch_loss = []
            epoch_sentenence_accuracy = []
            epoch_token_accuracy = []
            j = 1

            for i, (sentence_1, sentence_2) in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                sentence_1 = variable(sentence_1)
                sentence_2 = variable(sentence_2)

                outputs = model(sentence_1, sentence_2)

                # Flatten targets and logits for CrossEntropyLoss.
                targets = sentence_2.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters,
                                                   params["max_grad_norm"])
                    optimizer.step()
                    # Print a progress line `intervals` times per epoch.
                    if (len(data_loader) / intervals) * j <= i + 1:
                        string = ('Example {:03d} | {} loss: {:.3f}'.format(
                            i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1
                else:
                    # get result metrics
                    predicted = torch.argmax(
                        outputs.view(-1, params["max_len"], vocab_size), -1)
                    batch_sentence_accuracy, batch_token_accuracy = \
                        encoder_accuracy(
                        targets.view(-1, params["max_len"]), predicted)
                    epoch_sentenence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

                    # BLEU-1..4 on the un-flattened target/prediction pairs.
                    bleu_1 = bleu(targets.view(-1, params["max_len"]),
                                  predicted, 1)
                    bleu_2 = bleu(targets.view(-1, params["max_len"]),
                                  predicted, 2)
                    bleu_3 = bleu(targets.view(-1, params["max_len"]),
                                  predicted, 3)
                    bleu_4 = bleu(targets.view(-1, params["max_len"]),
                                  predicted, 4)
                    epoch_bleu["bleu_1"].append(bleu_1)
                    epoch_bleu["bleu_2"].append(bleu_2)
                    epoch_bleu["bleu_3"].append(bleu_3)
                    epoch_bleu["bleu_4"].append(bleu_4)

            if phase == 'val':
                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)

                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                string += ' | lowest loss: {:.3f} highest accuracy: ' \
                    '{:.3f}'.format(lowest_loss, highest_acc)
                print(string, end='\n')
                output = output + '\n' + string + '\n'

                average_epoch_sentenence_accuracy = np.mean(
                    epoch_sentenence_accuracy)
                average_epoch_token_accuracy = np.mean(epoch_token_accuracy)
                perplexity = np.exp(average_epoch_loss)

                average_epoch_bleu = {
                    'bleu_1': np.mean(epoch_bleu['bleu_1']),
                    'bleu_2': np.mean(epoch_bleu['bleu_2']),
                    'bleu_3': np.mean(epoch_bleu['bleu_3']),
                    'bleu_4': np.mean(epoch_bleu['bleu_4'])
                }

                metrics["token_accuracy"].append(average_epoch_token_accuracy)
                metrics["sentence_accuracy"].append(
                    average_epoch_sentenence_accuracy)
                metrics["perplexity"].append(perplexity)
                metrics["bleu"]["bleu_1"].append(average_epoch_bleu['bleu_1'])
                metrics["bleu"]["bleu_2"].append(average_epoch_bleu['bleu_2'])
                metrics["bleu"]["bleu_3"].append(average_epoch_bleu['bleu_3'])
                metrics["bleu"]["bleu_4"].append(average_epoch_bleu['bleu_4'])

                # NOTE(review): best_epoch is only bound here — if the first
                # epoch's loss is >= lowest_loss (100), the save_checkpoint
                # call below would raise NameError. Verify intended.
                if average_epoch_loss < lowest_loss:
                    best_model = model
                    best_optimizer = optimizer
                    best_epoch = epoch
                    lowest_loss = average_epoch_loss

                save_checkpoint(best_epoch, best_model, best_optimizer, epoch,
                                model, optimizer, train_loss, val_loss,
                                metrics, params, file)

                if average_epoch_token_accuracy > highest_acc:
                    highest_acc = average_epoch_token_accuracy

                string = "Token_accuracy: {:.3f}\nSentence_accuracy: {:.3f}\n".format(
                    average_epoch_token_accuracy,
                    average_epoch_sentenence_accuracy)
                string += "Perplexity: {:.3f}\n".format(perplexity)
                string += "Bleu: {}\n".format(average_epoch_bleu)
                print(string, end='\n')
                output = output + string + '\n'

                # Decode one random validation example for qualitative
                # inspection: > source, < model output, = reference.
                random_idx = np.random.randint(len(dataset_val))
                sentence_1, sentence_2 = dataset_val[random_idx]
                sentence_1_var = variable(sentence_1)
                sentence_2_var = variable(sentence_2)

                outputs_var = model(sentence_1_var.unsqueeze(0))  # unsqueeze
                #  to get the batch dimension
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(
                    get_sentence_from_indices(sentence_1, dataset_val.vocab,
                                              SentenceDataset.EOS_TOKEN))

                string = string + u'< {}\n'.format(
                    get_sentence_from_indices(outputs, dataset_val.vocab,
                                              SentenceDataset.EOS_TOKEN))

                string = string + u'= {}\n'.format(
                    get_sentence_from_indices(sentence_2, dataset_val.vocab,
                                              SentenceDataset.EOS_TOKEN))

                print(string, end='\n\n')
                output = output + string + '\n' + '\n'
            else:
                train_loss.append(average_epoch_loss)
        # Rewrite the full log file at the end of every epoch.
        outfile = open(file["output_file"], 'w')
        outfile.write(output)
        outfile.close()