#                                               collate_fn=collate_fn_vi,
#                                               shuffle=True)
# Validation and test loaders.
# shuffle=False: batch order is irrelevant to evaluation metrics, and a
# fixed order keeps validation/test runs reproducible. (The original used
# shuffle=True here, which only added nondeterminism.)
val_loader_vi = torch.utils.data.DataLoader(dataset=val_vi,
                                            batch_size=batch_size,
                                            collate_fn=collate_fn_vi,
                                            shuffle=False)
test_loader_vi = torch.utils.data.DataLoader(dataset=test_vi,
                                             batch_size=batch_size,
                                             collate_fn=collate_fn_vi,
                                             shuffle=False)

# Build the seq2seq model: an encoder over the Vietnamese vocabulary and a
# decoder over the English vocabulary, both moved to the active device.
encoder = EncoderRNN(lang_vi.n_words, hidden_size, n_layers, dropout=dropout)
encoder = encoder.to(device)

decoder = DecoderRNN(hidden_size, lang_en_vi.n_words, n_layers, dropout=dropout)
decoder = decoder.to(device)

# Optimizers and loss function.
# The decoder trains with a scaled learning rate (decoder_learning_ratio
# multiplies the base rate); both use Adam.
decoder_lr = learning_rate * decoder_learning_ratio
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=decoder_lr)
criterion = nn.CrossEntropyLoss().to(device)

# Wall-clock baseline used for elapsed-time / running-average reporting.
start = time.time()