def __init__(self, model, train_e_path, train_f_path, 
               dev_e_path, dev_f_path, dev_wa,
               test_e_path, test_f_path, test_wa,
               num_epochs=5, 
               batch_size=16, max_length=30, lr=0.1, lr_decay=0.001, session=None):
    """Initialize the trainer with a model."""

    self.model = model
    self.train_e_path = train_e_path
    self.train_f_path = train_f_path
    self.dev_e_path = dev_e_path
    self.dev_f_path = dev_f_path
    self.dev_wa = dev_wa
    self.test_e_path = test_e_path
    self.test_f_path = test_f_path
    self.test_wa = test_wa
    
    self.num_epochs = num_epochs
    self.batch_size = batch_size
    self.max_length = max_length
    self.lr = lr
    self.lr_decay = lr_decay
    self.session = session

    self.epoch_loss = []
    self.val_loss = []
    self.val_aer = []
    self.test_aer = []
    self.epoch_kl = []
    self.epoch_ce = []
    self.save_points = 0

    print("Training with B={} max_length={} lr={} lr_decay={}".format(
        batch_size, max_length, lr, lr_decay))

    self._build_optimizer()
    
    # This loads the data into memory so that we can easily shuffle it.
    # If this takes too much memory, shuffle the data on disk
    # and use bitext_reader directly (a streaming sketch follows
    # this example).
    self.corpus = list(bitext_reader(
        smart_reader(train_e_path), 
        smart_reader(train_f_path), 
        max_length=max_length))    
    self.dev_corpus = list(bitext_reader(
        smart_reader(dev_e_path), 
        smart_reader(dev_f_path)))
    self.test_corpus = list(bitext_reader(
        smart_reader(test_e_path),
        smart_reader(test_f_path)))
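
# For the streaming alternative mentioned above: re-create the readers on
# every epoch and iterate lazily instead of materializing a list. This is
# a minimal sketch, assuming the files were already shuffled on disk and
# using smart_reader/bitext_reader exactly as above.
def stream_bitext(e_path, f_path, max_length=30):
    """Yield sentence pairs one at a time without loading the corpus."""
    return bitext_reader(smart_reader(e_path),
                         smart_reader(f_path),
                         max_length=max_length)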
Example #2
def demo_number_filtered_sentence_pairs(src_path, trg_path):
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    max_length = 30
    bitext = bitext_reader(src_reader, trg_reader, max_length=max_length)
    num_sentences = sum(1 for _ in bitext)
    print("There are {} sentences with max_length = {}".format(
        num_sentences, max_length))
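
# Example usage, assuming the training paths from the earlier snippets:
# demo_number_filtered_sentence_pairs(train_e_path, train_f_path)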
Example #3
def bitext_reader_demo(src_path, trg_path):
    """Demo of the bitext reader."""

    # create a reader
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    bitext = bitext_reader(src_reader, trg_reader)

    # to see that it really works, try this:
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
Example #4
# ### Mini-batching
#
# With our vocabulary, we still need a method that converts a whole sentence to a
# sequence of IDs. And, to speed up training, we would like to get a so-called
# mini-batch at a time: multiple such sequences together. So our function takes a
# corpus iterator and a vocabulary, and returns a mini-batch of shape [Batch, Time],
# where the first dimension indexes the sentences in the batch, and the second the
# time steps in each sentence.
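
# Conceptually, the padding step can be sketched as follows. Illustrative
# only: it assumes a Vocabulary with a get_token_id() lookup and PAD id 0,
# not the actual implementation in utils.

import numpy as np

def prepare_data_sketch(batch, vocab_src, vocab_trg, pad_id=0):
    """Map tokens to IDs and pad every sentence, giving [Batch, Time] arrays."""
    def to_padded_ids(sentences, vocab):
        ids = [[vocab.get_token_id(tok) for tok in sent] for sent in sentences]
        time = max(len(s) for s in ids)
        out = np.full((len(ids), time), pad_id, dtype=np.int64)
        for i, s in enumerate(ids):
            out[i, :len(s)] = s  # left-aligned; trailing positions stay PAD
        return out
    x = to_padded_ids([src for src, trg in batch], vocab_src)
    y = to_padded_ids([trg for src, trg in batch], vocab_trg)
    return x, y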

# In[19]:

from pprint import pprint  # used in the demo loop below
from utils import iterate_minibatches, prepare_data

# Let's try it out!

# In[20]:

src_reader = smart_reader(train_e_path)
trg_reader = smart_reader(train_f_path)
bitext = bitext_reader(src_reader, trg_reader)

for batch_id, batch in enumerate(iterate_minibatches(bitext, batch_size=4)):

    print("This is the batch of data that we will train on, as tokens:")
    pprint(batch)
    print()

    x, y = prepare_data(batch, vocabulary_e, vocabulary_f)

    print("These are our inputs (i.e. words replaced by IDs):")
    print(x)
    print()

    print("These are the outputs (the foreign sentences):")
    print(y)
Example #5
import pickle

import tensorflow as tf

max_tokens = 1000
# max_tokens = 7000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
with open("vocabulary_e.pkl", mode="wb") as f:
    pickle.dump(vocabulary_e, f)
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
with open("vocabulary_f.pkl", mode="wb") as f:
    pickle.dump(vocabulary_f, f)
print("French vocabulary size: {}".format(len(vocabulary_f)))

# load test corpus
test_corpus = list(
    bitext_reader(smart_reader(test_e_path), smart_reader(test_f_path)))

# run
tf.reset_default_graph()

# Optional GPU configuration; pass it in with tf.Session(config=config):
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.95
# config.gpu_options.allow_growth = True

with tf.Session() as sess:
    # with tf.device("/cpu:0"):
    # with tf.device("/gpu:1"):
    # Hyper-parameter setup and the training loop would follow here
    # (tweak them as you wish; see the values in the next example).
    pass
Example #6
import pickle

max_tokens = 1000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
with open("vocabulary_e.pkl", mode="wb") as f:
    pickle.dump(vocabulary_e, f)
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
with open("vocabulary_f.pkl", mode="wb") as f:
    pickle.dump(vocabulary_f, f)
print("French vocabulary size: {}".format(len(vocabulary_f)))



dev_corpus = list(bitext_reader(
    smart_reader(dev_e_path),
    smart_reader(dev_f_path)))

# some hyper-parameters
# tweak them as you wish
batch_size = 25  # on CPU, use something much smaller e.g. 1-16
max_length = 20
lr = 0.0001
lr_decay = 0.0  # set to 0.0 when using Adam optimizer (default)
emb_dim = 64
mlp_dim = 128
num_epochs = 1
gated = True
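
# A sketch of how these settings could be wired together, following the
# Trainer signature from Example #1. NeuralIBM1Model and Trainer are
# placeholder names (assumptions, not a confirmed API), and the dev/test
# path variables are assumed to be defined as in the earlier examples.
model = NeuralIBM1Model(vocabulary_e, vocabulary_f,  # hypothetical model class
                        emb_dim=emb_dim, mlp_dim=mlp_dim, gated=gated)
trainer = Trainer(model,  # same argument order as Example #1's __init__
                  train_e_path, train_f_path,
                  dev_e_path, dev_f_path, dev_wa,
                  test_e_path, test_f_path, test_wa,
                  num_epochs=num_epochs, batch_size=batch_size,
                  max_length=max_length, lr=lr, lr_decay=lr_decay)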