def __init__(self, model, train_e_path, train_f_path,
             dev_e_path, dev_f_path, dev_wa,
             test_e_path, test_f_path, test_wa,
             num_epochs=5, batch_size=16, max_length=30,
             lr=0.1, lr_decay=0.001, session=None):
    """Initialize the trainer with a model."""

    self.model = model
    self.train_e_path = train_e_path
    self.train_f_path = train_f_path
    self.dev_e_path = dev_e_path
    self.dev_f_path = dev_f_path
    self.dev_wa = dev_wa
    self.test_e_path = test_e_path
    self.test_f_path = test_f_path
    self.test_wa = test_wa
    self.num_epochs = num_epochs
    self.batch_size = batch_size
    self.max_length = max_length
    self.lr = lr
    self.lr_decay = lr_decay
    self.session = session

    self.epoch_loss = []
    self.val_loss = []
    self.val_aer = []
    self.test_aer = []
    self.epoch_kl = []
    self.epoch_ce = []
    self.save_points = 0

    print("Training with B={} max_length={} lr={} lr_decay={}".format(
        batch_size, max_length, lr, lr_decay))

    self._build_optimizer()

    # This loads the data into memory so that we can easily shuffle it.
    # If this takes too much memory, shuffle the data on disk
    # and use bitext_reader directly.
    self.corpus = list(bitext_reader(
        smart_reader(train_e_path),
        smart_reader(train_f_path),
        max_length=max_length))
    self.dev_corpus = list(bitext_reader(
        smart_reader(dev_e_path),
        smart_reader(dev_f_path)))
    self.test_corpus = list(bitext_reader(
        smart_reader(test_e_path),
        smart_reader(test_f_path)))
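# For orientation, a minimal sketch of how this constructor might be invoked.
# This is an assumption, not code from the notebook: the class name `Trainer`
# and the pre-built `model` object are hypothetical here; the path variables
# are the ones defined earlier in the notebook. Kept commented out since the
# class and model are not defined in this excerpt.
#
# with tf.Session() as sess:
#     trainer = Trainer(
#         model,
#         train_e_path, train_f_path,
#         dev_e_path, dev_f_path, dev_wa,
#         test_e_path, test_f_path, test_wa,
#         num_epochs=5, batch_size=16, max_length=30,
#         lr=0.1, lr_decay=0.001, session=sess)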
def demo_number_filtered_sentence_pairs(src_path, trg_path):
    """Count the sentence pairs that survive the length filter."""
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    max_length = 30
    bitext = bitext_reader(src_reader, trg_reader, max_length=max_length)
    num_sentences = sum(1 for _ in bitext)
    print("There are {} sentences with max_length = {}".format(
        num_sentences, max_length))
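# Example invocation, assuming train_e_path and train_f_path are already set
# (they are used the same way in the mini-batching demo below):
demo_number_filtered_sentence_pairs(train_e_path, train_f_path)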
def bitext_reader_demo(src_path, trg_path):
    """Demo of the bitext reader."""

    # create a reader
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    bitext = bitext_reader(src_reader, trg_reader)

    # to see that it really works, try this:
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
# ### Mini-batching
#
# With our vocabulary in place, we still need a method that converts a whole
# sentence into a sequence of IDs. And, to speed up training, we would like to
# process a so-called mini-batch at a time: multiple such sequences together.
# So our function takes a corpus iterator and a vocabulary, and returns a
# mini-batch of shape [Batch, Time], where the first dimension indexes the
# sentences in the batch and the second the time steps in each sentence.
# (A simplified sketch of the padding step follows the demo below.)

# In[19]:

from utils import iterate_minibatches, prepare_data

# Let's try it out!

# In[20]:

src_reader = smart_reader(train_e_path)
trg_reader = smart_reader(train_f_path)
bitext = bitext_reader(src_reader, trg_reader)

for batch_id, batch in enumerate(iterate_minibatches(bitext, batch_size=4)):

    print("This is the batch of data that we will train on, as tokens:")
    pprint(batch)
    print()

    x, y = prepare_data(batch, vocabulary_e, vocabulary_f)

    print("These are our inputs (i.e. words replaced by IDs):")
    print(x)
    print()

    print("These are the outputs (the foreign sentences):")
    print(y)

    break  # only show the first mini-batch; remove this to iterate over all of them
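# To make the [Batch, Time] layout concrete, here is a simplified sketch of
# the padding step that prepare_data performs. This is NOT the utils
# implementation: the real helper works with the Vocabulary class and its own
# special IDs, while this sketch uses a plain dict and illustrative pad/UNK IDs.

import numpy as np

def pad_batch_sketch(sentences, token_to_id, pad_id=0, unk_id=1):
    """Map tokens to IDs and pad to a [Batch, Time] matrix.
    Unknown words map to unk_id; shorter sentences are padded with pad_id."""
    ids = [[token_to_id.get(t, unk_id) for t in sent] for sent in sentences]
    max_time = max(len(seq) for seq in ids)
    batch = np.full((len(ids), max_time), pad_id, dtype=np.int64)
    for i, seq in enumerate(ids):
        batch[i, :len(seq)] = seq
    return batch

# Toy usage: two sentences of different lengths end up in one padded matrix.
toy_vocab = {"the": 2, "cat": 3, "sat": 4}
print(pad_batch_sketch([["the", "cat"], ["the", "cat", "sat"]], toy_vocab))
# [[2 3 0]
#  [2 3 4]]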
max_tokens = 1000
# max_tokens = 7000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))

# load test corpus
test_corpus = list(bitext_reader(
    smart_reader(test_e_path),
    smart_reader(test_f_path)))

# run
tf.reset_default_graph()

# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.95
# config = tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.95))
# config.gpu_options.allow_growth = True

with tf.Session() as sess:
    # with tf.device("/cpu:0"):
    # with tf.device("/gpu:1"):

    # some hyper-parameters
    # tweak them as you wish
max_tokens = 1000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))

dev_corpus = list(bitext_reader(
    smart_reader(dev_e_path),
    smart_reader(dev_f_path)))

# some hyper-parameters
# tweak them as you wish
batch_size = 25  # on CPU, use something much smaller, e.g. 1-16
max_length = 20
lr = 0.0001
lr_decay = 0.0  # set to 0.0 when using the Adam optimizer (default)
emb_dim = 64
mlp_dim = 128
num_epochs = 1
gated = True
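# The lr/lr_decay pair above hints at a learning-rate schedule inside the
# trainer. The exact formula is not shown in this excerpt; the inverse-time
# decay below is an assumed, illustrative form, not the trainer's actual code.

def decayed_lr_sketch(lr, lr_decay, t):
    """Inverse-time decay: lr / (1 + lr_decay * t). Assumed form, for illustration."""
    return lr / (1.0 + lr_decay * t)

# With lr_decay = 0.0 (as above, since Adam adapts its own step sizes),
# the schedule stays constant:
print([decayed_lr_sketch(0.0001, 0.0, t) for t in range(3)])  # [0.0001, 0.0001, 0.0001]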