def demo_number_filtered_sentence_pairs(src_path, trg_path):
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    max_length = 30
    bitext = bitext_reader(src_reader, trg_reader, max_length=max_length)
    num_sentences = sum([1 for _ in bitext])
    print("There are {} sentences with max_length = {}".format(
        num_sentences, max_length))
def bitext_reader_demo(src_path, trg_path):
    """Demo of the bitext reader."""

    # create a reader
    src_reader = smart_reader(src_path)
    trg_reader = smart_reader(trg_path)
    bitext = bitext_reader(src_reader, trg_reader)

    # to see that it really works, try this:
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
    print(next(bitext))
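# The two demos above assume `smart_reader` and `bitext_reader`, which are
# defined elsewhere in this notebook. The sketch below is a rough,
# non-authoritative assumption about the interface they rely on (the real
# implementations may differ): `smart_reader` yields tokenized sentences from
# a plain or gzipped text file, and `bitext_reader` pairs up a source and a
# target reader, optionally dropping pairs longer than `max_length`.
import gzip


def smart_reader_sketch(path, lowercase=True):
    """Sketch: yield tokenized sentences from a (possibly gzipped) text file."""
    open_fn = gzip.open if path.endswith(".gz") else open
    with open_fn(path, mode="rt", encoding="utf-8") as f:
        for line in f:
            line = line.lower() if lowercase else line
            yield line.split()


def bitext_reader_sketch(src_sequences, trg_sequences, max_length=0):
    """Sketch: yield (source, target) pairs, optionally filtered by length."""
    for src_seq, trg_seq in zip(src_sequences, trg_sequences):
        if max_length > 0 and (len(src_seq) > max_length or
                               len(trg_seq) > max_length):
            continue
        yield src_seq, trg_seq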
def vocabulary_demo():

    # We used up a few lines in the previous example, so we set up
    # our data generator again.
    corpus = smart_reader(train_e_path)

    # Let's create a vocabulary given our (tokenized) corpus
    vocabulary = Vocabulary(corpus=corpus)
    print("Original vocabulary size: {}".format(len(vocabulary)))

    # Now we only keep the highest-frequency words
    vocabulary_size = 1000
    vocabulary.trim(vocabulary_size)
    print("Trimmed vocabulary size: {}".format(len(vocabulary)))

    # Now we can get word indexes using vocabulary.get_token_id():
    for t in ["<PAD>", "<UNK>", "the"]:
        print("The index of \"{}\" is: {}".format(t, vocabulary.get_token_id(t)))

    # And the inverse too, using vocabulary.get_token():
    for i in range(10):
        print("The token with index {} is: {}".format(i, vocabulary.get_token(i)))

    # Now let's try to get a word ID for a word not in the vocabulary
    # we should get 1 (so, <UNK>)
    for t in ["!@!_not_in_vocab_!@!"]:
        print("The index of \"{}\" is: {}".format(t, vocabulary.get_token_id(t)))
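# `vocabulary_demo` relies on a Vocabulary class defined elsewhere in the
# notebook. The sketch below is an assumption about its interface, not the
# original implementation: it maps tokens to integer IDs with <UNK> at index 1
# (as the demo expects for out-of-vocabulary words), supports trimming to the
# most frequent tokens, and provides get_token_id / get_token lookups.
from collections import Counter


class VocabularySketch:
    """Hypothetical minimal vocabulary: token <-> ID mapping with specials."""

    def __init__(self, corpus=None, max_tokens=None):
        self.counts = Counter()
        if corpus is not None:
            for tokens in corpus:
                self.counts.update(tokens)
        self._build(max_tokens)

    def _build(self, max_tokens=None):
        # Reserve <PAD> and <UNK>; keep the most frequent tokens after them.
        self.i2t = ["<PAD>", "<UNK>"]
        self.i2t.extend(tok for tok, _ in self.counts.most_common(max_tokens))
        self.t2i = {tok: i for i, tok in enumerate(self.i2t)}

    def __len__(self):
        return len(self.i2t)

    def trim(self, max_tokens):
        self._build(max_tokens)

    def get_token_id(self, token):
        return self.t2i.get(token, 1)  # 1 = <UNK>

    def get_token(self, token_id):
        return self.i2t[token_id]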
def __init__(self, model, train_e_path, train_f_path,
             dev_e_path, dev_f_path, dev_wa,
             num_epochs=5, batch_size=16, max_length=30,
             lr=0.1, lr_decay=0.001, model_path="./model.ckpt",
             session=None):
    """Initialize the trainer with a model."""
    self.model = model
    self.train_e_path = train_e_path
    self.train_f_path = train_f_path
    self.dev_e_path = dev_e_path
    self.dev_f_path = dev_f_path
    self.dev_wa = dev_wa
    self.num_epochs = num_epochs
    self.batch_size = batch_size
    self.max_length = max_length
    self.lr = lr
    self.lr_decay = lr_decay
    self.session = session
    self.model_path = model_path

    print("Training with B={} max_length={} lr={} lr_decay={}".format(
        batch_size, max_length, lr, lr_decay))

    self._build_optimizer()

    # This loads the data into memory so that we can easily shuffle it.
    # If this takes too much memory, shuffle the data on disk
    # and use bitext_reader directly.
    self.corpus = list(bitext_reader(
        smart_reader(train_e_path),
        smart_reader(train_f_path),
        max_length=max_length))
    self.dev_corpus = list(bitext_reader(
        smart_reader(dev_e_path),
        smart_reader(dev_f_path)))
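# A hedged usage sketch for the constructor above. The class name `Trainer`,
# the model construction, the session handling, and the `train()` call are all
# assumptions about code defined elsewhere in the notebook; only the keyword
# arguments come from __init__ above.
#
#     with tf.Session() as sess:
#         model = ...  # the neural IBM model built elsewhere in the notebook
#         trainer = Trainer(model, train_e_path, train_f_path,
#                           dev_e_path, dev_f_path, dev_wa,
#                           num_epochs=5, batch_size=16, max_length=30,
#                           lr=0.1, lr_decay=0.001, session=sess)
#         sess.run(tf.global_variables_initializer())
#         trainer.train()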
vocabulary_demo()


# Now let's create the vocabularies that we use further on.

# Using only 1000 words will result in many UNKs, but
# it will make training a lot faster.
# If you have a fast computer, a GPU, or a lot of time,
# try with 10000 instead.
max_tokens = 1000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))
print()


def sample_words(vocabulary, n=5):
    """Print a few words from the vocabulary."""
    for _ in range(n):
        token_id = np.random.randint(0, len(vocabulary) - 1)
        print(vocabulary.get_token(token_id))
dev_e_path = '../data/validation/dev.e.gz'
dev_f_path = '../data/validation/dev.f.gz'
dev_wa = '../data/validation/dev.wa.nonullalign'

test_e_path = '../data/test/test.e.gz'
test_f_path = '../data/test/test.f.gz'
test_wa = '../data/test/test.wa.nonullalign'

# Using only 1000 words will result in many UNKs, but
# it will make training a lot faster.
# If you have a fast computer, a GPU, or a lot of time,
# try with 10000 instead.
max_tokens = 1000
# max_tokens = 7000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))

# load test corpus
test_corpus = list(bitext_reader(
    smart_reader(test_e_path),
    smart_reader(test_f_path)))

# run
tf.reset_default_graph()
train_f_path = 'data/training/hansards.36.2.f.gz'
dev_e_path = 'data/validation/dev.e.gz'
dev_f_path = 'data/validation/dev.f.gz'
dev_wa = 'data/validation/dev.wa.nonullalign'

# Using only 1000 words will result in many UNKs, but
# it will make training a lot faster.
# If you have a fast computer, a GPU, or a lot of time,
# try with 10000 instead.
max_tokens = 1000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))

dev_corpus = list(bitext_reader(
    smart_reader(dev_e_path),
    smart_reader(dev_f_path)))
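# The vocabularies are pickled above so they can be reused without re-reading
# the training data. A small usage sketch (file names taken from the dumps
# above; this helper is not part of the original notebook and only works once
# the pickles exist on disk):
def load_vocabularies_sketch():
    """Sketch: reload the pickled English and French vocabularies."""
    with open("vocabulary_e.pkl", mode="rb") as f_e:
        vocabulary_e = pickle.load(f_e)
    with open("vocabulary_f.pkl", mode="rb") as f_f:
        vocabulary_f = pickle.load(f_f)
    return vocabulary_e, vocabulary_f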