def build_model1(self):
    # LookupTable to Embedding
    src_embedding_layer = EmbeddingLayer(input_dim=self.n_src_vocab, output_dim=self.src_embed_dim, name='src_embedding')
    tgt_embedding_layer = EmbeddingLayer(input_dim=self.n_tgt_vocab, output_dim=self.tgt_embed_dim, name='tgt_embedding')

    # LSTMs
    src_lstm_forward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
    src_lstm_backward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
    tgt_lstm = LSTM(input_dim=self.tgt_embed_dim, output_dim=self.tgt_lstm_op_dim)
    sys.stderr.write(str(tgt_lstm.params) + "\n")  # TODO

    # From target LSTM to target word indexes
    # Input: target LSTM output dim + attention from BiLSTM
    proj_layer = FullyConnectedLayer(input_dim=self.tgt_lstm_op_dim + 2 * self.src_lstm_op_dim, output_dim=self.n_tgt_vocab, activation='softmax')

    params = (src_embedding_layer.params + tgt_embedding_layer.params
              + src_lstm_forward.params + src_lstm_backward.params
              + tgt_lstm.params[:-1] + proj_layer.params)

    # Declare input variables
    src_ip = T.ivector()
    tgt_ip = T.ivector()
    tgt_op = T.ivector()

    # Lookup table -> embedding
    src_embed_ip = src_embedding_layer.fprop(src_ip)
    tgt_embed_ip = tgt_embedding_layer.fprop(tgt_ip)

    # Embedding -> source BiLSTM
    src_lstm_forward.fprop(src_embed_ip)
    src_lstm_backward.fprop(src_embed_ip[::-1, :])

    # Concatenate forward/backward. (Flip backward again to get the corresponding h for the same word.)
    encoderh = T.concatenate((src_lstm_forward.h, src_lstm_backward.h[::-1, :]), axis=1)

    # End of source BiLSTM -> target LSTM
    tgt_lstm.h_0 = encoderh[-1]
    tgt_lstm.fprop(tgt_embed_ip)

    # Attention
    # Read http://arxiv.org/abs/1508.04025
    attention = tgt_lstm.h.dot(encoderh.transpose())
    attention = attention.dot(encoderh)

    # Order preference?
    decoderh = T.concatenate((attention, tgt_lstm.h), axis=1)

    # LSTM output -> target word
    proj_op = proj_layer.fprop(decoderh)

    # Cost + regularization
    beta = 500  # norm-stabilization weight (assumed here; the other variants in this file use 500)
    cost = T.nnet.categorical_crossentropy(proj_op, tgt_op).mean()
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    return {
        'cost': cost,
        'src_ip': src_ip,
        'tgt_ip': tgt_ip,
        'tgt_op': tgt_op,
        'params': params,
        'proj_op': proj_op,
    }
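# Minimal usage sketch (not part of the original code): how the dict returned by
# build_model1 could be compiled into Theano train/eval functions, assuming the
# same theano and LearningMethod helpers imported by the rest of this file.
# `compile_model1` and `model_obj` are hypothetical names introduced here.
def compile_model1(model_obj):
    model = model_obj.build_model1()
    updates = LearningMethod(clip=5.0).get_updates('adam', model['cost'], model['params'])
    # Training function: source indices, shifted target input, gold target output.
    f_train = theano.function(
        inputs=[model['src_ip'], model['tgt_ip'], model['tgt_op']],
        outputs=model['cost'],
        updates=updates,
    )
    # Evaluation function: softmax distribution over target words at each step.
    f_eval = theano.function(
        inputs=[model['src_ip'], model['tgt_ip']],
        outputs=model['proj_op'],
    )
    return f_train, f_eval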
def process(train_source_file, train_target_file, dev_source_file, dev_target_file, test_source_file, test_target_predictions):
    train_source_data = get_data(train_source_file)
    train_target_data = get_data(train_target_file)
    dev_source_data = get_data(dev_source_file)
    dev_target_data = get_data(dev_target_file)
    test_source_data = get_data(test_source_file)

    source_words = set(itertools.chain(*(train_source_data + dev_source_data)))
    target_words = set(itertools.chain(*(train_target_data + dev_target_data)))
    source_word_to_idx = dict((v, i) for i, v in enumerate(source_words))
    target_word_to_idx = dict((v, i) for i, v in enumerate(target_words))
    target_idx_to_word = dict((i, v) for i, v in enumerate(target_words))

    # Preparing data: map every word to its vocabulary index
    train_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in train_source_data]
    dev_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in dev_source_data]
    train_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in train_target_data]
    dev_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in dev_target_data]
    test_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in test_source_data]

    # Symbolic input vectors (sentences as index sequences)
    source_sentence = T.ivector()
    target_sentence = T.ivector()
    target_gold = T.ivector()

    source_word_embedding = 128
    target_word_embedding = 128
    source_hidden_embedding = 256
    target_hidden_embedding = 256

    hyper_params = []

    vocab_source_size = len(source_words)
    vocab_target_size = len(target_words)

    source_lookup = EmbeddingLayer(vocab_source_size, source_word_embedding)
    target_lookup = EmbeddingLayer(vocab_target_size, target_word_embedding)
    hyper_params += source_lookup.params + target_lookup.params

    source_lstm_forward = LSTM(source_word_embedding, source_hidden_embedding, with_batch=False)
    # Decoder input: target embedding (128) concatenated with the projected source context (128)
    target_lstm = LSTM(256, target_hidden_embedding, with_batch=False)
    hyper_params += source_lstm_forward.params + target_lstm.params[:-1]  # Removing the last output

    tanh_layer = HiddenLayer(source_hidden_embedding, target_word_embedding, activation='tanh')
    # Input: attention-weighted source vector concatenated with the decoder hidden state
    softmax_layer = HiddenLayer(source_hidden_embedding + target_hidden_embedding, vocab_target_size, activation='softmax')
    hyper_params += softmax_layer.params

    # Getting the source and target embeddings
    source_sentence_emb = source_lookup.link(source_sentence)
    target_sentence_emb = target_lookup.link(target_sentence)
    last_h = source_lstm_forward.link(source_sentence_emb)

    # Repeating the last encoder output target-sentence-length times:
    # first turn the last encoder output into a row vector, then repeat it
    broadcast_source_context = T.repeat(last_h.dimshuffle('x', 0), target_sentence_emb.shape[0], axis=0)
    broadcast_source_context = tanh_layer.link(broadcast_source_context)
    target_sentence_emb = T.concatenate((target_sentence_emb, broadcast_source_context), axis=1)

    target_lstm.h_0 = last_h
    target_lstm.link(target_sentence_emb)

    # Attention
    ht = target_lstm.h.dot(source_lstm_forward.h.transpose())
    # Normalizing across rows to get attention probabilities
    attention_weights = T.nnet.softmax(ht)
    # Weighted source context vector based on attention probabilities
    attention_weighted_vector = attention_weights.dot(source_lstm_forward.h)

    # Concatenating the decoder hidden state and the weighted source context vector
    pred = T.concatenate([attention_weighted_vector, target_lstm.h], axis=1)

    # Final softmax to get the best translation word
    prediction = softmax_layer.link(pred)

    # Computing the cross-entropy loss
    loss = T.nnet.categorical_crossentropy(prediction, target_gold).mean()

    updates = LearningMethod(clip=5.0).get_updates('adam', loss, hyper_params)

    # For training
    train_function = theano.function(
        inputs=[source_sentence, target_sentence, target_gold],
        outputs=loss,
        updates=updates
    )

    # For prediction
    predict_function = theano.function(
        inputs=[source_sentence, target_sentence],
        outputs=prediction,
    )

    def get_translations(source_sentences):
        translated_sentences = []
        for sentence in source_sentences:
            source_sentence = np.array(sentence).astype(np.int32)
            translated_so_far = [target_word_to_idx['<s>']]
            while True:
                next_word = predict_function(
                    source_sentence,
                    np.array(translated_so_far).astype(np.int32)
                ).argmax(axis=1)[-1]  # Get the last translated word
                translated_so_far.append(next_word)
                if next_word == target_word_to_idx['</s>']:
                    translated_sentences.append([target_idx_to_word[x] for x in translated_so_far])
                    break
        return translated_sentences

    iterations = 100
    batch_size = 10000
    c = 0
    best_score = -1.0 * sys.maxint
    dev_preds = []
    test_preds = []
    dev_best_preds = []
    test_best_preds = []

    for i in xrange(iterations):
        print 'Iteration {}'.format(i)
        random_indexes = range(len(train_source_data))
        np.random.shuffle(random_indexes)
        loss = []
        for sent_no, index in enumerate(random_indexes):
            src_vector = np.array(train_source_data[index]).astype(np.int32)
            tgt_vector = np.array(train_target_data[index]).astype(np.int32)
            c = train_function(src_vector, tgt_vector[:-1], tgt_vector[1:])
            loss.append(c)
            if sent_no % batch_size == 0 and sent_no > 0:
                dev_preds = get_translations(dev_source_data)
                dev_bleu_score = get_bleu(dev_preds)
                if dev_bleu_score > best_score:
                    best_score = dev_bleu_score
                    dev_best_preds = dev_preds[:]
                    # Decoding the test set once dev reaches the baseline
                    if dev_bleu_score >= 28:
                        test_preds = get_translations(test_source_data)
                        test_best_preds = test_preds[:]
                print 'Dev bleu score {}'.format(dev_bleu_score)
        print 'Iteration: {} Loss {}'.format(i, 1.0 * sum(loss) / len(loss))

    dev_output_fp = open('dev_output.txt', 'w')
    test_output_fp = open(test_target_predictions, 'w')
    for pred in dev_best_preds:
        dev_output_fp.write(' '.join(pred) + '\n')
    dev_output_fp.close()
    for pred in test_best_preds:
        test_output_fp.write(' '.join(pred) + '\n')
    test_output_fp.close()
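# Hedged sketch (not in the original source) of the get_data helper that
# process() assumes: one sentence per line, whitespace-tokenized, wrapped in the
# <s>/</s> markers that the decoding loop above looks up in target_word_to_idx.
# The original preprocessing may differ (e.g. lowercasing or rare-word handling).
def get_data(filename):
    sentences = []
    with open(filename) as fin:
        for line in fin:
            tokens = line.strip().split()
            if tokens:
                sentences.append(['<s>'] + tokens + ['</s>'])
    return sentences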
def main():
    # Variant of the model with shape-checking debug prints.
    config = ConfigParser.ConfigParser()
    config.read(sys.argv[1])  # Assumed: path to the config file is passed on the command line

    train_src = load_data(config.get("Data", "train_src"))
    dev_src = load_data(config.get("Data", "dev_src"))
    test_src = load_data(config.get("Data", "test_src"))
    train_tgt = load_data(config.get("Data", "train_tgt"))
    dev_tgt = load_data(config.get("Data", "dev_tgt"))
    test_tgt = load_data(config.get("Data", "test_tgt"))
    assert len(train_src) == len(train_tgt)

    UD_path = config.get("Path", "UD")
    sys.path.append(UD_path + "/")

    words_src = get_words(train_src + dev_src)
    words_tgt = get_words(train_tgt + dev_tgt)

    source_word2ind = {word: ind for ind, word in enumerate(words_src)}
    source_ind2word = {ind: word for ind, word in enumerate(words_src)}
    target_word2ind = {word: ind for ind, word in enumerate(words_tgt)}
    target_ind2word = {ind: word for ind, word in enumerate(words_tgt)}

    #
    # Model
    #
    src_emb_dim = 256       # source word embedding dimension
    tgt_emb_dim = 256       # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    proj_dim = 104          # size of the first projection layer (unused in this variant)
    dropout = 0.5           # dropout rate
    n_src = len(source_word2ind)  # number of words in the source language
    n_tgt = len(target_word2ind)  # number of words in the target language
    beta = 500              # norm-stabilization weight (assumed here; matches the other variants in this file)

    # Parameters
    params = []

    # Source words + target words embeddings layer
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name="src_lookup")  # lookup table for source words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name="tgt_lookup")  # lookup table for target words
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_for", with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_rev", with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name="tgt_lstm", with_batch=False)
    # NOTE: tgt_lstm is declared with input dim 2 * tgt_emb_dim, but in this variant it is fed
    # tgt_sentence_emb of dim tgt_emb_dim; the later variants concatenate a projected source
    # context (proj_layer2) to match the declared size.
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name="proj_layer1", activation="softmax")
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name="proj_layer2", activation="tanh")
    params += proj_layer1.params  # + proj_layer2.params

    # Train status
    is_train = T.iscalar("is_train")
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    # Sample index vectors used only by the shape-checking prints below.
    # (Assumed here: the original defined src_sentence_t / tgt_sentence_t / tgt_gold_t elsewhere;
    # they are built from the first training pair so the prints can run.)
    src_sentence_t = np.array([source_word2ind[w] for w in train_src[0]]).astype(np.int32)
    tgt_indexes_t = np.array([target_word2ind[w] for w in train_tgt[0]]).astype(np.int32)
    tgt_sentence_t = tgt_indexes_t[:-1]
    tgt_gold_t = tgt_indexes_t[1:]

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)
    print "src_sentence_emb", src_sentence_emb.eval({src_sentence: src_sentence_t}).shape
    print "tgt_sentence_emb", tgt_sentence_emb.eval({tgt_sentence: tgt_sentence_t}).shape

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])
    print "src_lstm_for.h", src_lstm_for.h.eval({src_sentence: src_sentence_t}).shape
    print "src_lstm_rev.h", src_lstm_rev.h.eval({src_sentence: src_sentence_t}).shape

    # Bidirectional encoder states (backward states flipped back into word order)
    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)
    print "src_context", src_context.eval({src_sentence: src_sentence_t}).shape

    tgt_lstm.h_0 = src_context[-1]
    print "tgt sentence emb", tgt_sentence_emb.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
    tgt_lstm.link(tgt_sentence_emb)
    print "tgt_lstm.h", tgt_lstm.h.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    # Attention (unnormalized dot-product scores; no softmax in this variant)
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    print "transition", transition.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)
    print "transition_last", transition_last.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    prediction = proj_layer1.link(transition_last)
    print "prediction", prediction.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)
    print "cost", cost.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t, tgt_gold: tgt_gold_t})

    updates = LearningMethod(clip=5.0).get_updates("adam", cost, params)

    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold], outputs=cost, updates=updates)

    f_eval = theano.function(inputs=[src_sentence, tgt_sentence], outputs=prediction)
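# Example of the config file layout that main() above expects, inferred from the
# config.get calls (section and option names come from the code; the paths are
# placeholders, not the original values):
#
#     [Data]
#     train_src = data/train.src
#     train_tgt = data/train.tgt
#     dev_src   = data/dev.src
#     dev_tgt   = data/dev.tgt
#     test_src  = data/test.src
#     test_tgt  = data/test.tgt
#
#     [Path]
#     UD = /path/to/utils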
def main():
    # Variant without debug prints; adds the projected source context to every decoder input.
    # train_src / train_tgt are assumed to be loaded elsewhere (e.g. with the load_data +
    # config machinery of the previous variant).
    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256       # source word embedding dimension
    tgt_emb_dim = 256       # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5           # dropout rate
    n_src = len(source_word2idx)  # number of words in the source language
    n_tgt = len(target_word2idx)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    # lookup table for source words
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')
    # lookup table for target words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_for', with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_rev', with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name='tgt_lstm', with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name='proj_layer1', activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500  # norm-stabilization weight

    # Train status
    is_train = T.iscalar('is_train')
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])

    # Bidirectional encoder states (backward states flipped back into word order)
    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)

    tgt_lstm.h_0 = src_context[-1]

    # Feed the projected last encoder state to every decoder step
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0), tgt_sentence_emb.shape[0], axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)
    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context), axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention (unnormalized dot-product scores; no softmax in this variant)
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)

    prediction = proj_layer1.link(transition_last)

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)

    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold], outputs=cost, updates=updates)
    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None

    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50
    n_epochs = 10

    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            new_cost = f_train(
                np.array([source_word2idx[x] for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][1:]).astype(np.int32)
            )
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_predictions(source_word2idx, target_word2idx, target_idx2word, f_eval, mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(source_word2idx, target_word2idx, target_idx2word, f_eval, mode="test")
                    print 'Found new best validation score %f ' % best_valid_score
                log.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, bleu))

        # Store after epoch
        if best_test_preds is not None:
            fout = open('output' + str(i) + '.txt', 'w')
            for line in best_test_preds:
                fout.write(' '.join(line) + '\n')
            fout.close()

    log.close()
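# Hedged sketch (not in the original source) of the create_word_table helper used by
# main() above: assumed to build word <-> index lookup tables from the tokenized
# training sentences. The original may additionally handle special symbols or
# rare-word cutoffs.
def create_word_table(sentences):
    words = set()
    for sentence in sentences:
        words.update(sentence)
    word2idx = dict((word, idx) for idx, word in enumerate(sorted(words)))
    idx2word = dict((idx, word) for word, idx in word2idx.items())
    return word2idx, idx2word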