Example #1
    def build_model1(self):
        # LookupTable to Embedding
        src_embedding_layer = EmbeddingLayer(input_dim=self.n_src_vocab, output_dim=self.src_embed_dim, name='src_embedding')
        tgt_embedding_layer = EmbeddingLayer(input_dim=self.n_tgt_vocab, output_dim=self.tgt_embed_dim, name='tgt_embedding')

        # LSTMs
        src_lstm_forward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
        src_lstm_backward = LSTM(input_dim=self.src_embed_dim, output_dim=self.src_lstm_op_dim)
        tgt_lstm = LSTM(input_dim=self.tgt_embed_dim, output_dim=self.tgt_lstm_op_dim)
        sys.stderr.write(str(tgt_lstm.params) + "\n")  # TODO

        # From target LSTM to target word indexes
        # Input: target LSTM output dim + Attention from BiLSTM
        proj_layer = FullyConnectedLayer(input_dim=self.tgt_lstm_op_dim + 2 * self.src_lstm_op_dim, output_dim=self.n_tgt_vocab, activation='softmax')

        params = src_embedding_layer.params + tgt_embedding_layer.params + src_lstm_forward.params + src_lstm_backward.params + tgt_lstm.params[:-1] + proj_layer.params

        # declare input variables
        src_ip = T.ivector()
        tgt_ip = T.ivector()
        tgt_op = T.ivector()

        # lookup table -> embedding
        src_embed_ip = src_embedding_layer.fprop(src_ip)
        tgt_embed_ip = tgt_embedding_layer.fprop(tgt_ip)

        # embedding -> source BiLSTM
        src_lstm_forward.fprop(src_embed_ip)
        src_lstm_backward.fprop(src_embed_ip[::-1, :])
        # Concatenate forward/backward. (Flip the backward states again so each row corresponds to the same source word)
        encoderh = T.concatenate((src_lstm_forward.h, src_lstm_backward.h[::-1, :]), axis=1)

        # End of source BiLSTM -> target LSTM
        tgt_lstm.h_0 = encoderh[-1]
        tgt_lstm.fprop(tgt_embed_ip)

        # Attention
        # Read http://arxiv.org/abs/1508.04025
        attention = tgt_lstm.h.dot(encoderh.transpose())
        attention = attention.dot(encoderh)
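        # Note: these are raw (unnormalized) alignment scores; Example #2 below additionally
        # applies T.nnet.softmax to turn them into attention probabilities.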

        # Concatenate the attention context with the decoder hidden state (the ordering of the two blocks is a free choice)
        decoderh = T.concatenate((attention, tgt_lstm.h), axis=1)

        # LSTM output -> target word
        proj_op = proj_layer.fprop(decoderh)

        # Cost + regularization
        cost = T.nnet.categorical_crossentropy(proj_op, tgt_op).mean()
        cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)
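        # 'beta' (regularization strength) is assumed to be defined elsewhere, e.g. beta = 500 as in
        # the later examples; this is the RNN regularization term from
        # http://arxiv.org/pdf/1511.08400v6.pdf cited in Examples #3-#5.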

        return dict({'cost': cost,
                     'src_ip': src_ip,
                     'tgt_ip': tgt_ip,
                     'tgt_op': tgt_op,
                     'params': params,
                     'proj_op': proj_op})
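
The dictionary returned by build_model1 bundles the cost, the symbolic inputs, the parameter list and the softmax output. A minimal sketch of how it could be compiled into training and decoding functions, following the pattern of the later examples ('nmt' stands for an assumed instance of the surrounding class; theano and LearningMethod are the same dependencies those examples use):

model = nmt.build_model1()
updates = LearningMethod(clip=5.0).get_updates('adam', model['cost'], model['params'])
f_train = theano.function(
    inputs=[model['src_ip'], model['tgt_ip'], model['tgt_op']],
    outputs=model['cost'],
    updates=updates
)
f_eval = theano.function(
    inputs=[model['src_ip'], model['tgt_ip']],
    outputs=model['proj_op']
)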
Example #2
def process(train_source_file, train_target_file, dev_source_file, dev_target_file, test_source_file, test_target_predictions):
    
    train_source_data = get_data(train_source_file)
    train_target_data = get_data(train_target_file)
    dev_source_data = get_data(dev_source_file)
    dev_target_data = get_data(dev_target_file)
    test_source_data = get_data(test_source_file)
    
    source_words = set(itertools.chain(*(train_source_data + dev_source_data)))
    target_words = set(itertools.chain(*(train_target_data + dev_target_data)))
    
    source_word_to_idx = dict((v, i) for i, v in enumerate(source_words))
    target_word_to_idx = dict((v, i) for i, v in enumerate(target_words))
    target_idx_to_word = dict((i, v) for i, v in enumerate(target_words))
    
    # Preparing data    
    train_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in train_source_data]
    dev_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in dev_source_data]
    train_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in train_target_data]
    dev_target_data = [[target_word_to_idx[word] for word in sentence] for sentence in dev_target_data]
    test_source_data = [[source_word_to_idx[word] for word in sentence] for sentence in test_source_data]
    
    # Symbolic input vectors: sentences are passed in as int32 word-index arrays
    source_sentence = T.ivector()
    target_sentence = T.ivector()
    target_gold = T.ivector()
    
    source_word_embedding = 128
    target_word_embedding = 128
    source_hidden_embedding = 256
    target_hidden_embedding = 256
        
    hyper_params = []
    
    vocab_source_size = len(source_words)
    vocab_target_size = len(target_words)
    
    source_lookup = EmbeddingLayer(vocab_source_size, source_word_embedding) 
    target_lookup = EmbeddingLayer(vocab_target_size, target_word_embedding) 
    hyper_params += source_lookup.params + target_lookup.params

    source_lstm_forward = LSTM(source_word_embedding, source_hidden_embedding, with_batch=False)
    
    target_lstm = LSTM(2 * target_word_embedding, target_hidden_embedding, with_batch=False)  # decoder input = target embedding (128) + projected source context (128)
    hyper_params += source_lstm_forward.params + target_lstm.params[:-1] # Removing the last output

    tanh_layer = HiddenLayer(source_hidden_embedding, target_word_embedding, activation='tanh')
    # softmax input: attention-weighted source context concatenated with the target LSTM hidden state
    softmax_layer = HiddenLayer(source_hidden_embedding + target_hidden_embedding, vocab_target_size, activation='softmax')
    hyper_params += softmax_layer.params

    # Getting the source and target embeddings
    source_sentence_emb = source_lookup.link(source_sentence)
    target_sentence_emb = target_lookup.link(target_sentence)
    last_h = source_lstm_forward.link(source_sentence_emb)

    # Repeating the last encoder_output for target word length times
    # First reshaping the last encoder output into a row vector, then repeating it target-sentence-length times
    broadcast_source_context = T.repeat(last_h.dimshuffle('x', 0), target_sentence_emb.shape[0], axis=0)
    broadcast_source_context = tanh_layer.link(broadcast_source_context)
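    # The tanh layer projects the 256-dim source context down to the 128-dim target embedding size,
    # so each decoder input row is [target word embedding ; projected source context] = 256 dims,
    # matching the input dimension of target_lstm above.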
    target_sentence_emb = T.concatenate((target_sentence_emb, broadcast_source_context), axis=1)
    target_lstm.h_0 = last_h
    target_lstm.link(target_sentence_emb)
    
    # Attention
    ht = target_lstm.h.dot(source_lstm_forward.h.transpose())
    # Normalizing across rows to get attention probabilities
    attention_weights = T.nnet.softmax(ht)
    # Weighted source_context_vector based on attention probabilities
    attention_weighted_vector = attention_weights.dot(source_lstm_forward.h)
    # Concatenating the hidden state from lstm and weighted source_context_vector
    pred = T.concatenate([attention_weighted_vector, target_lstm.h], axis=1)
    # Final softmax to get the best translation word
    prediction = softmax_layer.link(pred)
    
    # Computing the cross-entropy loss
    loss = T.nnet.categorical_crossentropy(prediction, target_gold).mean()
    
    updates = LearningMethod(clip=5.0).get_updates('adam', loss, hyper_params)
    
    # For training
    train_function = theano.function(
        inputs=[source_sentence, target_sentence, target_gold],
        outputs=loss,
        updates=updates
    )

    # For prediction
    predict_function = theano.function(
        inputs=[source_sentence, target_sentence],
        outputs=prediction,
    )
        
    def get_translations(source_sentences):
        translated_sentences = []
        for sentence in source_sentences:
            source_sentence = np.array(sentence).astype(np.int32)
            translated_so_far = [target_word_to_idx['<s>']]
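            # Greedy decoding: repeatedly re-run the decoder on the partial translation and
            # take the argmax of the last softmax row, until '</s>' is generated.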
            while True:
                next_word = predict_function(source_sentence, np.array(translated_so_far).astype(np.int32)).argmax(axis=1)[-1]  # Get the last translated word
                translated_so_far.append(next_word)
                if next_word == target_word_to_idx['</s>']:
                    translated_sentences.append([target_idx_to_word[x] for x in translated_so_far])
                    break
        return translated_sentences
    
    iterations = 100
    batch_size = 10000
    c = 0
    best_score = -1.0 * sys.maxint
    dev_preds = []
    test_preds = []
    dev_best_preds = []
    test_best_preds = []
    for i in xrange(iterations):
        print 'Iteration {}'.format(i)
        random_indexes = range(len(train_source_data))
        np.random.shuffle(random_indexes)
        loss = []
        for sent_no, index in enumerate(random_indexes):
            src_vector = np.array(train_source_data[index]).astype(np.int32)
            tgt_vector = np.array(train_target_data[index]).astype(np.int32)
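            # Teacher forcing: the target without its last token is the decoder input, and the
            # target without its first token is the gold output (shifted by one position).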
            c = train_function(src_vector, tgt_vector[:-1], tgt_vector[1:])                  
            loss.append(c)
            if sent_no % batch_size == 0 and sent_no > 0:
                dev_preds = get_translations(dev_source_data)
                dev_bleu_score = get_bleu(dev_preds)
                if dev_bleu_score > best_score:
                    best_score = dev_bleu_score
                    dev_best_preds = dev_preds[:]
                    # Decoding the test once the dev reaches the baseline
                    if dev_bleu_score >= 28:
                        test_preds = get_translations(test_source_data)
                        test_best_preds = test_preds[:]
                    print 'Dev bleu score {}'.format(dev_bleu_score)
                
        print 'Iteration: {} Loss {}'.format(i, 1.0 * (sum(loss))/len(loss))

            
    dev_output_fp = open('dev_output.txt', 'w')
    test_output_fp = open(test_target_predictions, 'w')
    
    for pred in dev_best_preds:
        dev_output_fp.write(' '.join(pred) + '\n')
    dev_output_fp.close()
    
    for pred in test_best_preds:
        test_output_fp.write(' '.join(pred) + '\n')
    test_output_fp.close()
Example #3
def main():
    config = ConfigParser.ConfigParser()
    # NOTE: a configuration file has to be read here (config.read(...)) before the
    # config.get() calls below; the file path is not given in this snippet.
    train_src = load_data(config.get("Data", "train_src"))
    dev_src = load_data(config.get("Data", "dev_src"))
    test_src = load_data(config.get("Data", "test_src"))

    train_tgt = load_data(config.get("Data", "train_tgt"))
    dev_tgt = load_data(config.get("Data", "dev_tgt"))
    test_tgt = load_data(config.get("Data", "test_tgt"))

    assert len(train_src) == len(train_tgt)

    UD_path = config.get("Path", "UD")

    sys.path.append(UD_path + "/")

    words_src = get_words(train_src + dev_src)
    words_tgt = get_words(train_tgt + dev_tgt)

    source_word2ind = {word: ind for ind, word in enumerate(words_src)}
    source_ind2word = {ind: word for ind, word in enumerate(words_src)}
    target_word2ind = {word: ind for ind, word in enumerate(words_tgt)}
    target_ind2word = {ind: word for ind, word in enumerate(words_tgt)}

    #
    # Model
    #
    src_emb_dim = 256  # source word embedding dimension
    tgt_emb_dim = 256  # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    proj_dim = 104  # size of the first projection layer
    dropout = 0.5  # dropout rate
    beta = 500  # weight of the RNN regularization term below (same value as in Examples #4 and #5)

    n_src = len(source_word2ind)  # number of words in the source language
    n_tgt = len(target_word2ind)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name="src_lookup")  # lookup table for source words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name="tgt_lookup")  # lookup table for target words
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_for", with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name="src_lstm_rev", with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name="tgt_lstm", with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name="proj_layer1", activation="softmax")
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name="proj_layer2", activation="tanh")
    params += proj_layer1.params  # + proj_layer2.params

    # Train status
    is_train = T.iscalar("is_train")
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)
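    # The shape-check prints below assume example int32 index vectors src_sentence_t,
    # tgt_sentence_t and tgt_gold_t have been defined elsewhere (they are not created in this function).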
    print "src_sentence_emb", src_sentence_emb.eval({src_sentence: src_sentence_t}).shape
    print "tgt_sentence_emb", tgt_sentence_emb.eval({tgt_sentence: tgt_sentence_t}).shape

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])

    print "src_lstm_for.h", src_lstm_for.h.eval({src_sentence: src_sentence_t}).shape
    print "src_lstm_rev.h", src_lstm_rev.h.eval({src_sentence: src_sentence_t}).shape

    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)
    print "src_context", src_context.eval({src_sentence: src_sentence_t}).shape

    tgt_lstm.h_0 = src_context[-1]
    print "tgt sentence emb", tgt_sentence_emb.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape
    tgt_lstm.link(tgt_sentence_emb)
    print "tgt_lstm.h", tgt_lstm.h.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)
    print "transition", transition.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)
    print "transition_last", transition_last.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    prediction = proj_layer1.link(transition_last)
    print "prediction", prediction.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t}).shape

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    cost += beta * T.mean(
        (tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2
    )  # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf

    print "cost", cost.eval({src_sentence: src_sentence_t, tgt_sentence: tgt_sentence_t, tgt_gold: tgt_gold_t})

    updates = LearningMethod(clip=5.0).get_updates("adam", cost, params)

    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold], outputs=cost, updates=updates)

    f_eval = theano.function(inputs=[src_sentence, tgt_sentence], outputs=prediction)
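
    # A minimal usage sketch (an assumption, mirroring the training loops of Examples #4 and #5;
    # numpy is assumed to be imported as np): f_train expects int32 index vectors, with the
    # target fed shifted by one position (teacher forcing).
    for src_sent, tgt_sent in zip(train_src, train_tgt):
        src = np.array([source_word2ind[w] for w in src_sent]).astype(np.int32)
        tgt = np.array([target_word2ind[w] for w in tgt_sent]).astype(np.int32)
        sentence_cost = f_train(src, tgt[:-1], tgt[1:])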
Example #4
def main():

    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256  # source word embedding dimension
    tgt_emb_dim = 256  # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5  # dropout rate

    n_src = len(source_word2idx)  # number of words in the source language
    n_tgt = len(target_word2idx)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    # lookup table for source words
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')
    # lookup table for target words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim,
                        src_lstm_hid_dim,
                        name='src_lstm_for',
                        with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim,
                        src_lstm_hid_dim,
                        name='src_lstm_rev',
                        with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim,
                    tgt_lstm_hid_dim,
                    name='tgt_lstm',
                    with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim,
                              n_tgt,
                              name='proj_layer1',
                              activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim,
                              tgt_emb_dim,
                              name='proj_layer2',
                              activation='tanh')
    # proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500

    # Train status
    is_train = T.iscalar('is_train')
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])

    src_context = T.concatenate([src_lstm_for.h, src_lstm_rev.h[::-1, :]],
                                axis=1)

    tgt_lstm.h_0 = src_context[-1]
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0),
                                    tgt_sentence_emb.shape[0],
                                    axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)

    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context),
                                     axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)

    prediction = proj_layer1.link(transition_last)

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1]**2 - tgt_lstm.h[1:]**2)**2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)

    f_train = theano.function(inputs=[src_sentence, tgt_sentence, tgt_gold],
                              outputs=cost,
                              updates=updates)

    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None

    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50
    n_epochs = 10
    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            new_cost = f_train(
                np.array([source_word2idx[x]
                          for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x]
                          for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x]
                          for x in train_tgt_batch[j]][1:]).astype(np.int32))
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_predictions(source_word2idx,
                                              target_word2idx,
                                              target_idx2word,
                                              f_eval,
                                              mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(source_word2idx,
                                                      target_word2idx,
                                                      target_idx2word,
                                                      f_eval,
                                                      mode="test")
                    print 'Found new best validation score %f ' % (
                        best_valid_score)
                log.write('Epoch %d Minibatch %d BLEU on Validation : %s \n' %
                          (i, j, bleu))

        # Store after epoch
        fout = open('output' + str(i) + '.txt', 'w')
        for line in best_test_preds:
            fout.write(' '.join(line) + '\n')
        fout.close()

    log.close()
Example #5
def main():

    source_word2idx, source_idx2word = create_word_table(train_src)
    target_word2idx, target_idx2word = create_word_table(train_tgt)
    sys.stderr.write("Lookup table constructed." + "\n")

    src_emb_dim = 256  # source word embedding dimension
    tgt_emb_dim = 256  # target word embedding dimension
    src_lstm_hid_dim = 512  # source LSTMs hidden dimension
    tgt_lstm_hid_dim = 2 * src_lstm_hid_dim  # target LSTM hidden dimension
    dropout = 0.5  # dropout rate

    n_src = len(source_word2idx)  # number of words in the source language
    n_tgt = len(target_word2idx)  # number of words in the target language

    # Parameters
    params = []

    # Source words + target words embeddings layer
    # lookup table for source words
    src_lookup = EmbeddingLayer(n_src, src_emb_dim, name='src_lookup')
    # lookup table for target words
    tgt_lookup = EmbeddingLayer(n_tgt, tgt_emb_dim, name='tgt_lookup')
    params += src_lookup.params + tgt_lookup.params

    # LSTMs
    src_lstm_for = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_for', with_batch=False)
    src_lstm_rev = LSTM(src_emb_dim, src_lstm_hid_dim, name='src_lstm_rev', with_batch=False)
    tgt_lstm = LSTM(2 * tgt_emb_dim, tgt_lstm_hid_dim, name='tgt_lstm', with_batch=False)
    params += src_lstm_for.params + src_lstm_rev.params + tgt_lstm.params[:-1]

    # Projection layers
    proj_layer1 = HiddenLayer(tgt_lstm_hid_dim + 2 * src_lstm_hid_dim, n_tgt, name='proj_layer1', activation='softmax')
    proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    # proj_layer2 = HiddenLayer(2 * src_lstm_hid_dim, tgt_emb_dim, name='proj_layer2', activation='tanh')
    params += proj_layer1.params + proj_layer2.params

    beta = 500

    # Train status
    is_train = T.iscalar('is_train')
    # Input sentence
    src_sentence = T.ivector()
    # Current output translation
    tgt_sentence = T.ivector()
    # Gold translation
    tgt_gold = T.ivector()

    src_sentence_emb = src_lookup.link(src_sentence)
    tgt_sentence_emb = tgt_lookup.link(tgt_sentence)

    src_lstm_for.link(src_sentence_emb)
    src_lstm_rev.link(src_sentence_emb[::-1, :])

    src_context = T.concatenate(
        [src_lstm_for.h, src_lstm_rev.h[::-1, :]], axis=1)

    tgt_lstm.h_0 = src_context[-1]
    repeated_src_context = T.repeat(src_context[-1].dimshuffle('x', 0), tgt_sentence_emb.shape[0], axis=0)
    repeated_src_context = proj_layer2.link(repeated_src_context)

    tgt_sentence_emb = T.concatenate((tgt_sentence_emb, repeated_src_context), axis=1)
    tgt_lstm.link(tgt_sentence_emb)

    # Attention
    transition = tgt_lstm.h.dot(src_context.transpose())
    transition = transition.dot(src_context)

    transition_last = T.concatenate([transition, tgt_lstm.h], axis=1)

    prediction = proj_layer1.link(transition_last)

    cost = T.nnet.categorical_crossentropy(prediction, tgt_gold).mean()
    # Regularization of RNNs from http://arxiv.org/pdf/1511.08400v6.pdf
    cost += beta * T.mean((tgt_lstm.h[:-1] ** 2 - tgt_lstm.h[1:] ** 2) ** 2)

    updates = LearningMethod(clip=5.0).get_updates('adam', cost, params)

    f_train = theano.function(
        inputs=[src_sentence, tgt_sentence, tgt_gold],
        outputs=cost,
        updates=updates
    )

    f_eval = theano.function(
        inputs=[src_sentence, tgt_sentence],
        outputs=prediction,
    )

    best_valid_preds = None
    best_valid_score = -sys.maxint
    best_test_preds = None

    log = open('blue_valid_log.txt', 'w')
    all_costs = []
    batch_size = 50
    n_epochs = 10
    for i in xrange(n_epochs):
        print 'Starting epoch %i' % i
        indices = range(len(train_src))
        np.random.shuffle(indices)
        train_src_batch = [train_src[ind] for ind in indices]
        train_tgt_batch = [train_tgt[ind] for ind in indices]
        assert len(train_src_batch) == len(train_tgt_batch)
        costs = []
        for j in xrange(len(train_src_batch)):
            new_cost = f_train(
                np.array([source_word2idx[x] for x in train_src_batch[j]]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][:-1]).astype(np.int32),
                np.array([target_word2idx[x] for x in train_tgt_batch[j]][1:]).astype(np.int32)
            )
            all_costs.append((j, new_cost))
            costs.append(new_cost)
            if j % 300 == 0:
                print j, np.mean(costs)
                costs = []
            if np.isnan(new_cost):
                print 'NaN detected.'
                break
            if j % 10000 == 0 and j != 0:
                valid_preds = get_predictions(
                    source_word2idx, target_word2idx, target_idx2word, f_eval, mode="validation")
                bleu = get_validation_bleu(valid_preds)
                print '==================================================================='
                print 'Epoch %i BLEU on Validation : %s ' % (i, bleu)
                print '==================================================================='
                if float(bleu) >= best_valid_score:
                    best_valid_score = float(bleu)
                    best_valid_preds = copy.deepcopy(valid_preds)
                    best_test_preds = get_predictions(
                        source_word2idx, target_word2idx, target_idx2word, f_eval, mode="test")
                    print 'Found new best validation score %f ' % (best_valid_score)
                log.write(
                    'Epoch %d Minibatch %d BLEU on Validation : %s \n' % (i, j, bleu))

        # Store after epoch
        fout = open('output' + str(i) + '.txt', 'w')
        for line in best_test_preds:
            fout.write(' '.join(line) + '\n')
        fout.close()

    log.close()