Example #1
def calc_sent_loss(sent):
    # Create a computation graph
    dy.renew_cg()

    # Get embeddings for the sentence
    emb = [W_w_p[x] for x in sent]

    # Sample K negative words for each predicted word at each position
    all_neg_words = np.random.choice(nwords,
                                     size=2 * N * K * len(emb),
                                     replace=True,
                                     p=word_probabilities)

    # W_w = dy.parameter(W_w_p)
    # Step through the sentence and calculate the negative and positive losses
    all_losses = []
    for i, my_emb in enumerate(emb):
        neg_words = all_neg_words[i * K * 2 * N:(i + 1) * K * 2 * N]
        pos_words = (
            [sent[x] if x >= 0 else S for x in range(i - N, i)] +
            [sent[x] if x < len(sent) else S for x in range(i + 1, i + N + 1)])
        neg_loss = -dy.log(
            dy.logistic(
                -dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words))))
        pos_loss = -dy.log(
            dy.logistic(
                dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words))))
        all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))
    return dy.esum(all_losses)
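The slicing of all_neg_words above gives each position its own block of 2*N*K samples; a tiny standalone sketch (toy sizes and a uniform distribution, all hypothetical) of that bookkeeping:

import numpy as np

nwords, N, K, sent_len = 10, 2, 3, 4          # toy sizes
all_neg = np.random.choice(nwords, size=2 * N * K * sent_len, replace=True,
                           p=np.ones(nwords) / nwords)
for i in range(sent_len):
    neg_words = all_neg[i * K * 2 * N:(i + 1) * K * 2 * N]
    print(i, len(neg_words))                  # always 2*N*K = 12 samples per position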
    def __call__(self,
                 indexes: Dict[str, List[Indices]],
                 is_train=False) -> List[dy.Expression]:
        len_s = len(indexes['head'][0])
        batch_num = len(indexes['head'])
        vectors = []
        for i in range(len_s):
            # map token indexes -> vector
            w_idxes = [indexes['word']['word'][x][i] for x in range(batch_num)]
            g_idxes = [
                indexes['word']['glove'][x][i] for x in range(batch_num)
            ]
            t_idxes = [indexes['tag']['tag'][x][i] for x in range(batch_num)]
            w_vec = dy.lookup_batch(self.wlookup, w_idxes)
            g_vec = dy.lookup_batch(self.glookup, g_idxes, False)
            w_vec += g_vec
            t_vec = dy.lookup_batch(self.tlookup, t_idxes)

            # build token mask with dropout scale
            # For only word dropped: tag * 3
            # For only tag dropped: word * 1.5
            # For both word and tag dropped: 0 vector
            if is_train:
                wm = np.random.binomial(1, 1. - self.cfg.WORD_DROP,
                                        batch_num).astype(np.float32)
                tm = np.random.binomial(1, 1. - self.cfg.TAG_DROP,
                                        batch_num).astype(np.float32)
                scale = np.logical_or(wm, tm) * 3 / (2 * wm + tm + 1e-12)
                wm *= scale
                tm *= scale
                w_vec *= dy.inputTensor(wm, batched=True)
                t_vec *= dy.inputTensor(tm, batched=True)
            vectors.append(dy.concatenate([w_vec, t_vec]))
        return vectors
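As a side note on the word/tag dropout comments above: a minimal standalone sketch (plain numpy scalars standing in for the sampled masks) that enumerates the four keep/drop cases of the scale factor:

import numpy as np

# Expected: both kept -> factor 1; only tag dropped -> word * 1.5;
# only word dropped -> tag * 3; both dropped -> zero vector.
for wm, tm in [(1.0, 1.0), (1.0, 0.0), (0.0, 1.0), (0.0, 0.0)]:
    scale = np.logical_or(wm, tm) * 3 / (2 * wm + tm + 1e-12)
    print(wm, tm, "-> word factor", wm * scale, ", tag factor", tm * scale)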
Example #3
 def init(self, x, usr, test=True, update=True, update_mode='full'):
     if update_mode=='biases':
         self.usr_vec = dy.logsumexp_dim(self.B_p.expr(True) + dy.lookup_batch(self.U_p, usr, True), d=1)
     elif update_mode=='mixture_weights':
         self.usr_vec = dy.logsumexp_dim(self.B_p.expr(update) + dy.lookup_batch(self.U_p, usr, True), d=1)
     else:
         self.usr_vec = dy.logsumexp_dim(self.B_p.expr(update) + dy.lookup_batch(self.U_p, usr, update), d=1)
Example #4
    def baseline(self, sentences):
        # LTR / random non-projective
        if self.order == 1 or self.order == 3:
            if self.order == 3:
                np.random.shuffle(sentences)
            vecs = [self.tree_lstm(
                L = None,
                R = None,
                x = dy.lookup_batch(self.embeddings, sentences[i], update=self.update_embeddings),
            ) for i in range(sentences.shape[0])]

            state = vecs[0]
            for i in range(1, sentences.shape[0]):
                state = self.tree_lstm(L = state, R = vecs[i], x = None)

        # RTL
        elif self.order == 2:
            vecs = [self.tree_lstm(
                L = None,
                R = None,
                x = dy.lookup_batch(self.embeddings, sentences[i], update=self.update_embeddings),
            ) for i in range(sentences.shape[0])]
            state = vecs[0]
            for i in range(1, sentences.shape[0]):
                state = self.tree_lstm(L = vecs[i], R = state, x = None)

        else:
            raise ValueError("Invalid composition order "+str(self.order))

        return state.h
Example #5
    def get_loss_batch(self, sent_array):
        renew_cg()
        init_state = self.builder.initial_state()

        R = parameter(self.R)
        bias = parameter(self.bias)

        # get the wids and masks for each step
        # "I am good", "This is good", "Good Morning" -> [['I', 'Today', 'Good'], ['am', 'is', 'Morning'], ['good', 'good', '<S>'], ['I', 'Today', 'Good'], ['am', 'is', 'Morning'], ['good', 'good', '<S>']]

        tot_words = 0
        wids = []
        masks = []
        for i in range(len(sent_array[0])):
            wids.append([(sent[i] if len(sent) > i else 3)
                         for sent in sent_array])
            mask = [(1 if len(sent) > i else 0) for sent in sent_array]
            masks.append(mask)
            tot_words += sum(mask)

        # start the rnn by inputting "<s>"
        init_ids = [2] * len(sent_array)
        #print dy.lookup_batch(self.lookup,init_ids)
        #print "Looked up"
        s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

        # feed word vectors into the RNN and predict the next word
        losses = []
        for wid, mask in zip(wids, masks):
            # calculate the softmax and loss
            #print "WID ", wid
            score = dy.affine_transform([bias, R, s.output()])
            loss = dy.pickneglogsoftmax_batch(score, wid)
            # mask the loss if at least one sentence is shorter
            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1, ), len(sent_array))
                loss = loss * mask_expr
            losses.append(loss)
            # update the state of the RNN
            wemb = dy.lookup_batch(self.lookup, wid)
            s = s.add_input(wemb)

        return dy.sum_batches(dy.esum(losses)), tot_words

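For the step-wise padding illustrated by the comment in get_loss_batch above, a tiny standalone sketch (toy word ids, with 3 as the pad id as in that snippet) of how wids and masks line up:

# Toy batch; the first sentence is assumed to be the longest, as in get_loss_batch.
sent_array = [[5, 6, 7], [8, 9, 7], [10, 11]]
wids, masks = [], []
for i in range(len(sent_array[0])):
    wids.append([sent[i] if len(sent) > i else 3 for sent in sent_array])
    masks.append([1 if len(sent) > i else 0 for sent in sent_array])
print(wids)   # [[5, 8, 10], [6, 9, 11], [7, 7, 3]]
print(masks)  # [[1, 1, 1], [1, 1, 1], [1, 1, 0]]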
    def step_batch(self, instances):
        dy.renew_cg()
        self.l2r_builder.set_dropout(0.2)
        self.r2l_builder.set_dropout(0.2)
        self.dec_builder.set_dropout(0.2)
    
        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        src_sents = [x[0] for x in instances]
        padded_src = self.__pad_batch(src_sents, True)
        src_cws = np.transpose(padded_src)
        tgt_sents = [x[1] for x in instances]
        padded_tgt = self.__pad_batch(tgt_sents, False)
        masks_tgt, num_words = self.__mask(tgt_sents)
        masks_tgt = np.transpose(masks_tgt)
        padded_tgt = np.transpose(padded_tgt)
        instance_size = len(instances)
        src_cws_rev = list(reversed(src_cws))
        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for (cws_l2r, cws_r2l) in zip(src_cws, src_cws_rev):
            l2r_state = l2r_state.add_input(dy.lookup_batch(self.src_lookup, cws_l2r))
            r2l_state = r2l_state.add_input(dy.lookup_batch(self.src_lookup, cws_r2l))
            l2r_contexts.append(l2r_state.output()) #[<S>, x_1, x_2, ..., </S>]
            r2l_contexts.append(r2l_state.output()) #[</S> x_n, x_{n-1}, ... <S>]
        r2l_contexts.reverse() #[<S>, x_1, x_2, ..., </S>]
        # Combine the left and right representations for every word
        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)
        losses = []
        
        # Decoder
        c_t = dy.vecInput(self.hidden_size * 2)
        start = dy.concatenate([dy.lookup_batch(self.tgt_lookup, len(tgt_sents) * [self.tgt_token_to_id[self.src_pad]]), c_t])
        dec_state = self.dec_builder.initial_state().add_input(start)

        for (cws, nws, mask) in zip(padded_tgt, padded_tgt[1:], masks_tgt):
            h_e = dec_state.output()
            c_t = self.__attention_mlp(h_fs_matrix, h_e)
            # Get the embedding for the current target word
            embed_t = dy.lookup_batch(self.tgt_lookup, cws)
            # Create input vector to the decoder
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            y_star = b_y + W_y * dec_state.output()
            loss = dy.pickneglogsoftmax_batch(y_star, nws)
            if mask[-1] == 0:
                mask_loss = dy.reshape(dy.inputVector(mask), (1,), instance_size)
                masked = loss * mask_loss
                losses.append(masked)
            else:
                losses.append(loss)
            #losses = [(x / num_words) for x in losses]
        return dy.sum_batches(dy.esum(losses)), num_words
    def step_batch(self, instances):
        dy.renew_cg()
        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        src_sents = [x[0] for x in instances]
        padded_src = self.__pad_batch(src_sents)
        masks_src = np.transpose(self.__mask(padded_src))
        src_cws = np.transpose(padded_src)
        tgt_sents = [x[1] for x in instances]
        tgt_ids = []

        for sent in tgt_sents:
            sent = [self.tgt_token_to_id[x] for x in sent]
            tgt_ids.append(sent)

        tgt_ids = list(map(list, zip(*tgt_ids)))  # transpose; list() so tgt_ids[1:] works below
        padded_src_rev = list(reversed(padded_src))
        src_cws_rev = np.transpose(padded_src_rev)
        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for (cws_l2r, cws_r2l) in zip(src_cws, src_cws_rev):
            l2r_state = l2r_state.add_input(dy.lookup_batch(self.src_lookup, cws_l2r))
            r2l_state = r2l_state.add_input(dy.lookup_batch(self.src_lookup, cws_r2l))
            l2r_contexts.append(l2r_state.output()) #[<S>, x_1, x_2, ..., </S>]
            r2l_contexts.append(r2l_state.output()) #[</S> x_n, x_{n-1}, ... <S>]
        r2l_contexts.reverse() #[<S>, x_1, x_2, ..., </S>]
        # Combine the left and right representations for every word
        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)
        losses = []
        num_words = 0

        # Decoder
        c_t = dy.vecInput(self.hidden_size * 2)
        start = dy.concatenate([dy.lookup_batch(self.tgt_lookup, len(tgt_sents) * [self.tgt_token_to_id['<S>']]), c_t])
        dec_state = self.dec_builder.initial_state().add_input(start)

        for (cws, nws, mask) in zip(tgt_ids, tgt_ids[1:], masks_src):
            h_e = dec_state.output()
            c_t = self.__attention_mlp(h_fs_matrix, h_e)
            # Get the embedding for the current target word
            embed_t = dy.lookup_batch(self.tgt_lookup, cws)
            # Create input vector to the decoder
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            y_star = b_y + W_y * dec_state.output()
            loss = dy.pickneglogsoftmax_batch(y_star, nws)
            if mask[0] == 0:
                mask_loss = dy.reshape(dy.inputVector(mask), (1,), self.BATCH_SIZE)
                loss = loss * mask_loss
            losses.append(loss)
            num_words += 1
        return dy.sum_batches(dy.esum(losses)/num_words), num_words
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs(
        [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append(
            [sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s(
        [src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the current state into the
        current_state = current_state.add_input(
            dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1, ), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
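The mask multiplication at the end of the decoder loop above can be checked in isolation; a minimal DyNet sketch (a made-up batch of three scalar losses) of zeroing the loss for sentences that have already ended:

import dynet as dy
import numpy as np

dy.renew_cg()
# One scalar "loss" per batch element (batch size 3).
loss = dy.inputTensor(np.array([0.7, 1.2, 0.4]), batched=True)
mask = [1, 0, 1]  # the second sentence is already past its end
mask_expr = dy.reshape(dy.inputVector(mask), (1,), 3)
masked = loss * mask_expr
print(dy.sum_batches(masked).value())  # roughly 0.7 + 0.4 = 1.1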
    def compute_decoder_batch_loss(self, encoded_inputs, input_masks, output_word_ids, output_masks, batch_size):
        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # initial "input feeding" vectors to feed decoder - 3*h
        init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)

        # initial feedback embeddings for the decoder, use begin seq symbol embedding
        init_feedback = dn.lookup_batch(self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

        # init decoder rnn
        decoder_init = dn.concatenate([init_feedback, init_input_feeding])
        s = s_0.add_input(decoder_init)

        # loss per timestep
        losses = []

        # run the decoder through the output sequences and aggregate loss
        for i, step_word_ids in enumerate(output_word_ids):

            # returns h x batch size matrix
            decoder_rnn_output = s.output()

            # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
            attention_output_vector, alphas = self.attend(encoded_inputs, decoder_rnn_output, input_masks)

            # compute output scores (returns vocab_size x batch size matrix)
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform([self.bias, self.readout, attention_output_vector])

            # get batch loss for this timestep
            batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

            # mask the loss if at least one sentence is shorter
            if output_masks and output_masks[i][-1] != 1:
                mask_expr = dn.inputVector(output_masks[i])
                # noinspection PyArgumentList
                mask_expr = dn.reshape(mask_expr, (1,), batch_size)
                batch_loss = batch_loss * mask_expr

            # input feeding approach - input h (attention_output_vector) to the decoder
            # prepare for the next iteration - "feedback"
            feedback_embeddings = dn.lookup_batch(self.output_lookup, step_word_ids)
            decoder_input = dn.concatenate([feedback_embeddings, attention_output_vector])
            s = s.add_input(decoder_input)

            losses.append(batch_loss)

        # sum the loss over the time steps and batch
        total_batch_loss = dn.sum_batches(dn.esum(losses))
        return total_batch_loss
Example #10
 def init(self, x, usr, test=True, update=True, update_mode='full'):
     self.Wh = self.bh_p
     self.bh = self.bh_p
     self.Su = self.Su_p
     self.bu = self.bu_p
     if update_mode=='biases':
         self.usr_vec = dy.lookup_batch(self.BU_p, usr, True)
     else:
         self.usr_vec = dy.lookup_batch(self.BU_p, usr, update)
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the current state into the 
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
Example #12
    def get_hidden_states(self, word_ids, upos_ids):
        n = word_ids.shape[-1]

        word_embs = [
            dy.lookup_batch(self.wlookup, word_ids[:, i]) for i in range(n)
        ]
        upos_embs = [
            dy.lookup_batch(self.tlookup, upos_ids[:, i]) for i in range(n)
        ]
        words = [dy.concatenate([w, p]) for w, p in zip(word_embs, upos_embs)]
        state_pairs_list = self.deep_bilstm.add_inputs(words)
        return state_pairs_list
Example #13
 def init(self, x, usr, test=True, update=True, update_mode='full'):
     self.Wh = self.bh_p
     self.bh = self.bh_p
     self.Su = self.Su_p
     self.bu = self.bu_p
     if update_mode=='biases':
         #self.usr_vec = self.BU_p.expr(True)#dy.pick(self.B_p.expr(True), index=0, dim=1)# * dy.lookup_batch(self.U_p, usr, True)
         self.usr_vec = self.B_p.expr(True) * dy.lookup_batch(self.U_p, usr, True)
     elif update_mode=='mixture_weights':
         self.usr_vec = self.B_p * dy.lookup_batch(self.U_p, usr, True)
     else:
         self.usr_vec = self.B_p * dy.lookup_batch(self.U_p, usr, update)
Example #14
    def encode(self, src, test=False):
        """Encode a batch of sentences
        
        Arguments:
            src (list): List of sentences. It is assumed that all source sentences have the same length
        
        Keyword Arguments:
            test (bool) -- Switch used for things like dropout where the behaviour is different at test time (default: False)
        
        Returns:
            dynet.Expression -- Expression of the encodings
        """
        # Prepare batch
        x, _ = self.prepare_batch(src, self.src_eos)
        # Add encoder to computation graph
        es = self.enc.initial_state()
        # Embed words
        wembs = [dy.lookup_batch(self.MS_p, iw) for iw in x]
        # Encode sentence
        encoded_states = es.transduce(wembs)
        # Use bidirectional encoder
        if self.bidir:
            res = self.rev_enc.initial_state()
            rev_encoded_states = res.transduce(wembs[::-1])[::-1]
        # Create encoding matrix
        H = dy.concatenate_cols(encoded_states)
        if self.bidir:
            H_bidir = dy.concatenate_cols(rev_encoded_states)
            H = dy.concatenate([H, H_bidir])
        if self.word_emb:
            H_word_embs = dy.concatenate_cols(wembs)
            H = dy.concatenate([H, H_word_embs])

        return H
Example #15
    def encode(self, src_sents):
        dy.renew_cg()

        # bidirectional representations
        forward_state = self.enc_forward_builder.initial_state()
        backward_state = self.enc_backward_builder.initial_state()

        src_words, src_masks = input_transpose(src_sents)
        src_words_embeds = [
            dy.lookup_batch(self.src_lookup, wids) for wids in src_words
        ]
        src_words_embeds_reversed = src_words_embeds[::-1]

        forward_states = forward_state.add_inputs(src_words_embeds)
        backward_states = backward_state.add_inputs(
            src_words_embeds_reversed)[::-1]

        src_encodings = []
        forward_cells = []
        backward_cells = []
        for forward_state, backward_state in zip(forward_states,
                                                 backward_states):
            fwd_cell, fwd_enc = forward_state.s()
            bak_cell, bak_enc = backward_state.s()

            src_encodings.append(dy.concatenate([fwd_enc, bak_enc]))
            forward_cells.append(fwd_cell)
            backward_cells.append(bak_cell)

        decoder_init = dy.concatenate([forward_cells[-1], backward_cells[0]])
        return src_encodings, decoder_init
Example #16
 def test_concatenate_to_batch(self):
     dy.renew_cg()
     x = dy.lookup_batch(self.p, [0, 1])
     y = dy.pick_batch_elem(x, 0)
     z = dy.pick_batch_elem(x, 1)
     w = dy.concatenate_to_batch([y, z])
     self.assertTrue(np.allclose(w.npvalue(), self.pval.T))
Example #17
 def test_concatenate_to_batch(self):
     dy.renew_cg()
     x = dy.lookup_batch(self.p, [0, 1])
     y = dy.pick_batch_elem(x, 0)
     z = dy.pick_batch_elem(x, 1)
     w = dy.concatenate_to_batch([y, z])
     self.assertTrue(np.allclose(w.npvalue(), self.pval.T))
Example #18
    def Ext_embeds(self, sentences, predictFlag=False):

        if predictFlag:
            wordtoidx = self.ext_words_devtest
            lookup_matrix = self.elookup_devtest
        else:
            wordtoidx = self.ext_words_train
            lookup_matrix = self.elookup_train

        idxtoword = {ind: word for word, ind in wordtoidx.items()}

        ext_embs = []
        for sent in sentences:
            ext_embs.extend([entry.norm for entry in sent])
        ext_embs_set = list(set(ext_embs))
        ext_embs_idx = []
        for emb in ext_embs_set:
            try:
                w_ind = wordtoidx[emb]
                ext_embs_idx.append(w_ind)
            except KeyError:
                continue
        ext_lookup_batch = dy.lookup_batch(lookup_matrix, ext_embs_idx)
        projected_embs = self.projected_embs(ext_lookup_batch)

        proj_embs = {}
        for idx in range(len(ext_embs_idx)):
            proj_embs[idxtoword[ext_embs_idx[idx]]] = dy.pick_batch_elem(
                projected_embs, idx)

        return proj_embs
 def __call__(self, x, test=True, update=True):
     wembs = [dy.lookup_batch(self.E, iw, update=update) for iw in x]
     # Encode sentence
     encoded_states = self.es.transduce(wembs)
     # Create encoding matrix
     H = dy.concatenate_cols(encoded_states)
     return H
Example #20
 def test_pick_batch_elems(self):
     dy.renew_cg()
     x = dy.lookup_batch(self.p, [0, 1])
     y = dy.pick_batch_elems(x, [0])
     self.assertTrue(np.allclose(y.npvalue(), self.pval[0]))
     z = dy.pick_batch_elems(x, [0, 1])
     self.assertTrue(np.allclose(z.npvalue(), self.pval.T))
Example #21
 def test_pick_batch_elems(self):
     dy.renew_cg()
     x = dy.lookup_batch(self.p, [0, 1])
     y = dy.pick_batch_elems(x, [0])
     self.assertTrue(np.allclose(y.npvalue(), self.pval[0]))
     z = dy.pick_batch_elems(x, [0, 1])
     self.assertTrue(np.allclose(z.npvalue(), self.pval.T))
Example #22
    def RNN_embeds(self, sentences, predictFlag=False):

        tokenIdChars = []
        for sent in sentences:
            tokenIdChars.extend([entry.idChars for entry in sent])
        tokenIdChars_set = set(map(tuple, tokenIdChars))
        tokenIdChars = list(map(list, tokenIdChars_set))
        tokenIdChars.sort(key=lambda x: -len(x))

        char_src_len = len(max(tokenIdChars, key=len))
        chars_mask = []
        char_ids = []
        for i in range(char_src_len):
            char_ids.append([(chars[i] if len(chars) > i else 4)
                             for chars in tokenIdChars])
            char_mask = [(1 if len(chars) > i else 0)
                         for chars in tokenIdChars]
            chars_mask.append(char_mask)
        char_embs = []
        for cid in char_ids:
            char_embs.append(dy.lookup_batch(self.clookup, cid))
        wordslen = list(map(lambda x: len(x), tokenIdChars))

        chr_embs = self.HybridCharembs.predict_sequence_batched(
            char_embs, chars_mask, wordslen, predictFlag)

        RNN_embs = {}
        for idx in range(len(tokenIdChars)):
            RNN_embs[str(tokenIdChars[idx])] = dy.pick_batch_elem(
                chr_embs, idx)

        return RNN_embs
Example #23
 def decode(self, prev_words):
     prev_dec_output = self.decoder(
         dy.lookup_batch(self.tgt_embeddings, prev_words))
     # Using Bahdanau-style attention so we use the previous decoder output
     context_vector, _ = self.attention(self.encoder.encodings_matrix,
                                        prev_dec_output)
     scores = self.decoder.score(prev_dec_output, context_vector)
     return scores
Example #24
 def score_one_sequence(self, tag_scores, tags, batch_size):
     ''' tags: list of tag ids at each time step '''
     # print tags, batch_size
     # print batch_size
     # print "scoring one sentence"
     tags = [[self.start_id] * batch_size
             ] + tags  # len(tag_scores) = len(tags) - 1
     score = dy.inputTensor(np.zeros(batch_size), batched=True)
     # tag_scores = dy.concatenate_cols(tag_scores) # tot_tags, sent_len, batch_size
     # print "tag dim: ", tag_scores.dim()
     for i in range(len(tags) - 1):
         score += dy.pick_batch(dy.lookup_batch(self.transition_matrix, tags[i + 1]), tags[i]) \
                  + dy.pick_batch(tag_scores[i], tags[i + 1])
     score += dy.pick_batch(
         dy.lookup_batch(self.transition_matrix,
                         [self.end_id] * batch_size), tags[-1])
     return score
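The transition lookup in score_one_sequence above pairs lookup_batch with pick_batch; a small self-contained sketch (toy 3x3 transition table and hypothetical tag ids) of that pattern:

import dynet as dy
import numpy as np

m = dy.ParameterCollection()
# Row index = next tag, column index = previous tag, as in the snippet above.
T = m.lookup_parameters_from_numpy(np.arange(9, dtype=np.float32).reshape(3, 3))
dy.renew_cg()
prev_tags = [0, 2]   # previous tag of each sequence in a batch of 2
next_tags = [1, 1]   # next tag of each sequence
rows = dy.lookup_batch(T, next_tags)     # row T[next] for each batch element
scores = dy.pick_batch(rows, prev_tags)  # T[next][prev] per batch element
print(scores.npvalue())                  # [3. 5.]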
    def embed_sentence(self, ws, pwords, ts, chars, is_train):
        cembed = [dy.lookup_batch(self.clookup, c) for c in chars]
        char_fwd, char_bckd = self.char_lstm.builder_layers[0][0].initial_state().transduce(cembed)[-1], \
                              self.char_lstm.builder_layers[0][1].initial_state().transduce(reversed(cembed))[-1]
        crnn = dy.reshape(dy.concatenate_cols([char_fwd, char_bckd]), (self.options.we, ws.shape[0] * ws.shape[1]))
        cnn_reps = [list() for _ in range(len(ws))]
        for i in range(ws.shape[0]):
            cnn_reps[i] = dy.pick_batch(crnn, [i * ws.shape[1] + j for j in range(ws.shape[1])], 1)

        wembed = [dy.lookup_batch(self.wlookup, ws[i]) + dy.lookup_batch(self.elookup, pwords[i]) + cnn_reps[i] for i in range(len(ws))]
        posembed = [dy.lookup_batch(self.tlookup, ts[i]) for i in range(len(ts))]
        if (not is_train) or self.options.dropout == 0:
            return [dy.concatenate([wembed[i], posembed[i]]) for i in range(len(ts))]
        else:
            emb_masks = self.generate_emb_mask(ws.shape[0], ws.shape[1])
            return [dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)]) for w, pos, (wm, posm) in
                      zip(wembed, posembed, emb_masks)]
Example #26
    def run_lstm(self, word_inputs, tag_inputs, isTrain=True):
        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]

        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) +
            dy.lookup_batch(self.pret_word_embs, w, update=False)
            for w in word_inputs
        ]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        common_top_input, c_fs, c_bs = biLSTM(
            self.cLSTM_builders, emb_inputs, batch_size,
            self.dropout_clstm_input if isTrain else 0.,
            self.dropout_clstm_hidden if isTrain else 0.)
        common_top_recur = dy.concatenate_cols(common_top_input)

        private_top_input, p_fs, p_bs = biLSTM(
            self.pLSTM_builders, emb_inputs, batch_size,
            self.dropout_plstm_input if isTrain else 0.,
            self.dropout_plstm_hidden if isTrain else 0.)
        private_top_recur = dy.concatenate_cols(private_top_input)

        if isTrain:
            common_top_recur = dy.dropout_dim(common_top_recur, 1,
                                              self.dropout_mlp)
            private_top_recur = dy.dropout_dim(private_top_recur, 1,
                                               self.dropout_mlp)

        return common_top_recur, private_top_recur, p_fs, p_bs
Example #27
def decode_batch(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]
    #output = [c for c in output]
    output = array([
        output,
    ] * MB_SIZE)
    output = np.transpose(output)
    #print('output ',output)
    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)

    #print('len vectors ', len(vectors))
    #print('dim ', array(vectors[0].value()).shape)
    #print('dim ', vectors[0].value())
    input_mat = dy.concatenate_cols(vectors)
    #print("input_mat dim ", array(input_mat.value()).shape)
    w1dt = None

    last_output_embeddings = dy.lookup_batch(
        output_lookup, array([
            char2int[EOS],
        ] * MB_SIZE))
    #last_output_embeddings = output_lookup[char2int[EOS]]
    #print("last_output_embeddings dim ",array(last_output_embeddings.value()).shape)
    s = dec_lstm.initial_state().add_input(
        dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    losses = []

    for chars in output:
        #print(chars)
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate(
            [attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        #print(out_vector.value())
        loss = dy.pickneglogsoftmax_batch(out_vector, chars)
        #probs = dy.softmax(out_vector)
        last_output_embeddings = dy.lookup_batch(output_lookup, chars)
        #loss.append(-dy.log(dy.pick(probs, char)))
        losses.append(loss)
    return dy.sum_batches(dy.esum(losses))
    def generate(self, minibatch):
        words, pwords, tags, _, _, _, chars, sen_lens, masks = minibatch
        embedded = self.embed_sentence(words, pwords, tags, chars, False)
        encoded = self.encode_sentence(embedded)
        input_mat = dy.concatenate_cols(encoded)
        w1dt = None

        last_output_embeddings = dy.lookup_batch(self.wlookup, words[0])
        last_tag_embeddings = dy.lookup_batch(self.tlookup, tags[0])
        empty_tensor = dy.reshape(dy.inputTensor(np.zeros((self.options.hdim * 2, len(words[0])), dtype=float)),
                                  (self.options.hdim * 2,), len(words[0]))
        s = self.dec_lstm.initial_state().add_input(dy.concatenate([empty_tensor, last_output_embeddings, last_tag_embeddings]))

        out = np.zeros((words.shape[1], words.shape[0]), dtype=int)
        first_mask = np.full((words.shape[0], words.shape[1]), -float('inf'), dtype=float)
        mask = np.zeros((words.shape[0], words.shape[1]), dtype=float)
        first_mask[0] = np.array([0] * words.shape[1])
        mask[0] = np.array([-float('inf')] * words.shape[1])
        for m1 in range(masks.shape[0]):
            for m2 in range(masks.shape[1]):
                if masks[m1][m2] == 0:
                    mask[m1][m2] = -float('inf')
                if sen_lens[m2] - 1 <= m1:
                    mask[m1][m2] = -float('inf')

        for p in range(len(words)):
            # w1dt can be computed and cached once for the entire decoding phase
            w1dt = w1dt or self.attention_w1.expr() * input_mat
            att_weights = self.attend(s, w1dt, False)
            vector = dy.concatenate([input_mat * att_weights, last_output_embeddings, last_tag_embeddings])
            s = s.add_input(vector)

            scores = (att_weights).npvalue().reshape((mask.shape[0], mask.shape[1]))
            cur_mask = first_mask if p == 0 else mask
            scores = np.sum([scores, cur_mask], axis=0)
            next_positions = np.argmax(scores, axis=0)
            next_words = [words[position][i] for i, position in enumerate(next_positions)]
            next_tags = [tags[position][i] for i, position in enumerate(next_positions)]
            for i, position in enumerate(next_positions):
                mask[position][i] = -float('inf')
                out[i][p] = position
            last_output_embeddings = dy.lookup_batch(self.wlookup, next_words)
            last_tag_embeddings = dy.lookup_batch(self.tlookup, next_tags)
        dy.renew_cg()
        return out
    def embed_batch_seq(self, wids):
        """
        Embedding method for a batch of sentences
        :param wids: Word IDs for a batch of sentences
        :return: Word embedding matrix
        """

        wembs_batch = [dynet.lookup_batch(self.src_lookup, wid) for wid in wids]
        return wembs_batch
Example #30
    def BuildLMGraph_batch(self, batch, sent_args=None):
        if "skip_renew" not in sent_args: dynet.renew_cg()

        APPLY_DROPOUT = self.args.dropout is not None and (
            "test" not in sent_args or sent_args["test"] != True)
        if APPLY_DROPOUT: self.gen_rnn.set_dropout(self.args.dropout)
        else: self.gen_rnn.disable_dropout()

        init_state = self.gen_rnn.initial_state()

        #MASK SENTENCES
        isents = []  # Dimension: maxSentLength * minibatch_size

        # List of lists to store whether an input is
        # present(1)/absent(0) for an example at a time step
        masks = []  # Dimension: maxSentLength * minibatch_size

        #No of words processed in this batch
        maxSentLength = max([len(sent) for sent in batch])

        for sent in batch:
            isents.append([self.vocab[word].i for word in sent] + [
                self.vocab[self.vocab.END_TOK].i
                for _ in range(maxSentLength - len(sent))
            ])
            masks.append([1 for _ in sent] +
                         [0 for _ in range(maxSentLength - len(sent))])
        isents = list(map(list, zip(*isents)))  # transpose to time-major; list() so it can be sliced below
        masks = list(map(list, zip(*masks)))

        R = dynet.parameter(self.gen_R)
        bias = dynet.parameter(self.gen_bias)
        vocab_basis = dynet.transpose(
            dynet.concatenate_cols(
                [self.gen_lookup[i] for i in range(self.vocab.size)]))
        errs = []  # will hold expressions
        state = init_state

        for (mask, curr_words, next_words) in zip(masks[1:], isents,
                                                  isents[1:]):
            x_t = dynet.lookup_batch(self.gen_lookup, curr_words)
            state = state.add_input(x_t)
            y_t = state.output()
            if APPLY_DROPOUT: y_t = dynet.dropout(y_t, self.args.dropout)
            r_t = vocab_basis * (bias + (R * y_t))
            err = dynet.pickneglogsoftmax_batch(r_t, next_words)

            ## mask the loss if at least one sentence is shorter. (sents sorted reverse-length, so it must be bottom)
            if mask[-1] == 0:
                mask_expr = dynet.inputVector(mask)
                mask_expr = dynet.reshape(mask_expr, (1, ), len(mask))
                err = err * mask_expr

            errs.append(err)
        nerr = dynet.esum(errs)
        return nerr
Example #31
    def BuildLMGraph(self, sents):
        dy.renew_cg()
        # initialize the RNN
        init_state = self.builder.initial_state()
        # parameters -> expressions
        R = dy.parameter(self.R)
        bias = dy.parameter(self.bias)

        S = vocab.w2i["<s>"]
        # get the cids and masks for each step
        tot_chars = 0
        cids = []
        masks = []

        for i in range(len(sents[0])):
            cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S)
                         for sent in sents])
            mask = [(1 if len(sent) > i else 0) for sent in sents]
            masks.append(mask)
            tot_chars += sum(mask)

        # start the rnn with "<s>"
        init_ids = cids[0]
        s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

        losses = []

        # feed char vectors into the RNN and predict the next char
        for cid, mask in zip(cids[1:], masks[1:]):
            score = dy.affine_transform([bias, R, s.output()])
            loss = dy.pickneglogsoftmax_batch(score, cid)
            # mask the loss if at least one sentence is shorter
            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1, ), len(sents))
                loss = loss * mask_expr

            losses.append(loss)
            # update the state of the RNN
            cemb = dy.lookup_batch(self.lookup, cid)
            s = s.add_input(cemb)

        return dy.sum_batches(dy.esum(losses)), tot_chars
 def _get_probabilities_over_batch(self, batch):
     dy.renew_cg()
     # Iteration i embeds the i-th item of every sequence in the batch
     embedded = [
         dy.lookup_batch(self.input_lookup, chars) for chars in zip(*batch)
     ]
     state = self.rnn.initial_state()
     output_vec = state.transduce(embedded)[-1]
     w = self.W.expr(update=False)
     return w * output_vec
Example #33
    def cal_scores(self, src_encodings, masks, train):

        src_len = len(src_encodings)
        batch_size = src_encodings[0].dim()[1]
        heads_LRlayer = []
        mods_LRlayer = []
        for encoding in src_encodings:
            heads_LRlayer.append(
                self.leaky_ReLu(self.b_head.expr() +
                                self.W_head.expr() * encoding))
            mods_LRlayer.append(
                self.leaky_ReLu(self.b_mod.expr() +
                                self.W_mod.expr() * encoding))

        heads_labels = []
        heads = []
        labels = []
        neg_inf = dy.constant(1, -float("inf"))
        for row in range(
                1, src_len
        ):  #exclude root @ index=0 since roots do not have heads

            scores_idx = []
            for col in range(src_len):

                dist = col - row
                mdist = self.dist_max
                dist_i = (min(dist, mdist - 1) + mdist if dist >= 0 else int(
                    min(-1.0 * dist, mdist - 1)))
                dist_vec = dy.lookup_batch(self.dlookup, [dist_i] * batch_size)
                if train:
                    input_vec = dy.concatenate([
                        dy.esum([
                            dy.dropout(heads_LRlayer[col], self.dropout),
                            dy.dropout(mods_LRlayer[row], self.dropout)
                        ]), dist_vec
                    ])
                else:
                    input_vec = dy.concatenate([
                        dy.esum([heads_LRlayer[col], mods_LRlayer[row]]),
                        dist_vec
                    ])
                score = self.scoreHeadModLabel(input_vec, train)
                mask = masks[row] and masks[col]
                join_scores = []
                for bdx in range(batch_size):
                    if (mask[bdx] == 1):
                        join_scores.append(dy.pick_batch_elem(score, bdx))
                    else:
                        join_scores.append(
                            dy.concatenate([neg_inf] * self.n_labels))
                scores_idx.append(dy.concatenate_to_batch(join_scores))
            heads_labels.append(dy.concatenate(scores_idx))

        return heads_labels
Example #34
    def BuildLMGraph(self, sents):
        dy.renew_cg()
        # initialize the RNN
        init_state = self.builder.initial_state()
        # parameters -> expressions
        R = dy.parameter(self.R)
        bias = dy.parameter(self.bias)

        S = vocab.w2i["<s>"]
        # get the cids and masks for each step
        tot_chars = 0
        cids = []
        masks = []

        for i in range(len(sents[0])):
            cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
            mask = [(1 if len(sent)>i else 0) for sent in sents]
            masks.append(mask)
            tot_chars += sum(mask)

        # start the rnn with "<s>"
        init_ids = cids[0]
        s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

        losses = []

        # feed char vectors into the RNN and predict the next char
        for cid, mask in zip(cids[1:], masks[1:]):
            score = dy.affine_transform([bias, R, s.output()])
            loss = dy.pickneglogsoftmax_batch(score, cid)
            # mask the loss if at least one sentence is shorter
            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1,), len(sents))
                loss = loss * mask_expr

            losses.append(loss)
            # update the state of the RNN
            cemb = dy.lookup_batch(self.lookup, cid)
            s = s.add_input(cemb)

        return dy.sum_batches(dy.esum(losses)), tot_chars
Example #35
def calc_score_of_histories(words, dropout=0.0):
  # This will change from a list of histories, to a list of words in each history position
  words = np.transpose(words)
  # Lookup the embeddings and concatenate them
  emb = dy.concatenate([dy.lookup_batch(W_emb, x) for x in words])
  # Create the hidden layer
  h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
  # Perform dropout
  if dropout != 0.0:
    h = dy.dropout(h, dropout)
  # Calculate the score and return
  return dy.affine_transform([b_sm, W_sm, h])
def calc_sent_loss(sent):
  # Create a computation graph
  dy.renew_cg()
  
  # Get embeddings for the sentence
  emb = [W_w_p[x] for x in sent]

  # Sample K negative words for each predicted word at each position
  all_neg_words = np.random.choice(nwords, size=2*N*K*len(emb), replace=True, p=word_probabilities)

  # W_w = dy.parameter(W_w_p)
  # Step through the sentence and calculate the negative and positive losses
  all_losses = [] 
  for i, my_emb in enumerate(emb):
    neg_words = all_neg_words[i*K*2*N:(i+1)*K*2*N]
    pos_words = ([sent[x] if x >= 0 else S for x in range(i-N,i)] +
                 [sent[x] if x < len(sent) else S for x in range(i+1,i+N+1)])
    neg_loss = -dy.log(dy.logistic(-dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words))))
    pos_loss = -dy.log(dy.logistic(dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words))))
    all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))
  return dy.esum(all_losses)
Example #37
def calc_lm_loss(sents):
    dy.renew_cg()

    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_exp, W_exp, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
Example #38
import dynet as dy
import numpy as np

m = dy.Model()
lp = m.add_lookup_parameters((100,10))

# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1,2,3])
print(abc1.npvalue())

abc2 = lp.batch([1,2,3])
print(abc2.npvalue())

print(np.hstack([a,b,c]))


# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")
W = dy.parameter( m.add_parameters((5, 10)) )
h = W * lp.batch([1,2,3])
print(h.npvalue())
print(dy.pick_batch(h,[1,2,3]).npvalue())
print(dy.pick(W*lp[1],1).value(), dy.pick(W*lp[2],2).value(), dy.pick(W*lp[3],3).value())
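The comment above mentions pickneglogsoftmax in batch mode but only demonstrates pick_batch; a short self-contained sketch (shapes mirroring the snippet above, made-up gold classes) of the batched loss variant:

import dynet as dy

m = dy.Model()
lp = m.add_lookup_parameters((100, 10))
pW = m.add_parameters((5, 10))

dy.renew_cg()
scores = dy.parameter(pW) * dy.lookup_batch(lp, [1, 2, 3])  # (5,) scores, batch size 3
losses = dy.pickneglogsoftmax_batch(scores, [0, 1, 4])      # one gold class id per batch element
print(losses.npvalue())                 # three per-element losses
print(dy.sum_batches(losses).value())   # their sum as a single scalar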
Example #39
 def test_lookup_batch(self):
     dy.renew_cg()
     x = dy.lookup_batch(self.p, [0, 1])
     self.assertTrue(np.allclose(x.npvalue(), self.pval.T))
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the current state into the 
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words