Example 1
    def cal_scores(self, src_encodings, predict=False):

        src_len = len(src_encodings)
        src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size
        batch_size = src_encodings.dim()[1]

        W_pos = dy.parameter(self.W_pos)
        b_pos = dy.parameter(self.b_pos)
        W_xpos = dy.parameter(self.W_xpos)
        b_xpos = dy.parameter(self.b_xpos)


        W_affine_pos = dy.parameter(self.W_affine_pos)
        b_affine_pos = dy.parameter(self.b_affine_pos)
        W_affine_xpos = dy.parameter(self.W_affine_xpos)
        b_affine_xpos = dy.parameter(self.b_affine_xpos)

        if predict:
            pos = self.leaky_ReLu(dy.affine_transform([b_pos, W_pos, src_encodings]))  # n_pos_mlp_units, src_len, bs
            xpos = self.leaky_ReLu(dy.affine_transform([b_xpos, W_xpos, src_encodings]))

        else:
            src_encodings = dy.dropout_dim(src_encodings, 1, self.dropout)
            pos = dy.dropout_dim(self.leaky_ReLu(dy.affine_transform([b_pos, W_pos, src_encodings])), 1, self.dropout)  # n_pos_mlp_units, src_len, bs
            xpos = dy.dropout_dim(self.leaky_ReLu(dy.affine_transform([b_xpos, W_xpos, src_encodings])), 1, self.dropout)


        pos_label = dy.affine_transform([b_affine_pos, dy.transpose(W_affine_pos), pos])
        xpos_label = dy.affine_transform([b_affine_xpos, dy.transpose(W_affine_xpos), xpos])

        return pos_label, xpos_label
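Every example on this page uses dy.dropout_dim. As a quick orientation before the remaining snippets, here is a minimal, self-contained sketch (not taken from any example above; the tensor sizes are invented) contrasting it with elementwise dy.dropout on a (hidden_dim, seq_len) matrix: dropout_dim reuses one mask along the dimension you pass, so with d=1 a dropped hidden unit is dropped at every position in the sequence.

import dynet as dy
import numpy as np

dy.renew_cg()
H = dy.inputTensor(np.random.rand(4, 6))  # (hidden_dim=4, seq_len=6)

elementwise = dy.dropout(H, 0.5)          # each entry is dropped independently
shared_mask = dy.dropout_dim(H, 1, 0.5)   # one (4, 1) mask reused along dim 1:
                                          # a dropped hidden unit is zeroed at every position

print(shared_mask.npvalue())              # zeroed rows; surviving rows scaled by 1 / (1 - p)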
Example 2
    def predict_sequence_batched(self,
                                 inputs,
                                 mask_array,
                                 wlen,
                                 predictFlag=False):

        batch_size = inputs[0].dim()[1]
        src_len = len(inputs)

        if not predictFlag:
            self.charlstm.set_dropouts(self.dropout, self.dropout)
            self.charlstm.set_dropout_masks(batch_size)

        char_fwd = self.charlstm.initial_state(batch_size)
        recur_states, cells = char_fwd.add_inputs(inputs, mask_array,
                                                  predictFlag)

        hidden_states = []
        for idx in range(src_len):
            mask = dy.inputVector(mask_array[idx])
            mask_expr = dy.reshape(mask, (1, ), batch_size)
            hidden_states.append(recur_states[idx] * mask_expr)

        H = dy.concatenate_cols(hidden_states)

        if (predictFlag):
            a = dy.softmax(dy.transpose(self.W_atten.expr()) * H)
        else:
            # dropout attention connections (keep the same dim across the sequence)
            a = dy.softmax(
                dy.transpose(self.W_atten.expr()) *
                dy.dropout_dim(H, 1, self.dropout))

        cell_states = []
        for idx in range(batch_size):
            if (wlen[idx] > 0):
                cell = dy.pick_batch_elem(cells[wlen[idx] - 1], idx)
            else:
                cell = dy.zeros(self.ldims)

            cell_states.append(cell)

        C = dy.concatenate_to_batch(cell_states)

        H_atten = H * dy.transpose(a)
        char_emb = dy.concatenate([H_atten, C])

        if predictFlag:
            proj_char_emb = dy.affine_transform(
                [self.b_linear.expr(),
                 self.W_linear.expr(), char_emb])
        else:
            proj_char_emb = dy.affine_transform([
                self.b_linear.expr(),
                self.W_linear.expr(),
                dy.dropout(char_emb, self.dropout)
            ])

        return proj_char_emb
Example 3
    def run_lstm(self, word_inputs, tag_inputs, isTrain=True):
        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]

        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) +
            dy.lookup_batch(self.pret_word_embs, w, update=False)
            for w in word_inputs
        ]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        common_top_input, c_fs, c_bs = biLSTM(
            self.cLSTM_builders, emb_inputs, batch_size,
            self.dropout_clstm_input if isTrain else 0.,
            self.dropout_clstm_hidden if isTrain else 0.)
        common_top_recur = dy.concatenate_cols(common_top_input)

        private_top_input, p_fs, p_bs = biLSTM(
            self.pLSTM_builders, emb_inputs, batch_size,
            self.dropout_plstm_input if isTrain else 0.,
            self.dropout_plstm_hidden if isTrain else 0.)
        private_top_recur = dy.concatenate_cols(private_top_input)

        if isTrain:
            common_top_recur = dy.dropout_dim(common_top_recur, 1,
                                              self.dropout_mlp)
            private_top_recur = dy.dropout_dim(private_top_recur, 1,
                                               self.dropout_mlp)

        return common_top_recur, private_top_recur, p_fs, p_bs
Example 4
 def __call__(self, x, train=False):
     h = x
     # for W, b in zip(self.W[:-1], self.b[:-1]):
     for i in range(len(self.W[:-1])):
         h = self.act(self.W[i]*h + (self.b[i] if self.bias else 0))
         if train: 
             if len(h.dim()[0]) > 1: h = dy.dropout_dim(h, 1, self.dropout)
             else: h = dy.dropout(h, self.dropout)
     return self.W[-1]*h + (self.b[-1] if self.bias else 0)
Example 5
 def __call__(self, x):
     for layer, dim in zip(self.layers, self.outdim):
         x = layer(x)
         if self.dropout > 0.:
             if self.dropout_dim >= 0:
                 x = dropout_dim(x, self.dropout_dim, self.dropout)
             else:
                 x = dropout(x, self.dropout)
     return x
Example 6
 def __call__(self, x, train=False):
     h = x
     for i in range(len(self.W[:-1])):
         h = self.f(self.W[i] * h + (self.b[i] if self.bias else 0))
         if train:
             if len(h.dim()[0]) > 1:
                 h = dy.dropout_dim(h, 1, self.p)
             else:
                 h = dy.dropout(h, self.p)
     return self.W[-1] * h + (self.b[-1] if self.bias else 0)
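Examples 4 and 6 guard the call with len(h.dim()[0]) > 1. A short sketch of why, assuming the standard DyNet Python API: Expression.dim() returns ((d0, d1, ...), batch_size), so the check separates matrices, where dropout_dim can share a mask along a dimension, from plain vectors, which fall back to elementwise dropout.

import dynet as dy
import numpy as np

dy.renew_cg()
vec = dy.inputTensor(np.zeros(5))        # vec.dim() == ((5,), 1)
mat = dy.inputTensor(np.zeros((5, 7)))   # mat.dim() == ((5, 7), 1)

for h in (vec, mat):
    if len(h.dim()[0]) > 1:
        h = dy.dropout_dim(h, 1, 0.3)    # matrix: share the mask along dim 1
    else:
        h = dy.dropout(h, 0.3)           # vector: plain elementwise dropout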
Example 7
 def encode(self, sentence):
     if self._train_flag:
         return dy.dropout_dim(
            self._bilstm.encode(\
             self._base.encode(sentence),\
             len(sentence)),
            1, self.dropout_rate)
     else:
         return self._bilstm.encode(\
            self._base.encode(sentence),\
            len(sentence))
Example 8
 def next(self, word_idx, context, train, cur_state=None):
     embs = dy.pick_batch(self.E, word_idx)
     if train:
         embs = dy.dropout_dim(embs, 0, self.word_dropout)
     x = dy.concatenate([embs, context])
     if cur_state is None:
         self.dec_state = self.dec_state.add_input(x)
         next_state = self.dec_state
     else:
         next_state = cur_state.add_input(x)
     hidden = next_state.output()
     return hidden, embs, next_state
Example 9
 def next(self, w, c, test=True, state=None):
     e = dy.pick_batch(self.E, w)
     if not test:
         e = dy.dropout_dim(e, 0, self.wdr)
     # Run LSTM
     if state is None:
         self.ds = self.ds.add_input(e)
         next_state = self.ds
     else:
         next_state = state.add_input(e)
     h = next_state.output()
     return h, e, next_state
Example 10
    def next(self, w, c, test=True, state=None):
        if isinstance(w, dy.Expression):
            e = w
        else:
            e = dy.pick_batch(self.E, w)

        if not test:
            e = dy.dropout_dim(e, 0, self.wdr)
        x = dy.concatenate([e, c])
        # Run LSTM
        if state is None:
            self.ds = self.ds.add_input(x)
            next_state = self.ds
        else:
            next_state = state.add_input(x)
        h = next_state.output()
        return h, e, next_state
Example 11
 def transduce(self, inputs, train):
     xs = inputs[:self.max_length]
     if not xs:
         return []
     for i in range(self.lstm_layers):
         for n, d in ("f", 1), ("b", -1):
             Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)] for p in ("Wr", "br", "Wh")]
             hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
             hs = [hs_[0]]
             for t in range(1, len(hs_)):
                 r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                 hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
             xs = hs
             if train:
                 x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
                 xs = [dy.pick(x, i, 1) for i in range(len(xs))]
     return xs
Example 12
 def transduce(self, inputs, train):
     xs = inputs[:self.max_length]
     if not xs:
         return []
     for i in range(self.lstm_layers):
         for n, d in ("f", 1), ("b", -1):
             Wr, br, Wh = [
                 dy.parameter(self.params["%s%d%s" % (p, i, n)])
                 for p in ("Wr", "br", "Wh")
             ]
             hs_ = self.params["rnn%d%s" %
                               (i, n)].initial_state().transduce(xs[::d])
             hs = [hs_[0]]
             for t in range(1, len(hs_)):
                 r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) +
                                 br)
                 hs.append(
                     dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
             xs = hs
             if train:
                 x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
                 xs = [dy.pick(x, i, 1) for i in range(len(xs))]
     return xs
Example 13
    def run(self,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            isTrain=True):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))

        if isTrain or arc_targets is not None:
            mask_1D = dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        word_embs = [
            dy.lookup_batch(self.word_embs,
                            np.where(w < self._vocab.words_in_train, w,
                                     self._vocab.UNK),
                            update=True)
            #+ dy.lookup_batch(self.pret_word_embs, w, update = False) # remove 1 line
            for w in word_inputs
        ]
        tag_embs = [
            dy.lookup_batch(self.tag_embs, pos, update=True)
            for pos in tag_inputs
        ]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep, head = leaky_relu(dy.affine_transform([
            b_dep, W_dep, top_recur
        ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if isTrain:
            dep, head = dy.dropout_dim(dep, 1,
                                       self.dropout_mlp), dy.dropout_dim(
                                           head, 1, self.dropout_mlp)

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        # seq_len x batch_size

        if isTrain or arc_targets is not None:
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
            # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        #dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        #head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
            # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
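The dynet_flatten_numpy helper above pairs column-major ('F') flattening on the numpy side with the batched dy.reshape(..., (seq_len, ), seq_len * batch_size) on the DyNet side, so that the flattened gold heads line up with the flattened logits handed to dy.pickneglogsoftmax_batch. A tiny numpy-only sketch of the flattening itself (array contents invented for illustration):

import numpy as np

seq_len, batch_size = 3, 2
targets = np.arange(seq_len * batch_size).reshape(seq_len, batch_size)
# targets[j, b] is the gold head of token j in sentence b
flat = np.reshape(targets, (-1, ), 'F')   # same as dynet_flatten_numpy above

# column-major flattening walks down sentence 0 first, then sentence 1,
# so token j of sentence b lands at index b * seq_len + j
assert flat.tolist() == [0, 2, 4, 1, 3, 5]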
Example 14
    def run_parser(self,
                   word_inputs,
                   common_top_recur,
                   private_top_recur,
                   arc_targets=None,
                   rel_targets=None,
                   isTrain=True):
        # inputs, targets: seq_len x batch_size

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))
        top_recur = dy.concatenate([common_top_recur, private_top_recur])

        if isTrain or arc_targets is not None:
            mask_1D = self.dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
        head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if isTrain:
            dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
            head = dy.dropout_dim(head, 1, self.dropout_mlp)

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        # seq_len x batch_size

        if isTrain or arc_targets is not None:
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = self.dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = self.dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
Example 15
    def run(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):

        is_train = arc_targets is not None

        # @djam modification
        word_inputs = word_inputs.T
        tag_inputs = tag_inputs.T

        if arc_targets is not None:
            arc_targets[:, 0] = 0
            arc_targets = arc_targets.T
            targets_1D = dynet_flatten_numpy(arc_targets)


        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]

        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        
        if self.pret_word_embs:
            word_embs = [
                dy.lookup_batch(self.word_embs, np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
                + dy.lookup_batch(self.pret_word_embs, w, update=False)
                for w in word_inputs
            ]
        else:
            word_embs = [dy.lookup_batch(self.word_embs, np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) for w in word_inputs]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]
        
        if is_train:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)]) for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)]
        else:
            emb_inputs = [dy.concatenate([w, pos]) for w, pos in zip(word_embs, tag_embs)]

        top_recur = dy.concatenate_cols(biLSTM(self.LSTM_builders, emb_inputs, batch_size, self.dropout_lstm_input if is_train else 0., self.dropout_lstm_hidden if is_train else 0.))
        if is_train:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(self.mlp_head_b)
        dep, head = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if is_train:
            dep, head = dy.dropout_dim(dep, 1, self.dropout_mlp), dy.dropout_dim(head, 1, self.dropout_mlp)
        
        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len, batch_size, num_outputs= 1, bias_x = True, bias_y = False)
        # (#head x #dep) x batch_size

        arc_preds = arc_logits.npvalue().argmax(0)
        arc_preds = arc_preds if arc_preds.ndim == 2 else arc_preds[:, None]
        # seq_len x batch_size
        
        W_rel = dy.parameter(self.rel_W)
        
        rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len, batch_size, num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
        # (#head x rel_size x #dep) x batch_size
        
        flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(flat_rel_logits, targets_1D if is_train else dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        # @djam - restored shape
        partial_rel_logits = dy.reshape(partial_rel_logits, (self._vocab.rel_size, seq_len), batch_size)
        
        # if not isTrain:
        arc_probs = np.transpose(np.reshape(dy.softmax(arc_logits).npvalue(), (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head
        rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(), (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        # @djam contribution
        if is_train:
            # 'decode' with argmax
            # Why on earth can't I get this to work?
            # arc_predictions = arc_probs.argmax(1)
            arc_predictions = arc_preds.T
            # batch_size x dep

            _1 = np.repeat(range(batch_size), seq_len)  # batches
            _2 = np.tile(range(seq_len), batch_size)  # modifiers
            _3 = arc_predictions.reshape(-1)  # predicted arcs

            rel_predictions = rel_probs[_1, _2, _3].argmax(-1)
            rel_predictions = rel_predictions.reshape(batch_size, seq_len)
            # batch_size x dep

            return arc_predictions, rel_predictions, arc_logits, partial_rel_logits
        else:
            arc_predictions, rel_predictions = [], []
            for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
                msk[0] = 1.
                sent_len = int(np.sum(msk))
                arc_pred = uniparse.arc_argmax(arc_prob, sent_len, msk)
                rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
                rel_pred = uniparse.rel_argmax(rel_prob, sent_len)

                arc_predictions.append(arc_pred[:sent_len])
                rel_predictions.append(rel_pred[:sent_len])
            
            return arc_predictions, rel_predictions, None, None
Example 16
    def run(self, word_inputs, lemma_inputs, tag_inputs, pred_golds, rel_targets=None, isTrain=True):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1,), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        marker = self._vocab.PAD if self._unified else self._vocab.DUMMY
        mask = np.greater(word_inputs, marker).astype(np.float32)
        num_tokens = int(np.sum(mask))

        word_embs = [dy.lookup_batch(self.word_embs,
                                     np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)
                                     ) for w in word_inputs]
        pre_embs = [dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs]
        flag_embs = [dy.lookup_batch(self.flag_embs,
                                     np.array(w == i + 1, dtype=int)
                                     ) for i, w in enumerate(pred_golds)]
        lemma_embs = [dy.lookup_batch(self.lemma_embs, lemma) for lemma in lemma_inputs]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [dy.concatenate([dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm),
                                          dy.cmult(lemma, wm), dy.cmult(pos, posm)])
                          for word, pre, flag, lemma, pos, (wm, posm) in
                          zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs, emb_masks)]

        else:
            emb_inputs = [dy.concatenate([word, pre, flag, lemma, pos])
                          for word, pre, flag, lemma, pos in
                          zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs)]

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_arg, b_arg = dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b)
        W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(self.mlp_pred_b)
        arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur]))
        # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur]))
        predicates_1D = pred_golds[0]
        pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1)
        pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, pred_recur]))
        if isTrain:
            arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp)
            # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp)
            pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp)

        W_rel = dy.parameter(self.rel_W)

        # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, batch_size,
        # 						num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
        # # (#pred x rel_size x #arg) x batch_size

        # flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size)
        # # (#pred x rel_size) x (#arg x batch_size)

        # predicates_1D = dynet_flatten_numpy(pred_golds)
        # partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # # (rel_size) x (#arg x batch_size)

        rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size,
                              num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
        # (1 x rel_size x #arg) x batch_size
        flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size), seq_len * batch_size)
        # (1 x rel_size) x (#arg x batch_size)

        predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0])
        partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # (1 x rel_size) x (#arg x batch_size)

        if isTrain:
            mask_1D = dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
            return rel_accuracy, rel_loss

        # rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
        # 									(self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))

        rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                                            (self._vocab.rel_size, 1, seq_len, batch_size), 'F'))
        outputs = []

        # for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
        # 	msk[0] = 1.
        # 	sent_len = int(np.sum(msk))
        # 	rel_prob = rel_prob[np.arange(len(pred_gold)), pred_gold]
        # 	rel_pred = rel_argmax(rel_prob)
        # 	outputs.append(rel_pred[:sent_len])

        for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            rel_prob = rel_prob[np.arange(len(pred_gold)), 0]
            rel_pred = rel_argmax(rel_prob)
            outputs.append(rel_pred[:sent_len])

        return outputs
Example 17
    def run(self,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            isTrain=True):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))

        if isTrain or arc_targets is not None:
            mask_1D = dynet_flatten_numpy(mask)
            # batched here means that the last dim is treated as batch dimension, both in input and output
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        # TODO: note _words_in_train
        # the two embeddings are summed, [Expression of dim=((embedding_dim,), batch_size)] * seq_len
        if self.pre_train_emb:
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) +
                dy.lookup_batch(self.pret_word_embs, w, update=False)
                for w in word_inputs
            ]  # the two embeddings summed, [Expression] * seq_len
        else:
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) for w in word_inputs
            ]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep, head = leaky_relu(dy.affine_transform([
            b_dep, W_dep, top_recur
        ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if isTrain:
            dep, head = dy.dropout_dim(dep, 1,
                                       self.dropout_mlp), dy.dropout_dim(
                                           head, 1, self.dropout_mlp)
            # passing 1 means that in some cases everything along dim 1 becomes 0;
            # dim=0 drops columns, dim=1 drops rows, and the third dimension is the batch

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len *
                                     batch_size)  # flattened this way so the loss can be computed
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        # seq_len x batch_size

        if isTrain or arc_targets is not None:
            # the highest-scoring arcs are used to compute the loss; that does not mean they are
            # kept as the decoding result, but they still have to be penalized
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask  # the mask is still needed here
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
            # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        #dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        #head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D  # given the shape here, the 1D mask is needed
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
            # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            # agreed: this part of parse decoding simply cannot be batched
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len],
                            rel_pred[1:sent_len]))  # index 0 really is the ROOT token

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
Example 18
    def __call__(self, inputs, masks, truth, is_train=True, is_tree=True):
        sent_len = len(inputs)
        batch_size = inputs[0].dim()[1]
        flat_len = sent_len * batch_size

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        X = dy.concatenate_cols(inputs)
        if is_train:
            X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
        # M_H -> MLP hidden size
        # ((M_H, L), B)
        # head_mat = leaky_relu(self.head_MLP(X, is_train))
        head_mat = self.head_MLP(X, is_train)
        # ((M_H, L), B)
        dept_mat = self.dept_MLP(X, is_train)
        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_mat = dy.dropout_dim(head_mat, 1, self.cfg.MLP_DROP)
            dept_mat = dy.dropout_dim(dept_mat, 1, self.cfg.MLP_DROP)

        # A_H -> Arc hidden size, R_H -> Label hidden size, A_H + R_H = M_H
        head_arc = head_mat[:self.arc_size]  # ((A_H, L), B)
        dept_arc = dept_mat[:self.arc_size]  # ((A_H, L), B)
        head_rel = head_mat[self.arc_size:]  # ((R_H, L), B)
        dept_rel = dept_mat[self.arc_size:]  # ((R_H, L), B)

        # ((L, L), B)
        masks_2D = dy.inputTensor(masks['2D'], True)
        # (1, L*B)
        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        for k in range(self.cfg.GRAPH_LAYERS):
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc,
                                           dept_arc) - 1e9 * (1 - masks_2D)
            arc_prob = dy.softmax(arc_mat)

            # Layer-wise Loss
            if is_train:
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
                # ((1,), L*B)
                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                # (1,)
                arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
                gnn_losses.append(arc_loss)

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX

            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc,
                                        dept_arc) - 1e9 * (1 - masks_2D)
        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)
        if is_train:
            # ((1,), L*B)
            arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
            # ((R_H,), L*B)
            truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
            # R -> Relation Set Size
            # ((R,), L*B)
            rel_mat = self.rel_attn(dept_rel, truth_rel)
        else:
            if is_tree:
                # MST Inference, Achieve Tree Edge.
                arc_probs = dy.softmax(arc_mat).npvalue()
                arc_probs = np.reshape(arc_probs,
                                       (sent_len, sent_len, batch_size), 'F')
                arc_probs = np.transpose(arc_probs)
                # Mask PAD
                arc_masks = [
                    np.array(masks['flat'][i:i + sent_len])
                    for i in range(0, flat_len, sent_len)
                ]
                arc_pred = []
                # Inference One By One.
                for msk, arc_prob in zip(arc_masks, arc_probs):
                    msk[0] = 1
                    seq_len = int(np.sum(msk))
                    tmp_pred = MST_inference(arc_prob, seq_len, msk)
                    tmp_pred[0] = 0
                    arc_pred.extend(tmp_pred)
            else:
                # Greedy Inference (argmax)
                arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # Pick Predicted Edge's <Head, Dept> pair.
            flat_pred = [
                j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
            ]
            pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
            # Predict Relation (mask ROOT)
            rel_mat = self.rel_attn(dept_rel, pred_rel)
            rel_mask = dy.inputTensor(self.rel_mask)
            rel_mat = rel_mat - 1e9 * rel_mask
        if is_train:
            # Calculate Relation Classification Loss
            # ((1,), L*B)
            rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
            # (1,)
            rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
            # Final Total Loss with Layer-wise
            losses = (rel_loss + arc_loss) * self.cfg.LAMBDA2
            if gnn_losses:
                losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
            losses_list = gnn_losses + [arc_loss, rel_loss]
            return losses, losses_list
        else:
            rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
            rel_pred = np.argmax(rel_mat, 0)
            pred = {}
            pred['head'], pred['rel'] = arc_pred, rel_pred
            return pred
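Examples 18 and 19 mask padded positions additively, subtracting 1e9 * (1 - mask) from the attention scores before the softmax. A small numpy-only sketch of that trick (scores and mask invented for illustration): the large negative offset drives the probability of padded positions to roughly zero while leaving the distribution over real tokens intact.

import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

scores = np.array([2.0, 1.0, 0.5, 0.1])
mask = np.array([1.0, 1.0, 0.0, 0.0])    # last two positions are padding
masked = scores - 1e9 * (1.0 - mask)     # padded scores become hugely negative
print(softmax(masked).round(3))          # padding receives ~0 probability mass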
Example 19
    def __call__(self, inputs, masks, truth, iters, is_train=True, is_tree=True):
        if type(inputs) == list:
            sent_len = len(inputs)
            batch_size = inputs[0].dim()[1]
            X = dy.concatenate_cols(inputs)

        else:
            sent_len = inputs.dim()[0][0]
            batch_size = inputs.dim()[1]
            X = dy.transpose(inputs, [1, 0])

        flat_len = sent_len * batch_size

        #sent_len = len(inputs) 
        #batch_size = inputs[0].dim()[1]
        #flat_len = sent_len * batch_size

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        #X = dy.concatenate_cols(inputs)
        if is_train: X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
        # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
        # ((A_H, L), B)
        head_arc = self.head_arc_MLP(X, is_train)
        dept_arc = self.dept_arc_MLP(X, is_train)
        # ((R_H, L), B)
        head_rel = self.head_rel_MLP(X, is_train)
        dept_rel = self.dept_rel_MLP(X, is_train)

        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
            head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
            dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
            dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)

        # ((L, L), B)
        masks_2D = 1e9*(1-dy.inputTensor(masks['2D'], True))
        # (1, L*B)
        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        arc_norm = math.sqrt(self.arc_size)
        rel_norm = math.sqrt(self.rel_size)
        for k in range(self.cfg.GRAPH_LAYERS):
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc, dept_arc)/arc_norm-masks_2D
            arc_prob = dy.softmax(arc_mat)
            # Layer-wise Loss
            if is_train:
                arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)
                # ((1,), L*B)
                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                # (1,)
                arc_loss = dy.sum_batches(arc_loss*masks_flat)/total_token
                gnn_losses.append(arc_loss)

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX
            
            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

            # Relation Aggregation Function
            # Sync update 
            # ((R_H, L), B)
            HR = head_rel * arc_prob
            DR = dept_rel * dy.transpose(arc_prob)
            FX = HR+DR
            head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
            dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc)/arc_norm-masks_2D

        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size,), flat_len)
        if is_train:
            
            # print(arc_mat.dim()) # ((3,), 300)
            # arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # print(arc_pred.shape) # (300,)
            # print(arc_pred) # all 0's and 1's

            # ((1,), L*B)
            arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_losses*masks_flat)/total_token
            # ((R_H,), L*B)
            truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
            # R -> Relation Set Size
            # ((R,), L*B)
            rel_mask = 1e9*dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, truth_rel)/rel_norm - rel_mask

            # Calculate Relation Classification Loss
            # ((1,), L*B)
            rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
            # (1,)
            rel_loss = dy.sum_batches(rel_losses*masks_flat) / total_token
            # Final Total Loss with Layer-wise
            warm = [int(iters>=x) for x in self.warm_list]
            losses = rel_loss*self.cfg.LAMBDA2*warm[-1]+arc_loss*self.cfg.LAMBDA2*warm[-1]
            if gnn_losses:
                for i in range(self.cfg.GRAPH_LAYERS):
                    gnn_losses[i] *= warm[i]
                losses += dy.esum(gnn_losses)*self.cfg.LAMBDA1
            losses_list = gnn_losses + [arc_loss, rel_loss]
            return losses, losses_list
        else:
            if is_tree:
                # MST Inference, Achieve Tree Edge.
                arc_probs = dy.softmax(arc_mat).npvalue()
                arc_probs = np.reshape(arc_probs, (sent_len, sent_len, batch_size), 'F')
                arc_probs = np.transpose(arc_probs)
                # Mask PAD
                arc_masks = [np.array(masks['flat'][i:i+sent_len])
                             for i in range(0, flat_len, sent_len)]
                arc_pred = []
                # Inference One By One.
                for msk, arc_prob in zip(arc_masks, arc_probs):
                    msk[0] = 1
                    seq_len = int(np.sum(msk))
                    tmp_pred = MST_inference(arc_prob, seq_len, msk)
                    tmp_pred[0] = 0
                    arc_pred.extend(tmp_pred)
            else:
                # Greedy Inference (argmax)
                arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # Pick Predicted Edge's <Head, Dept> pair.
            flat_pred = [j+(i//sent_len)*sent_len for i, j in enumerate(arc_pred)]
            pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
            # Predict Relation (mask ROOT)
            rel_mask = 1e9*dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, pred_rel)/rel_norm-rel_mask
            rel_mat = dy.reshape(rel_mat, (self.rel_num,)).npvalue()
            rel_pred = np.argmax(rel_mat, 0)
            pred = {}
            pred['head'], pred['rel'] = arc_pred, rel_pred
            return pred
Example 20
    def single_training_call(self,
                             inputs,
                             masks,
                             truth,
                             iters,
                             is_train=True,
                             is_tree=True):
        if type(inputs) == list:
            sent_len = len(inputs)
            batch_size = inputs[0].dim()[1]
            X = dy.concatenate_cols(inputs)

        else:
            sent_len = inputs.dim()[0][0]
            batch_size = inputs.dim()[1]
            X = dy.transpose(inputs, [1, 0])

        flat_len = sent_len * batch_size

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        #X = dy.concatenate_cols(inputs)
        if is_train: X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
        # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
        # ((A_H, L), B)
        head_arc = self.head_arc_MLP(X, is_train)
        dept_arc = self.dept_arc_MLP(X, is_train)
        # ((R_H, L), B)
        head_rel = self.head_rel_MLP(X, is_train)
        dept_rel = self.dept_rel_MLP(X, is_train)

        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
            head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
            dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
            dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)
        else:
            # added by me
            total_token = None

        # ((L, L), B)
        masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))
        # (1, L*B)
        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        arc_norm = math.sqrt(self.arc_size)
        rel_norm = math.sqrt(self.rel_size)
        for k in range(self.cfg.GRAPH_LAYERS):
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc,
                                           dept_arc) / arc_norm - masks_2D
            arc_prob = dy.softmax(arc_mat)
            # Layer-wise Loss
            if is_train:
                arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
                # ((1,), L*B)
                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                # (1,)
                arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
                gnn_losses.append(arc_loss)

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX

            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

            # Relation Aggregation Function
            # Sync update
            # ((R_H, L), B)
            HR = head_rel * arc_prob
            DR = dept_rel * dy.transpose(arc_prob)
            FX = HR + DR
            head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
            dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc,
                                        dept_arc) / arc_norm - masks_2D
        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)

        return arc_mat, head_rel, dept_rel, masks_flat, total_token, gnn_losses, sent_len, batch_size, arc_norm, rel_norm, flat_len
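The pattern above (and in the examples that follow) flattens a batched ((L, L), B) score matrix into ((L,), L*B) so that each dependent token becomes its own batch element before a masked softmax loss is taken. A minimal, self-contained sketch of that step with toy sizes and random data (all names and values here are illustrative only):

import dynet as dy
import numpy as np

sent_len, batch_size = 5, 2
flat_len = sent_len * batch_size

# Fake arc scores of dim ((L, L), B), as an arc attention matrix would produce.
scores = dy.inputTensor(np.random.randn(sent_len, sent_len, batch_size), batched=True)
# ((L,), L*B): one score vector over candidate heads per dependent token.
flat_scores = dy.reshape(scores, (sent_len,), flat_len)

gold_heads = np.random.randint(0, sent_len, size=flat_len).tolist()  # toy gold heads
mask_flat = dy.inputTensor(np.ones(flat_len, dtype=np.float32), batched=True)

losses = dy.pickneglogsoftmax_batch(flat_scores, gold_heads)   # ((1,), L*B)
loss = dy.sum_batches(losses * mask_flat) / flat_len           # scalar loss
print(loss.value())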
Example #21
0
    def cal_scores(self, src_encodings, predict=False):

        src_len = len(src_encodings)
        src_encodings = dy.concatenate_cols(
            src_encodings)  # src_ctx_dim, src_len, batch_size
        batch_size = src_encodings.dim()[1]

        W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
        b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
        W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
        b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

        W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
        b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
        W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
        b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

        U_arc_1 = dy.parameter(self.U_arc_1)
        u_arc_2 = dy.parameter(self.u_arc_2)

        U_label_1 = [dy.parameter(x) for x in self.U_label_1]
        u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
        u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
        b_label = [dy.parameter(x) for x in self.b_label]

        if predict:
            h_arc_head = self.leaky_ReLu(
                dy.affine_transform([
                    b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings
                ]))  # n_arc_ml_units, src_len, bs
            h_arc_dep = self.leaky_ReLu(
                dy.affine_transform(
                    [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
            h_label_head = self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_head, W_label_hidden_to_head,
                    src_encodings
                ]))
            h_label_dep = self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings
                ]))
        else:

            src_encodings = dy.dropout_dim(src_encodings, 1,
                                           self.arc_mlp_dropout)

            h_arc_head = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_arc_hidden_to_head, W_arc_hidden_to_head,
                        src_encodings
                    ])), 1,
                self.arc_mlp_dropout)  # n_arc_ml_units, src_len, bs
            h_arc_dep = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings
                    ])), 1, self.arc_mlp_dropout)
            h_label_head = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_label_hidden_to_head, W_label_hidden_to_head,
                        src_encodings
                    ])), 1, self.label_mlp_dropout)
            h_label_dep = dy.dropout_dim(
                self.leaky_ReLu(
                    dy.affine_transform([
                        b_label_hidden_to_dep, W_label_hidden_to_dep,
                        src_encodings
                    ])), 1, self.label_mlp_dropout)

        h_arc_head_transpose = dy.transpose(h_arc_head)
        h_label_head_transpose = dy.transpose(h_label_head)

        s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep,
                                                      u_arc_2)

        s_label = []
        for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2,
                                        b_label):
            e1 = h_label_head_transpose * U_1 * h_label_dep
            e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
            e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
            s_label.append(e1 + e2 + e3 + b)
        return s_arc, s_label
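The arc score above is the usual biaffine form s_arc = H_head^T (U * H_dep + u), computed over all head/dependent pairs at once via dy.colwise_add. A small stand-alone sketch with toy dimensions (parameter names here are hypothetical):

import dynet as dy
import numpy as np

hidden, sent_len = 4, 3
m = dy.ParameterCollection()
U_arc = m.add_parameters((hidden, hidden))
u_arc = m.add_parameters((hidden,))

h_head = dy.inputTensor(np.random.randn(hidden, sent_len))  # (H, L)
h_dep = dy.inputTensor(np.random.randn(hidden, sent_len))   # (H, L)

U = dy.parameter(U_arc)
u = dy.parameter(u_arc)
# colwise_add adds the bias vector u to every column of U * h_dep,
# so s_arc[i, j] is the score of head i for dependent j.
s_arc = dy.transpose(h_head) * dy.colwise_add(U * h_dep, u)
print(s_arc.dim())  # ((3, 3), 1)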
Example #22
0
    def run(self,
            char_vocab,
            cased_word_inputs,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            is_train=True):
        """
        Train or test
        :param char_vocab:
        :param cased_word_inputs: seq_len x batch_size
        :param word_inputs: seq_len x batch_size
        :param tag_inputs: seq_len x batch_size
        :param arc_targets: seq_len x batch_size
        :param rel_targets: seq_len x batch_size
        :param is_train: is training or test
        :return:
        """
        def flatten_numpy(ndarray):
            """
            Flatten nd-array to 1-d column vector
            :param ndarray:
            :return:
            """
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))  # non padding, non root token number

        if is_train or arc_targets is not None:
            mask_1D = flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
            #  with batched=True, the last dimension of the numpy array is treated as the batch dimension

        if self.char_lstm:
            # Subword model
            char_w = dy.parameter(self.char_w)

            def LSTM_attention(lstm, inputs, dropout_x=0., dropout_h=0.):
                ss = LSTM(lstm, inputs, None, dropout_x, dropout_h)
                hs = [s.h()[0] for s in ss]
                return dy.concatenate([attention(hs, char_w), ss[-1].s()[0]])

            subword_embs = []
            for char_ids in char_vocab:
                char_inputs = [
                    dy.lookup(self.char_embs, char) for char in char_ids
                ]
                subword_embs.append(
                    LSTM_attention(
                        self.char_lstm, char_inputs,
                        self.dropout_lstm_input if is_train else 0.,
                        self.dropout_lstm_hidden if is_train else 0.))
            subword_embs = dy.concatenate_cols(subword_embs)

            # trainable word emb + subword emb (+ pretrained emb when available)
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) + subword_embs *
                dy.inputTensor(one_hot(cw, len(char_vocab)).T, batched=True) +
                (0 if self.pret_word_embs is None else dy.lookup_batch(
                    self.pret_word_embs, w, update=False))
                for cw, w in zip(cased_word_inputs, word_inputs)
            ]
        else:
            # trainable word emb (+ pretrained emb when available)
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) +
                (0 if self.pret_word_embs is None else dy.lookup_batch(
                    self.pret_word_embs, w, update=False)) for w in word_inputs
            ]

        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        # Dropout
        if is_train:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]  # seq_len x batch_size

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if is_train else 0.,
                   self.dropout_lstm_hidden if is_train else 0.))
        if is_train:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
        head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if is_train:
            dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
            head = dy.dropout_dim(head, 1, self.dropout_mlp)

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        if len(arc_preds.shape) == 1:  # npvalue() drops the batch dim when batch_size == 1
            arc_preds = np.expand_dims(arc_preds, axis=1)
        # seq_len x batch_size

        if is_train or arc_targets is not None:
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if is_train else flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if is_train or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        if is_train or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if is_train:
            return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., outputs
        return outputs
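flatten_numpy uses Fortran ('F') order so that the flattened gold targets line up with the batch order produced by dy.reshape over the score tensor (all tokens of sentence 0 first, then sentence 1, and so on). A quick numpy-only check with toy values:

import numpy as np

# arc_targets[i, b] = gold head of token i in sentence b (toy values).
arc_targets = np.array([[0, 0],
                        [2, 0],
                        [0, 1]])
targets_1D = np.reshape(arc_targets, (-1,), 'F')
print(targets_1D)  # [0 2 0 0 0 1]: sentence 0's tokens first, then sentence 1's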
Example #23
0
    def __call__(self,
                 inputs,
                 masks,
                 truth,
                 iters,
                 is_train=True,
                 is_tree=True):
        sent_len = len(inputs)
        batch_size = inputs[0].dim()[1]
        flat_len = sent_len * batch_size

        print('=== entering __call__ ===')
        print('input length: ', inputs.__len__())  # input length:  46
        print('input dim: ', inputs[1].dim())  # input dim:  ((400,), 2)
        print('sent_len', sent_len)  # sent_len 46
        print('batch_size', batch_size)  # batch_size 2
        print('flat_len', flat_len)  # flat_len 92

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        X = dy.concatenate_cols(inputs)
        print('X dim: ', X.dim())  # X dim:  ((400, 46), 2)
        if is_train:
            X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)

        # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
        # ((A_H, L), B)
        head_arc = self.head_arc_MLP(X, is_train)
        dept_arc = self.dept_arc_MLP(X, is_train)
        print('head_arc dim: ', head_arc.dim())
        print('dept_arc dim: ', dept_arc.dim())
        # head_arc dim:  ((300, 46), 2)
        # dept_arc dim:  ((300, 46), 2)

        # ((R_H, L), B)
        head_rel = self.head_rel_MLP(X, is_train)
        dept_rel = self.dept_rel_MLP(X, is_train)
        print('head_rel dim: ', head_rel.dim())
        print('dept_rel dim: ', dept_rel.dim())
        # head_rel dim:  ((100, 46), 2)
        # dept_rel dim:  ((100, 46), 2)

        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
            head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
            dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
            dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)

        # ((L, L), B)

        masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))

        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        arc_norm = math.sqrt(self.arc_size)
        rel_norm = math.sqrt(self.rel_size)
        for k in range(self.cfg.GRAPH_LAYERS):
            print('----layer-----', k)
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc,
                                           dept_arc) / arc_norm - masks_2D
            arc_prob = dy.softmax(arc_mat)

            # arc_mat dim:  ((46, 46), 2)
            # arc_prob dim:  ((46, 46), 2)

            # Layer-wise Loss
            if is_train:
                arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
                # ((1,), L*B)
                print('arc_mat val', arc_mat.value())
                print('arc_mat dim', arc_mat.dim())
                print("truth['head'] value", truth['head'])
                print("truth['head'] lengt", truth['head'].__len__())

                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                print('arc_loss', arc_loss.value())
                print('arc_loss', arc_loss.dim())

                # (1,)

                arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
                print('arc_loss', arc_loss.value())
                print('arc_loss', arc_loss.dim())

                gnn_losses.append(arc_loss)
                input("pause")

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX

            print('HX dim: ', HX.dim())
            print('DX dim: ', DX.dim())
            print('FX dim: ', FX.dim())
            # HX dim:  ((300, 46), 2)
            # DX dim:  ((300, 46), 2)
            # FX dim:  ((300, 46), 2)

            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

            print('head_arc dim: ', head_arc.dim())
            print('FX_new dim: ', FX_new.dim())
            print('dept_arc dim: ', dept_arc.dim())
            # head_arc dim:  ((300, 46), 2)
            # FX_new dim:  ((300, 46), 2)
            # dept_arc dim:  ((300, 46), 2)

            # Relation Aggregation Function
            # Sync update
            # ((R_H, L), B)
            HR = head_rel * arc_prob
            DR = dept_rel * dy.transpose(arc_prob)
            FX = HR + DR
            head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
            dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel

            print('HR dim: ', HR.dim())
            print('DR dim: ', DR.dim())
            print('FX dim: ', FX.dim())
            # HR dim:  ((100, 46), 2)
            # DR dim:  ((100, 46), 2)
            # FX dim:  ((100, 46), 2)

            print('head_rel dim: ', head_rel.dim())
            print('dept_rel dim: ', dept_rel.dim())
            # head_rel dim:  ((100, 46), 2)
            # dept_rel dim:  ((100, 46), 2)

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc,
                                        dept_arc) / arc_norm - masks_2D
        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)

        print('arc_mat dim: ', arc_mat.dim())
        print('head_rel dim: ', head_rel.dim())
        print('dept_rel dim: ', dept_rel.dim())
        # arc_mat dim:  ((46,), 92)
        # head_rel dim:  ((100, 92), 1)
        # dept_rel dim:  ((100,), 92)

        if is_train:
            # ((1,), L*B)
            arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
            # ((R_H,), L*B)
            truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
            # R -> Relation Set Size
            # ((R,), L*B)
            rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, truth_rel) / rel_norm - rel_mask
            # Calculate Relation Classification Loss
            # ((1,), L*B)
            rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
            # (1,)
            rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
            # Final Total Loss with Layer-wise
            warm = [int(iters >= x) for x in self.warm_list]
            losses = (rel_loss + arc_loss) * self.cfg.LAMBDA2 * warm[-1]
            if gnn_losses:
                for i in range(self.cfg.GRAPH_LAYERS):
                    gnn_losses[i] *= warm[i]
                losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
            losses_list = gnn_losses + [arc_loss, rel_loss]
            return losses, losses_list
        else:
            if is_tree:
                # MST Inference, Achieve Tree Edge.
                arc_probs = dy.softmax(arc_mat).npvalue()
                arc_probs = np.reshape(arc_probs,
                                       (sent_len, sent_len, batch_size), 'F')
                arc_probs = np.transpose(arc_probs)
                # Mask PAD
                arc_masks = [
                    np.array(masks['flat'][i:i + sent_len])
                    for i in range(0, flat_len, sent_len)
                ]
                arc_pred = []
                # Inference One By One.
                for msk, arc_prob in zip(arc_masks, arc_probs):
                    msk[0] = 1
                    seq_len = int(np.sum(msk))
                    tmp_pred = MST_inference(arc_prob, seq_len, msk)
                    tmp_pred[0] = 0
                    arc_pred.extend(tmp_pred)
            else:
                # Greedy Inference (argmax)
                arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # Pick Predicted Edge's <Head, Dept> pair.
            flat_pred = [
                j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
            ]
            pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
            # Predict Relation (mask ROOT)
            rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, pred_rel) / rel_norm - rel_mask
            rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
            rel_pred = np.argmax(rel_mat, 0)
            pred = {}
            pred['head'], pred['rel'] = arc_pred, rel_pred
            return pred
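Both the arc and relation scores above neutralise invalid cells (padding positions, or the ROOT relation) by subtracting a large constant before the softmax, so those cells end up with essentially zero probability. A minimal illustration of the trick with a toy mask:

import dynet as dy
import numpy as np

scores = dy.inputTensor([2.0, 1.0, 0.5])
mask = np.array([1.0, 1.0, 0.0], dtype=np.float32)  # last position is invalid
neg_bias = 1e9 * (1 - dy.inputTensor(mask))

probs = dy.softmax(scores - neg_bias)
print(probs.npvalue())  # the masked cell receives ~0 probability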
Example #24
0
    def run(self,
            word_inputs,
            lengths,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            isTrain=True):
        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = (np.broadcast_to(np.reshape(np.arange(seq_len), (seq_len, 1)),
                                (seq_len, batch_size)) < lengths).astype(
                                    np.float32)
        mask[0] = 0.
        num_tokens = int(np.sum(mask))

        if isTrain or arc_targets is not None:
            mask_1D = self.dynet_flatten_numpy(mask)
            # batched here means that the last dim is treated as batch dimension, both in input and output
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        # TODO: pay attention to _words_in_train
        # sum of the two embeddings: [Expression of dim=((embedding_dim,), batch_size)] * seq_len
        if self.e_ext is not None:
            word_embs = [
                dy.lookup_batch(
                    self.e_form,
                    np.where(w < self.v_train, w,
                             self.vocab_form.stoi["<unk>"])) +
                dy.lookup_batch(self.e_ext, w, update=False)
                for w in word_inputs
            ]  # sum of the two embeddings, [Expression] * seq_len
        else:
            word_embs = [
                dy.lookup_batch(
                    self.e_form,
                    np.where(w < self.v_train, w,
                             self.vocab_form.stoi["<unk>"]))
                for w in word_inputs
            ]
        tag_embs = [dy.lookup_batch(self.e_tag, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_msk(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        top_recur = dy.concatenate_cols(
            biLSTM(self.lstm_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            # drop some dim for lstm_output for all words, all sentences
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        dep = leaky_relu(
            dy.affine_transform([self.mlp_dep_b, self.mlp_dep_W, top_recur]))
        head = leaky_relu(
            dy.affine_transform([self.mlp_head_b, self.mlp_head_W, top_recur]))
        if isTrain:
            dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
            head = dy.dropout_dim(head, 1, self.dropout_mlp)
            # dropout_dim shares the dropout mask along dimension k, so whole slices are zeroed:
            # for a batched matrix ((R, C), B), dim 0 drops whole columns, dim 1 drops whole rows,
            # dim 2 drops whole batch elements; only tensors of rank <= 3 are supported

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        arc_logits = bilinear(dep_arc,
                              self.arc_W,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # flatten it to compute loss
        # (#head ) x (#dep x batch_size)

        arc_preds = np.reshape(arc_logits.npvalue().argmax(0),
                               (seq_len, batch_size))
        # seq_len x batch_size
        # if an Expression's batch size is 1, npvalue() drops the batch dimension,
        # so it is reshaped back here when needed

        if isTrain or arc_targets is not None:
            # train with a negative log-likelihood loss, but enforce the tree constraint at test time
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            # mask filters out <pad> tokens when summing the loss
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = self.dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
            # batch_size x #dep x #head; np.transpose reverses all axes, which matches the changed layout

        rel_logits = bilinear(dep_rel,
                              self.rel_W,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=len(self.vocab_deprel),
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, len(self.vocab_deprel)),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = self.dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D  # shapes here require the 1-D mask
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (len(self.vocab_deprel), seq_len, seq_len, batch_size),
                    'F'))
            # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(
                rel_prob, sent_len, self.vocab_deprel,
                "root" if "root" in self.vocab_deprel.stoi else "ROOT")
            outputs.append(
                (arc_pred[1:sent_len], rel_pred[1:sent_len]))  # w_0 is <root>
        assert (len(outputs) == batch_size)

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
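As the comments in the example above note, dy.dropout_dim shares its dropout mask along the chosen dimension, so entire rows, columns, or batch elements are zeroed together. A small sketch that makes this visible on a toy batched matrix:

import dynet as dy
import numpy as np

# ((4, 6), 2): 4 hidden units, 6 positions, batch of 2 (toy sizes).
x = dy.inputTensor(np.ones((4, 6, 2)), batched=True)
# The mask is shared along dim 1, so each of the 4 rows is either kept
# (scaled by 1/(1-p)) or zeroed across all 6 positions.
y = dy.dropout_dim(x, 1, 0.5)
print(y.npvalue()[:, :, 0])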
Example #25
0
    def forward(self, words, extwords, tags, isTrain):
        # inputs, targets: seq_len
        seq_len = len(words)

        dynamic_embs = [dy.lookup(self.word_embs, w) for w in words]
        static_embs = [
            dy.lookup(self.pret_word_embs, w, update=False) for w in extwords
        ]
        word_embs = [
            dynamic_emb + static_emb
            for dynamic_emb, static_emb in zip(dynamic_embs, static_embs)
        ]
        tag_embs = [dy.lookup(self.tag_embs, pos) for pos in tags]

        if isTrain:
            word_masks = np.random.binomial(1, 1. - self.dropout_emb,
                                            seq_len).astype(np.float32)
            tag_masks = np.random.binomial(1, 1. - self.dropout_emb,
                                           seq_len).astype(np.float32)
            scale = 3. / (2. * word_masks + tag_masks + 1e-12)
            word_masks *= scale
            tag_masks *= scale
            word_embs = [dy.cmult(word_emb, dy.inputVector([word_mask])) \
                         for word_emb, word_mask in zip(word_embs, word_masks)]
            tag_embs = [dy.cmult(tag_emb, dy.inputVector([tag_mask])) \
                         for tag_emb, tag_mask in zip(tag_embs, tag_masks)]


        emb_inputs = [ dy.concatenate([word_emb, pos_emb]) \
                      for word_emb, pos_emb in zip(word_embs, tag_embs)]

        # (2 * lstm_hiddens) * seq_len
        bilstm_out = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))

        if isTrain:
            bilstm_out = dy.dropout_dim(bilstm_out, 1, self.dropout_mlp)

        # (mlp_arc_size + mlp_rel_size) * seq_len
        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        dep = leaky_relu(dy.affine_transform([b_dep, W_dep, bilstm_out]))

        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        head = leaky_relu(dy.affine_transform([b_head, W_head, bilstm_out]))

        if isTrain:
            dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
            head = dy.dropout_dim(head, 1, self.dropout_mlp)

        # mlp_arc_size * seq_len,  mlp_rel_size * seq_len
        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        # (#head x #dep)
        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)

        # (#head x rel_size x #dep)
        W_rel = dy.parameter(self.rel_W)
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              num_outputs=self.rel_size,
                              bias_x=True,
                              bias_y=True)

        return arc_logits, rel_logits
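The scale factor 3. / (2.*word_masks + tag_masks) used in forward rescales whatever survives the embedding dropout so that the combined word/tag contribution keeps a constant total weight (the word part counts twice because two word embeddings are summed). A tiny numpy check of the three non-degenerate cases:

import numpy as np

# (word kept, tag kept) -> scale; the weighted total 2*word + tag stays at 3.
for wm, tm in [(1., 1.), (1., 0.), (0., 1.)]:
    scale = 3. / (2. * wm + tm + 1e-12)
    print((wm, tm), round(scale, 2), round(2 * wm * scale + tm * scale, 2))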
Example #26
0
def dropout_dim_list(rep_list, dp_rate, dim=0):
    return [dy.dropout_dim(rep, dim, dp_rate) for rep in rep_list]
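A minimal usage sketch of dropout_dim_list on a list of batched matrices (toy sizes; the helper above and DyNet are the only dependencies):

import dynet as dy
import numpy as np

reps = [dy.inputTensor(np.ones((8, 5, 2)), batched=True) for _ in range(3)]
dropped = dropout_dim_list(reps, dp_rate=0.3, dim=1)
print(dropped[0].dim())  # ((8, 5), 2), with whole rows zeroed at random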
Example #27
0
    def run(self,
            word_inputs,
            lemma_inputs,
            tag_inputs,
            pred_golds,
            rel_targets=None,
            isTrain=True,
            syn_mask=None,
            seq_lens=None):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.PAD).astype(np.float32)
        num_tokens = int(np.sum(mask))

        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
            for w in word_inputs
        ]

        if self.use_lm:
            lm_embs = np.zeros((batch_size, seq_len, self.lm_dims),
                               dtype=float)
            for idx in range(batch_size):
                if self._unified:
                    txt = [
                        self._vocab.id2word(w) for w in word_inputs[1:, idx]
                        if self._vocab.id2word(w) != '<PAD>'
                    ]
                    key = ' '.join(txt)
                    key = self.lm_dict.get(key, None)
                    if key is None:
                        for sidx in range(len(self.lm_sentences)):
                            line = self.lm_sentences[sidx]
                            if len(line) != len(txt):
                                continue
                            found = True
                            for mdx in range(len(line)):
                                if line[mdx] != txt[mdx] and txt[
                                        mdx] != '<UNK>':
                                    found = False
                                    break
                            if found:
                                key = str(sidx)
                                self.lm_dict[' '.join(txt)] = key
                                break
                    assert key is not None
                    lm_embs[idx, 1:1 + len(txt), :] = self.lm_data[key][...]
                else:
                    txt = [
                        self._vocab.id2word(w) for w in word_inputs[:, idx]
                        if self._vocab.id2word(w) != '<PAD>'
                    ]
                    key = ' '.join(txt)
                    key = self.lm_dict.get(key, None)
                    if key is None:
                        for sidx in range(len(self.lm_sentences)):
                            line = self.lm_sentences[sidx]
                            if len(line) != len(txt):
                                continue
                            found = True
                            for mdx in range(len(line)):
                                if line[mdx] != txt[mdx] and txt[
                                        mdx] != '<UNK>':
                                    found = False
                                    break
                            if found:
                                key = str(sidx)
                                self.lm_dict[' '.join(txt)] = key
                                break
                    assert key is not None
                    lm_embs[idx, :len(txt), :] = self.lm_data[key][...]
            lm_embs = lm_embs.transpose(1, 2, 0)
            lm_embs = [dy.inputTensor(e, batched=True) for e in list(lm_embs)]

        pre_embs = [
            dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs
        ]
        flag_embs = [
            dy.lookup_batch(self.flag_embs, np.array(w == i + 1, dtype=np.int))
            for i, w in enumerate(pred_golds)
        ]
        if self.use_lemma:
            lemma_embs = [
                dy.lookup_batch(self.lemma_embs, lemma)
                for lemma in lemma_inputs
            ]
        if self.use_pos:
            tag_embs = [
                dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs
            ]

        if self.use_lm:
            if isTrain:
                emb_masks = self.generate_emb_mask(seq_len, batch_size)
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm),
                            dy.cmult(lme, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, lemma, pos, lme, (wm, posm) in
                        zip(word_embs, pre_embs, flag_embs, lemma_embs,
                            tag_embs, lm_embs, emb_masks)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm),
                            dy.cmult(lme, wm)
                        ]) for word, pre, flag, lemma, pos, lme, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             lemma_embs, lm_embs, emb_masks)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lme, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, pos, lme, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             tag_embs, lm_embs, emb_masks)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lme, wm)
                        ]) for word, pre, flag, lme, (wm, posm) in zip(
                            word_embs, pre_embs, flag_embs, lm_embs, emb_masks)
                    ]

            else:
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma, lme, pos])
                        for word, pre, flag, lemma, lme, pos in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs,
                            lm_embs, tag_embs)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma, lme])
                        for word, pre, flag, lemma, lme in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs,
                            lm_embs)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lme, pos])
                        for word, pre, flag, lme, pos in zip(
                            word_embs, pre_embs, flag_embs, lm_embs, tag_embs)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lme])
                        for word, pre, flag, lme in zip(
                            word_embs, pre_embs, flag_embs, lm_embs)
                    ]
        else:
            if isTrain:
                emb_masks = self.generate_emb_mask(seq_len, batch_size)
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, lemma, pos, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             lemma_embs, tag_embs, emb_masks)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm)
                        ]) for word, pre, flag, lemma, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             lemma_embs, emb_masks)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, pos, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             tag_embs, emb_masks)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm)
                        ]) for word, pre, flag, (wm, posm) in zip(
                            word_embs, pre_embs, flag_embs, emb_masks)
                    ]

            else:
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma, pos])
                        for word, pre, flag, lemma, pos in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs,
                            tag_embs)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma])
                        for word, pre, flag, lemma in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, pos])
                        for word, pre, flag, pos in zip(
                            word_embs, pre_embs, flag_embs, tag_embs)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([word, pre,
                                        flag]) for word, pre, flag in zip(
                                            word_embs, pre_embs, flag_embs)
                    ]

        if self.encoder_type == 'rnn':
            top_recur = dy.concatenate_cols(
                biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                       self.dropout_lstm_input if isTrain else 0.,
                       self.dropout_lstm_hidden if isTrain else 0.))
        else:

            emb_inputs = dy.concatenate_cols(emb_inputs)

            emb_inputs = emb_inputs * math.sqrt(self.input_dims)

            emb_inputs = emb_inputs + dy.transpose(
                dy.inputTensor(self.pe[:seq_len]))

            emb_inputs = dy.transpose(emb_inputs)

            encoder_outputs = self.transformer(emb_inputs,
                                               src_len=seq_lens,
                                               train=isTrain)

            top_recur = encoder_outputs.output

            top_recur = dy.concatenate_cols(top_recur)

            #print(top_recur.dim())

        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_arg, b_arg = self.mlp_arg_W.expr(), self.mlp_arg_b.expr()
        # equivalent to dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b)
        W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(
            self.mlp_pred_b)
        arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur]))
        # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur]))
        predicates_1D = pred_golds[0]
        pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1)
        pred_hidden = leaky_relu(
            dy.affine_transform([b_pred, W_pred, pred_recur]))
        if isTrain:
            arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp)
            # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp)
            pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp)

        W_rel = dy.parameter(self.rel_W)

        # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, batch_size,
        # 						num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
        # # (#pred x rel_size x #arg) x batch_size

        # flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size)
        # # (#pred x rel_size) x (#arg x batch_size)

        # predicates_1D = dynet_flatten_numpy(pred_golds)
        # partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # # (rel_size) x (#arg x batch_size)

        if self.use_si_droput and syn_mask is not None:
            syn_mask = np.expand_dims(syn_mask,
                                      axis=0)  # (1, seq_len, batch_size)
            arg_hidden = dy.cmult(arg_hidden,
                                  dy.inputTensor(syn_mask, batched=True))

        rel_logits = bilinear(arg_hidden,
                              W_rel,
                              pred_hidden,
                              self.mlp_size,
                              seq_len,
                              1,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # if self.use_biaffine:
        # 	rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size,
        # 							num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
        # else:
        # 	pred_hidden = dy.reshape(pred_hidden, (self.mlp_size, 1), batch_size)
        # 	preds_hidden = [pred_hidden for _ in xrange(seq_len)]
        # 	preds_hidden = dy.concatenate(preds_hidden, d=1)
        # 	rel_hidden = dy.concatenate([preds_hidden, arg_hidden], d=0)  # (2*mlp_size x seq_len) x batch_size
        # 	flat_rel_hidden = dy.reshape(rel_hidden, (self.mlp_size*2, ), seq_len * batch_size)

        # 	W_ffn_layer1 = dy.parameter(self.ffn_layer1_W)
        # 	b_ffn_layer1 = dy.parameter(self.ffn_layer1_b)
        # 	W_ffn_layer2 = dy.parameter(self.ffn_layer2_W)
        # 	b_ffn_layer2 = dy.parameter(self.ffn_layer2_b)

        # 	flat_rel_hidden = leaky_relu(dy.affine_transform([b_ffn_layer1, W_ffn_layer1, flat_rel_hidden]))
        # 	flat_rel_hidden = leaky_relu(dy.affine_transform([b_ffn_layer2, W_ffn_layer2, flat_rel_hidden]))
        # 	flat_rel_hidden = W_rel * flat_rel_hidden
        # 	rel_logits = dy.reshape(flat_rel_hidden, (1, self._vocab.rel_size, seq_len), batch_size)

        # (1 x rel_size x #arg) x batch_size
        flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (1 x rel_size) x (#arg x batch_size)

        predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0],
                                 dtype=int)
        partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # (rel_size) x (#arg x batch_size)

        if isTrain:
            mask_1D = dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
            return rel_accuracy, rel_loss

        # rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
        # 									(self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))

        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, 1, seq_len, batch_size), 'F'))
        outputs = []

        # for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
        # 	msk[0] = 1.
        # 	sent_len = int(np.sum(msk))
        # 	rel_prob = rel_prob[np.arange(len(pred_gold)), pred_gold]
        # 	rel_pred = rel_argmax(rel_prob)
        # 	outputs.append(rel_pred[:sent_len])

        for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T,
                                            rel_probs):
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            rel_prob = rel_prob[np.arange(len(pred_gold)), 0]
            rel_pred = rel_argmax(rel_prob)
            outputs.append(rel_pred[:sent_len])

        return outputs
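The example above picks each sentence's predicate column out of the encoder output with dy.pick_batch(top_recur, predicates_1D, dim=1). A stand-alone sketch of that selection on a toy tensor (indices are hypothetical):

import dynet as dy
import numpy as np

hidden, seq_len, batch_size = 4, 5, 2
top_recur = dy.inputTensor(
    np.random.randn(hidden, seq_len, batch_size), batched=True)  # ((H, L), B)

predicates = [3, 1]  # predicate position for each sentence in the batch
pred_recur = dy.pick_batch(top_recur, predicates, dim=1)
print(pred_recur.dim())  # ((4,), 2): one hidden column per batch element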