Example #1
    def decode_loss(self, src_encodings, tgt_seqs):
        """
        :param tgt_seqs: (tgt_heads, tgt_labels), each a list (length=batch_size) of index sequences of length src_len
        """

        # TODO(NOTE): sentences should start with an empty token (the root of the dependency tree)!

        tgt_heads, tgt_labels = tgt_seqs

        src_len = len(tgt_heads[0])
        batch_size = len(tgt_heads)
        np_tgt_heads = np.array(tgt_heads).flatten()  # (src_len * batch_size)
        np_tgt_labels = np.array(tgt_labels).flatten()
        s_arc, s_label = self.cal_scores(src_encodings)  # (src_len, src_len, bs), ([(src_len, src_len, bs)])

        s_arc_value = s_arc.npvalue()
        s_arc_choice = np.argmax(s_arc_value, axis=0).transpose().flatten()  # (src_len * batch_size)

        s_pick_labels = [dy.pick_batch(dy.reshape(score, (src_len,), batch_size=src_len * batch_size), s_arc_choice)
                         for score in s_label]
        s_argmax_labels = dy.concatenate(s_pick_labels, d=0)  # n_labels, src_len * batch_size

        reshape_s_arc = dy.reshape(s_arc, (src_len,), batch_size=src_len * batch_size)
        arc_loss = dy.pickneglogsoftmax_batch(reshape_s_arc, np_tgt_heads)
        label_loss = dy.pickneglogsoftmax_batch(s_argmax_labels, np_tgt_labels)

        loss = dy.sum_batches(arc_loss + label_loss) / batch_size
        return loss
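The key step in this example is the dy.reshape call: each sentence's (src_len x src_len) arc-score matrix is split so that every dependent's column of head scores becomes its own batch element, and a single dy.pickneglogsoftmax_batch call then scores all gold heads at once. A minimal, self-contained sketch of just that flattening, using made-up scores and toy dimensions instead of the parser's real cal_scores output:

import numpy as np
import dynet as dy

src_len, batch_size = 4, 2
dy.renew_cg()

# Pretend arc scores: rows index candidate heads, columns index dependents.
s_arc = dy.inputTensor(np.random.randn(src_len, src_len, batch_size), batched=True)

# Flatten so each dependent's column of head scores is its own batch element:
# ((src_len, src_len), batch_size) -> ((src_len,), src_len * batch_size).
flat = dy.reshape(s_arc, (src_len,), batch_size=src_len * batch_size)

# One gold head index per (dependent, sentence) pair.
gold_heads = np.random.randint(0, src_len, size=src_len * batch_size)

arc_loss = dy.sum_batches(dy.pickneglogsoftmax_batch(flat, gold_heads)) / batch_size
print(arc_loss.value())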
Example #2
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        # feed the previous target word's embedding into the decoder state
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
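The mask handling above is an idiom that recurs in several of the later examples: dy.pickneglogsoftmax_batch returns one scalar per batch element, so multiplying it by a (1,)-shaped expression whose batch dimension carries 0/1 flags zeroes out the loss of sentences that have already ended before the sum. A minimal sketch of that single step, with made-up scores and mask values:

import numpy as np
import dynet as dy

dy.renew_cg()
vocab, batch = 5, 3

# Fake per-step logits over the vocabulary, one column per sentence.
scores = dy.inputTensor(np.random.randn(vocab, batch), batched=True)
next_words = [2, 0, 4]   # gold next word for each sentence
mask = [1, 1, 0]         # the third sentence has already ended

loss = dy.pickneglogsoftmax_batch(scores, next_words)      # one scalar per sentence
mask_expr = dy.reshape(dy.inputVector(mask), (1,), batch)  # 0/1 flags on the batch dim
masked_loss = loss * mask_expr                             # third sentence contributes 0

print(dy.sum_batches(masked_loss).value())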
Example #3
def calc_sent_loss(sent, dropout=0.0):
  # Create a computation graph
  dy.renew_cg()
  # The initial history is equal to end of sentence symbols
  hist = [S] * N
  # Step through the sentence, including the end of sentence token
  all_histories = []
  all_targets = []
  for next_word in sent + [S]:
    all_histories.append(list(hist))
    all_targets.append(next_word)
    hist = hist[1:] + [next_word]
  s = calc_score_of_histories(all_histories, dropout=dropout)
  return dy.sum_batches(dy.pickneglogsoftmax_batch(s, all_targets))
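For reference, the batched call in the last line is equivalent to computing dy.pickneglogsoftmax separately for each history and summing; the batched version just builds a single graph node instead of one per history. A small sketch with toy dimensions and random scores that checks the two against each other:

import numpy as np
import dynet as dy

dy.renew_cg()
nwords, batch = 10, 4
scores = dy.inputTensor(np.random.randn(nwords, batch), batched=True)
targets = [3, 1, 7, 0]

# One batched NLL node for all histories at once.
batched = dy.sum_batches(dy.pickneglogsoftmax_batch(scores, targets))

# Unbatched equivalent: one node per history, then sum.
cols = scores.npvalue()  # (nwords, batch)
unbatched = dy.esum([dy.pickneglogsoftmax(dy.inputVector(cols[:, i].tolist()), t)
                     for i, t in enumerate(targets)])

print(batched.value(), unbatched.value())  # should agree up to float precision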
Example #4
    def BuildLMGraph(self, sents):
        dy.renew_cg()
        # initialize the RNN
        init_state = self.builder.initial_state()
        # parameters -> expressions
        R = dy.parameter(self.R)
        bias = dy.parameter(self.bias)

        S = vocab.w2i["<s>"]
        # get the cids and masks for each step
        tot_chars = 0
        cids = []
        masks = []

        for i in range(len(sents[0])):
            cids.append([(vocab.w2i[sent[i]] if len(sent) > i else S) for sent in sents])
            mask = [(1 if len(sent)>i else 0) for sent in sents]
            masks.append(mask)
            tot_chars += sum(mask)

        # start the rnn with "<s>"
        init_ids = cids[0]
        s = init_state.add_input(dy.lookup_batch(self.lookup, init_ids))

        losses = []

        # feed char vectors into the RNN and predict the next char
        for cid, mask in zip(cids[1:], masks[1:]):
            score = dy.affine_transform([bias, R, s.output()])
            loss = dy.pickneglogsoftmax_batch(score, cid)
            # mask the loss if at least one sentence is shorter
            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1,), len(sents))
                loss = loss * mask_expr

            losses.append(loss)
            # update the state of the RNN
            cemb = dy.lookup_batch(self.lookup, cid)
            s = s.add_input(cemb)

        return dy.sum_batches(dy.esum(losses)), tot_chars
Example #5
    def calc_loss(self, mlp_dec_state, ref_action):
        scores = self.get_scores(mlp_dec_state)

        if self.label_smoothing == 0.0:
            # single mode
            if not xnmt.batcher.is_batched(ref_action):
                return dy.pickneglogsoftmax(scores, ref_action)
            # minibatch mode
            else:
                return dy.pickneglogsoftmax_batch(scores, ref_action)

        else:
            log_prob = dy.log_softmax(scores)
            if not xnmt.batcher.is_batched(ref_action):
                pre_loss = -dy.pick(log_prob, ref_action)
            else:
                pre_loss = -dy.pick_batch(log_prob, ref_action)

            ls_loss = -dy.mean_elems(log_prob)
            loss = ((1 - self.label_smoothing) *
                    pre_loss) + (self.label_smoothing * ls_loss)
            return loss
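The label-smoothing branch interpolates the ordinary negative log-likelihood of the reference with the mean negative log-probability over all classes: loss = (1 - eps) * (-log p(ref)) + eps * mean_k(-log p(k)). A minimal batched sketch of that computation with made-up scores (the class count and eps are arbitrary placeholders):

import numpy as np
import dynet as dy

dy.renew_cg()
n_classes, batch, eps = 6, 2, 0.1
scores = dy.inputTensor(np.random.randn(n_classes, batch), batched=True)
refs = [4, 0]

log_prob = dy.log_softmax(scores)
pre_loss = -dy.pick_batch(log_prob, refs)  # standard NLL, one value per batch element
ls_loss = -dy.mean_elems(log_prob)         # mean negative log-prob over all classes
loss = (1 - eps) * pre_loss + eps * ls_loss
print(dy.sum_batches(loss).value())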
Example #6
    def update_batch(self, words_batch, tags_batch):
        dynet.renew_cg()
        length = max(len(words) for words in words_batch)
        word_ids = np.zeros((length, len(words_batch)), dtype='int32')
        for j, words in enumerate(words_batch):
            for i, word in enumerate(words):
                word_ids[i, j] = self.vw.w2i.get(word, self.UNK)
        tag_ids = np.zeros((length, len(words_batch)), dtype='int32')
        for j, tags in enumerate(tags_batch):
            for i, tag in enumerate(tags):
                tag_ids[i, j] = self.vt.w2i.get(tag, self.UNK)
        wembs = [
            dynet.lookup_batch(self._E, word_ids[i]) for i in range(length)
        ]
        wembs = [dynet.noise(we, 0.1) for we in wembs]

        f_state = self._fwd_lstm.initial_state()
        b_state = self._bwd_lstm.initial_state()

        fw = [x.output() for x in f_state.add_inputs(wembs)]
        bw = [x.output() for x in b_state.add_inputs(reversed(wembs))]

        H = dynet.parameter(self._pH)
        O = dynet.parameter(self._pO)

        errs = []
        for i, (f, b) in enumerate(zip(fw, reversed(bw))):
            f_b = dynet.concatenate([f, b])
            r_t = O * (dynet.tanh(H * f_b))
            err = dynet.pickneglogsoftmax_batch(r_t, tag_ids[i])
            errs.append(dynet.sum_batches(err))
        sum_errs = dynet.esum(errs)
        squared = -sum_errs  # * sum_errs
        losses = sum_errs.scalar_value()
        sum_errs.backward()
        self._sgd.update()
        return losses
Example #7
def calc_lm_loss(sents):
    dy.renew_cg()

    # initialize the RNN
    f_init = RNN.initial_state()

    # get the wids and masks for each step
    tot_words = 0
    wids = []
    masks = []
    for i in range(len(sents[0])):
        wids.append([(sent[i] if len(sent) > i else S) for sent in sents])
        mask = [(1 if len(sent) > i else 0) for sent in sents]
        masks.append(mask)
        tot_words += sum(mask)

    # start the rnn by inputting "<s>"
    init_ids = [S] * len(sents)
    s = f_init.add_input(dy.lookup_batch(WORDS_LOOKUP, init_ids))

    # feed word vectors into the RNN and predict the next word
    losses = []
    for wid, mask in zip(wids, masks):
        # calculate the softmax and loss
        score = dy.affine_transform([b_exp, W_exp, s.output()])
        loss = dy.pickneglogsoftmax_batch(score, wid)
        # mask the loss if at least one sentence is shorter
        if mask[-1] != 1:
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1,), len(sents))
            loss = loss * mask_expr
        losses.append(loss)
        # update the state of the RNN
        wemb = dy.lookup_batch(WORDS_LOOKUP, wid)
        s = s.add_input(wemb)

    return dy.sum_batches(dy.esum(losses)), tot_words
Example #8
File: scorers.py Project: gmwe/xnmt
    def calc_loss(self, x: dy.Expression,
                  y: Union[int, List[int]]) -> dy.Expression:

        scores = self.calc_scores(x)

        if self.label_smoothing == 0.0:
            # single mode
            if not batchers.is_batched(y):
                loss = dy.pickneglogsoftmax(scores, y)
            # minibatch mode
            else:
                loss = dy.pickneglogsoftmax_batch(scores, y)
        else:
            log_prob = dy.log_softmax(scores)
            if not batchers.is_batched(y):
                pre_loss = -dy.pick(log_prob, y)
            else:
                pre_loss = -dy.pick_batch(log_prob, y)

            ls_loss = -dy.mean_elems(log_prob)
            loss = ((1 - self.label_smoothing) *
                    pre_loss) + (self.label_smoothing * ls_loss)

        return loss
Example #9
    def run(self, word_inputs, lemma_inputs, tag_inputs, pred_golds, rel_targets=None, isTrain=True):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1,), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        marker = self._vocab.PAD if self._unified else self._vocab.DUMMY
        mask = np.greater(word_inputs, marker).astype(np.float32)
        num_tokens = int(np.sum(mask))

        word_embs = [dy.lookup_batch(self.word_embs,
                                     np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)
                                     ) for w in word_inputs]
        pre_embs = [dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs]
        flag_embs = [dy.lookup_batch(self.flag_embs,
                                     np.array(w == i + 1, dtype=np.int)
                                     ) for i, w in enumerate(pred_golds)]
        lemma_embs = [dy.lookup_batch(self.lemma_embs, lemma) for lemma in lemma_inputs]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [dy.concatenate([dy.cmult(word, wm), dy.cmult(pre, wm), dy.cmult(flag, wm),
                                          dy.cmult(lemma, wm), dy.cmult(pos, posm)])
                          for word, pre, flag, lemma, pos, (wm, posm) in
                          zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs, emb_masks)]

        else:
            emb_inputs = [dy.concatenate([word, pre, flag, lemma, pos])
                          for word, pre, flag, lemma, pos in
                          zip(word_embs, pre_embs, flag_embs, lemma_embs, tag_embs)]

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_arg, b_arg = dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b)
        W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(self.mlp_pred_b)
        arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur]))
        # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur]))
        predicates_1D = pred_golds[0]
        pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1)
        pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, pred_recur]))
        if isTrain:
            arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp)
            # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp)
            pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp)

        W_rel = dy.parameter(self.rel_W)

        # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, batch_size,
        # 						num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
        # # (#pred x rel_size x #arg) x batch_size

        # flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size)
        # # (#pred x rel_size) x (#arg x batch_size)

        # predicates_1D = dynet_flatten_numpy(pred_golds)
        # partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # # (rel_size) x (#arg x batch_size)

        rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size,
                              num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
        # (1 x rel_size x #arg) x batch_size
        flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size), seq_len * batch_size)
        # (1 x rel_size) x (#arg x batch_size)

        predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0])
        partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # (1 x rel_size) x (#arg x batch_size)

        if isTrain:
            mask_1D = dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
            return rel_accuracy, rel_loss

        # rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
        # 									(self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))

        rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                                            (self._vocab.rel_size, 1, seq_len, batch_size), 'F'))
        outputs = []

        # for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
        # 	msk[0] = 1.
        # 	sent_len = int(np.sum(msk))
        # 	rel_prob = rel_prob[np.arange(len(pred_gold)), pred_gold]
        # 	rel_pred = rel_argmax(rel_prob)
        # 	outputs.append(rel_pred[:sent_len])

        for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            rel_prob = rel_prob[np.arange(len(pred_gold)), 0]
            rel_pred = rel_argmax(rel_prob)
            outputs.append(rel_pred[:sent_len])

        return outputs
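The dynet_flatten_numpy helper is what keeps the numpy-side targets aligned with the DyNet-side batching: dy.reshape folds the argument dimension into the batch, and a column-major ('F' order) flatten of the (seq_len, batch_size) target arrays matches that layout. A reduced sketch of the reshape / pick_batch / pickneglogsoftmax_batch chain with random scores and toy dimensions (no bilinear scorer, and all names here are placeholders):

import numpy as np
import dynet as dy

def dynet_flatten_numpy(ndarray):
    # Column-major flatten so (seq_len, batch_size) numpy arrays line up
    # with the (seq_len * batch_size)-way DyNet batch created below.
    return np.reshape(ndarray, (-1,), 'F')

dy.renew_cg()
seq_len, n_rel, batch = 3, 5, 2

# Pretend (#pred x rel_size x #arg) relation scores for each sentence.
rel_logits = dy.inputTensor(
    np.random.randn(seq_len, n_rel, seq_len, batch), batched=True)

# Fold the argument dimension into the batch:
# ((#pred, rel_size, #arg), batch) -> ((#pred, rel_size), #arg * batch).
flat = dy.reshape(rel_logits, (seq_len, n_rel), seq_len * batch)

# One gold predicate position and one gold relation per (argument, sentence).
pred_golds = np.random.randint(0, seq_len, size=(seq_len, batch))
rel_targets = np.random.randint(0, n_rel, size=(seq_len, batch))

partial = dy.pick_batch(flat, dynet_flatten_numpy(pred_golds))  # ((rel_size,), #arg * batch)
losses = dy.pickneglogsoftmax_batch(partial, dynet_flatten_numpy(rel_targets))
print(dy.sum_batches(losses).value())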
Example #10
    def run(self,
            word_inputs,
            lengths,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            isTrain=True):
        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = (np.broadcast_to(np.reshape(np.arange(seq_len), (seq_len, 1)),
                                (seq_len, batch_size)) < lengths).astype(
                                    np.float32)
        mask[0] = 0.
        num_tokens = int(np.sum(mask))

        if isTrain or arc_targets is not None:
            mask_1D = self.dynet_flatten_numpy(mask)
            # batched here means that the last dim is treated as batch dimension, both in input and output
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        # TODO: note the handling of _words_in_train
        # sum of the two embeddings, [Expression of dim=((embedding_dim,), batch_size)] * seq_len
        if self.e_ext is not None:
            word_embs = [
                dy.lookup_batch(
                    self.e_form,
                    np.where(w < self.v_train, w,
                             self.vocab_form.stoi["<unk>"])) +
                dy.lookup_batch(self.e_ext, w, update=False)
                for w in word_inputs
            ]  # sum of the two embeddings, [Expression] * seq_len
        else:
            word_embs = [
                dy.lookup_batch(
                    self.e_form,
                    np.where(w < self.v_train, w,
                             self.vocab_form.stoi["<unk>"]))
                for w in word_inputs
            ]
        tag_embs = [dy.lookup_batch(self.e_tag, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_msk(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        top_recur = dy.concatenate_cols(
            biLSTM(self.lstm_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            # drop some dim for lstm_output for all words, all sentences
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        dep = leaky_relu(
            dy.affine_transform([self.mlp_dep_b, self.mlp_dep_W, top_recur]))
        head = leaky_relu(
            dy.affine_transform([self.mlp_head_b, self.mlp_head_W, top_recur]))
        if isTrain:
            dep, head = dy.dropout_dim(dep, 1,
                                       self.dropout_mlp), dy.dropout_dim(
                                           head, 1, self.dropout_mlp)
            # dropout_dim on dim k may zero out entire slices along dim k:
            # for a batched matrix ((R, C), B), dropping dim 0 drops columns,
            # dropping dim 1 drops rows, and dropping dim 2 drops whole batch
            # elements; it is only supported for tensors of rank <= 3

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        arc_logits = bilinear(dep_arc,
                              self.arc_W,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # flatten it to compute loss
        # (#head ) x (#dep x batch_size)

        arc_preds = np.reshape(arc_logits.npvalue().argmax(0),
                               (seq_len, batch_size))
        # seq_len x batch_size
        # here if an Expression's batch size is 1
        # npvalue() will drop the batch dimension
        # so add it back if needed

        if isTrain or arc_targets is not None:
            # train with a negative log-likelihood loss, but enforce the tree constraint when testing
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            # mask is used to filter <pad>'s out in summing loss
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = self.dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
            # batch_size x #dep x #head; np.transpose reverses all axes, which is fine here since the 'F'-order reshape already changed the layout

        rel_logits = bilinear(dep_rel,
                              self.rel_W,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=len(self.vocab_deprel),
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, len(self.vocab_deprel)),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = self.dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D  # shapes are flattened here, so the flat mask_1D is needed
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (len(self.vocab_deprel), seq_len, seq_len, batch_size),
                    'F'))
            # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(
                rel_prob, sent_len, self.vocab_deprel,
                "root" if "root" in self.vocab_deprel.stoi else "ROOT")
            outputs.append(
                (arc_pred[1:sent_len], rel_pred[1:sent_len]))  # w_0 is <root>
        assert (len(outputs) == batch_size)

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
Example #11
 def loss(self, input_, y):
     if self.batched:
         return dy.pickneglogsoftmax_batch(input_, y)
     return dy.pickneglogsoftmax(input_, y)
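This is the smallest wrapper in the set: it only dispatches between the single-instance and minibatch versions of the same loss. A toy usage sketch of both branches (arbitrary scores and labels):

import numpy as np
import dynet as dy

dy.renew_cg()
n_classes = 4

# Unbatched: one score vector, one integer label.
single = dy.inputVector([0.1, 0.9, -0.3, 0.2])
print(dy.pickneglogsoftmax(single, 1).value())

# Batched: one column of scores per instance, one label per instance.
batched = dy.inputTensor(np.random.randn(n_classes, 3), batched=True)
print(dy.sum_batches(dy.pickneglogsoftmax_batch(batched, [2, 0, 3])).value())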
Example #12
    def calculate_batch_loss(self, batch):
        dy.renew_cg()

        W_y = dy.parameter(self.params["W_y"])
        b_y = dy.parameter(self.params["b_y"])
        s_lookup = self.params["s_lookup"]
        t_lookup = self.params["t_lookup"]

        s_batch = [x[0] for x in batch]
        t_batch = [x[1] for x in batch]

        wids = []

        for i in range(len(s_batch[0])):
            wids.append([sent[i] for sent in s_batch])

        wids_rev = list(reversed(wids))

        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []

        for wid in wids:
            l2r_state = l2r_state.add_input(dy.lookup_batch(s_lookup, wid))
            l2r_contexts.append(l2r_state.output())

        for wid in wids_rev:
            r2l_state = r2l_state.add_input(dy.lookup_batch(s_lookup, wid))
            r2l_contexts.append(r2l_state.output())

        r2l_contexts.reverse()

        losses = []

        H_f = []
        H_f = [dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)]

        H_f_mat = dy.concatenate_cols(H_f)
        W1_att = dy.parameter(self.params["W1_att"])
        w1dt = W1_att * H_f_mat

        t_wids = []
        masks = []

        num_words = 0

        for i in range(len(t_batch[0])):
            t_wids.append([(sent[i] if len(sent) > i else self.t_vocab[EOS]) for sent in t_batch])
            mask = [(1 if len(sent) > i else 0) for sent in t_batch]
            masks.append(mask)
            num_words += sum(mask)

        c_t = dy.vecInput(2*self.HIDDEN_DIM)

        words = [self.t_vocab[EOS]] * len(t_batch)
        embedding = dy.lookup_batch(t_lookup, words)

        dec_state = self.dec_builder.initial_state()

        for t_wid, mask in zip(t_wids, masks):
            x_t = dy.concatenate([c_t, embedding])
            dec_state = dec_state.add_input(x_t)

            c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_batch[0]), len(wids[0]))

            probs = dy.affine_transform([b_y, W_y, dy.concatenate([c_t, dec_state.output()])])
            loss = dy.pickneglogsoftmax_batch(probs, t_wid)

            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1,), len(t_batch))
                loss = loss * mask_expr

            losses.append(loss)
            embedding = dy.lookup_batch(t_lookup, t_wid)

        loss = dy.sum_batches(dy.esum(losses))  # /len(wids[0])
        return loss, num_words
Example #13
    def static_train(self,\
                    train_treebank,\
                    validation_treebank,\
                    lr=0.001,\
                    hidden_dropout=0.01,\
                    batch_size=64,\
                    max_epochs=200,\
                    max_lexicon_size=9998,\
                    glove_file=None):
        """
        Locally trains a model with a static oracle and a multi-task standard feedforward NN.  
        @param train_treebank      : a list of dependency trees
        @param validation_treebank : a list of dependency trees
        @param lr                  : learning rate
        @param hidden_dropout      : dropout on hidden layer
        @param batch_size          : size of mini batches
        @param max_epochs          : max number of epochs
        @param max_lexicon_size    : max number of entries in the lexicon
        @param glove_file          : file where to find pre-trained word embeddings   
        """
        print("Encoding dataset from %d trees."%len(train_treebank))

        #(1) build dictionaries
        self.code_symbols(train_treebank,lexicon_size = max_lexicon_size)

        #(2) encode data sets
        lex_train_gen , struct_train_gen  = self.make_data_generators(train_treebank,batch_size)
        lex_dev_gen   , struct_dev_gen    = self.make_data_generators(validation_treebank,batch_size)
        
        print(self,flush=True)
        print("epochs %d\nstructural training examples  [N] = %d\nlexical training examples  [N] = %d\nBatch size = %d\nDropout = %f\nlearning rate = %f"%(max_epochs,struct_train_gen.N,lex_train_gen.N,batch_size,hidden_dropout,lr),flush=True)

        #(3) make network
        self.model = dy.ParameterCollection()
        self.hidden_weights   = self.model.add_parameters((self.hidden_size,self.embedding_size*self.input_length))
        self.action_weights   = self.model.add_parameters((self.actions_size,self.hidden_size))
        if glove_file is None:
            self.input_embeddings  = self.model.add_parameters((self.lexicon_size,self.embedding_size))
        else:
            self.input_embeddings  = self.model.parameters_from_numpy(self.read_glove_embeddings(glove_file))
        if not self.tied:
            self.output_embeddings = self.model.add_parameters((self.lexicon_size,self.hidden_size))

        #(4) fitting
        lex_gen       = lex_train_gen.next_batch()
        struct_gen    = struct_train_gen.next_batch()
        max_batches = max( lex_train_gen.get_num_batches(), struct_train_gen.get_num_batches() )
        print(lex_train_gen.get_num_batches(), struct_train_gen.get_num_batches(),flush=True)
        
        lex_valid_gen       = lex_dev_gen.next_batch()
        struct_valid_gen    = struct_dev_gen.next_batch()
        
        min_nll = float('inf')
        trainer = dy.AdamTrainer(self.model,alpha=lr)
        history_log = []
        for e in range(max_epochs):
            struct_loss,lex_loss = 0,0
            struct_N,lex_N       = 0,0
            start_t = time.time()
            for b in range(max_batches):
                #struct
                X_struct,Y_struct = next(struct_gen)
                #question of proportions : should struct and lex be evenly sampled or not (??):
                #here the parity oversamples approx twice the lexical actions
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                A = dy.parameter(self.action_weights)
                batched_X        = zip(*X_struct)  #transposes the X matrix                           
                lookups          = [dy.pick_batch(E,xcolumn) for xcolumn in batched_X]
                xdense           = dy.concatenate(lookups)
                ybatch_preds     = dy.pickneglogsoftmax_batch(A * dy.dropout(dy.tanh( W * xdense ),hidden_dropout),Y_struct)
                loss             = dy.sum_batches(ybatch_preds)
                struct_N         += len(Y_struct)
                struct_loss      += loss.value()
                loss.backward()
                trainer.update()
                #lex
                X_lex,Y_lex = next(lex_gen)
                if self.tied:
                    dy.renew_cg()
                    W = dy.parameter(self.hidden_weights)
                    E = dy.parameter(self.input_embeddings)
                    batched_X        = zip(*X_lex) #transposes the X matrix
                    lookups          = [dy.pick_batch(E,xcolumn) for xcolumn in batched_X]
                    xdense           = dy.concatenate(lookups)
                    ybatch_preds     = dy.pickneglogsoftmax_batch(E * dy.dropout(dy.tanh( W * xdense ),hidden_dropout),Y_lex)
                    loss             = dy.sum_batches(ybatch_preds)
                else:
                    dy.renew_cg()
                    W = dy.parameter(self.hidden_weights)
                    E = dy.parameter(self.input_embeddings)
                    O = dy.parameter(self.output_embeddings)
                    batched_X        = zip(*X_lex) #transposes the X matrix
                    lookups          = [dy.pick_batch(E,xcolumn) for xcolumn in batched_X]
                    xdense           = dy.concatenate(lookups)
                    ybatch_preds     = dy.pickneglogsoftmax_batch(O * dy.dropout(dy.tanh( W * xdense ),hidden_dropout),Y_lex)
                    loss             = dy.sum_batches(ybatch_preds)
                lex_N            += len(Y_lex)
                lex_loss         += loss.value()
                loss.backward()
                trainer.update()
            end_t = time.time()
            # (5) validation
            X_lex_valid,Y_lex_valid = lex_dev_gen.batch_all()
            lex_valid_nll           = -sum(self.predict_logprobs(X_lex_valid,Y_lex_valid,structural=False))
            
            X_struct_valid,Y_struct_valid = struct_dev_gen.batch_all()
            struct_valid_nll              = -sum(self.predict_logprobs(X_struct_valid,Y_struct_valid,structural=True))
            
            history_log.append((e,end_t-start_t,\
                                exp(lex_loss/lex_N),\
                                exp(struct_loss/struct_N),\
                                exp(lex_valid_nll/lex_dev_gen.N),\
                                exp(struct_valid_nll/struct_dev_gen.N),\
                                exp((lex_valid_nll+struct_valid_nll) /(struct_dev_gen.N+lex_dev_gen.N))))
            print('Epoch %d (%.2f sec.) TRAIN:: PPL_lex = %f, PPL_struct = %f / VALID:: PPL_lex = %f, PPL_struct = %f, PPL_all = %f'%tuple(history_log[-1]),flush=True)
            if lex_valid_nll + struct_valid_nll < min_nll:
                min_nll = lex_valid_nll + struct_valid_nll
                df = pd.DataFrame(history_log,columns=['epoch','wall_time','ppl_lex_train','ppl_struct_train','ppl_lex_valid','ppl_struct_valid','ppl_all_valid'])
                self.save_model('best_model_dump',epoch = e, learning_curve=df)
            
        return pd.DataFrame(history_log,columns=['epoch','wall_time','ppl_lex_train','ppl_struct_train','ppl_lex_valid','ppl_struct_valid','ppl_all_valid'])
Example #14
def train(args, network, train_batches, dev_batches, log=None):
    """Estimate model parameters on `train_batches`
    with early stopping on`dev_batches`"""
    # Logger
    log = log or util.Logger(verbose=args.verbose, flush=True)
    # Optimizer
    trainer = dy.AdamTrainer(network.pc, alpha=args.lr)
    # Start training
    log("Starting training")
    best_accuracy = 0
    deadline = 0
    running_nll = n_processed = 0
    report_every = ceil(len(train_batches) / 10)
    # Start training
    for epoch in range(1, args.n_epochs + 1):
        # Time the epoch
        start_time = time.time()
        for batch, y in train_batches:
            # Renew the computation graph
            dy.renew_cg()
            # Initialize layers
            network.init(test=False, update=True)
            # Compute logits
            logits = network(batch)
            # Loss function
            nll = dy.mean_batches(dy.pickneglogsoftmax_batch(logits, y))
            # Backward pass
            nll.backward()
            # Update the parameters
            trainer.update()
            # Keep track of the nll
            running_nll += nll.value() * batch.batch_size
            n_processed += batch.batch_size
            # Print the current loss from time to time
            if train_batches.just_passed_multiple(report_every):
                avg_nll = running_nll / n_processed
                log(f"Epoch {epoch}@{train_batches.percentage_done():.0f}%: "
                    f"NLL={avg_nll:.3f}")
                running_nll = n_processed = 0
        # End of epoch logging
        avg_nll = running_nll / n_processed
        log(f"Epoch {epoch}@100%: NLL={avg_nll:.3f}")
        log(f"Took {time.time()-start_time:.1f}s")
        log("=" * 20)
        # Validate
        accuracy = evaluate(args, network, dev_batches)
        # Print final result
        log(f"Dev accuracy: {accuracy*100:.2f}%")
        # Early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            dynn.io.save(network.pc, args.model_file)
            deadline = 0
        else:
            if deadline < args.patience:
                dynn.io.populate(network.pc, args.model_file)
                trainer.learning_rate *= args.lr_decay
                deadline += 1
            else:
                log("Early stopping with best accuracy "
                    f"{best_accuracy*100:.2f}%")
                break
    # Load best model
    dynn.io.populate(network.pc, args.model_file)
    return best_accuracy
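Unlike most examples here, this training loop averages the batched loss with dy.mean_batches before the backward pass instead of summing with dy.sum_batches and normalizing by a token count later. A minimal stand-alone training step in the same spirit (a hypothetical toy classifier; parameter shapes and trainer settings are placeholders):

import numpy as np
import dynet as dy

pc = dy.ParameterCollection()
W = pc.add_parameters((4, 8))   # 4 classes, 8 input features
b = pc.add_parameters((4,))
trainer = dy.AdamTrainer(pc, alpha=0.001)

features = np.random.randn(8, 16)                   # a batch of 16 instances
labels = np.random.randint(0, 4, size=16).tolist()  # one gold class per instance

dy.renew_cg()
x = dy.inputTensor(features, batched=True)
logits = dy.affine_transform([dy.parameter(b), dy.parameter(W), x])
# Mean over the batch, as in the loop above; dy.sum_batches would give the
# total NLL instead (useful when normalizing by word counts elsewhere).
nll = dy.mean_batches(dy.pickneglogsoftmax_batch(logits, labels))
nll.backward()
trainer.update()
print(nll.value())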
Example #15
    def run(self,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            isTrain=True):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))

        if isTrain or arc_targets is not None:
            mask_1D = dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        word_embs = [
            dy.lookup_batch(self.word_embs,
                            np.where(w < self._vocab.words_in_train, w,
                                     self._vocab.UNK),
                            update=True)
            #+ dy.lookup_batch(self.pret_word_embs, w, update = False) # remove 1 line
            for w in word_inputs
        ]
        tag_embs = [
            dy.lookup_batch(self.tag_embs, pos, update=True)
            for pos in tag_inputs
        ]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep, head = leaky_relu(dy.affine_transform([
            b_dep, W_dep, top_recur
        ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if isTrain:
            dep, head = dy.dropout_dim(dep, 1,
                                       self.dropout_mlp), dy.dropout_dim(
                                           head, 1, self.dropout_mlp)

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        # seq_len x batch_size

        if isTrain or arc_targets is not None:
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
            # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        #dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        #head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
            # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
Example #16
    def run(self,
            word_inputs,
            lemma_inputs,
            tag_inputs,
            pred_golds,
            rel_targets=None,
            isTrain=True,
            syn_mask=None,
            seq_lens=None):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.PAD).astype(np.float32)
        num_tokens = int(np.sum(mask))

        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
            for w in word_inputs
        ]

        if self.use_lm:
            lm_embs = np.zeros((batch_size, seq_len, self.lm_dims),
                               dtype=float)
            for idx in range(batch_size):
                if self._unified:
                    txt = [
                        self._vocab.id2word(w) for w in word_inputs[1:, idx]
                        if self._vocab.id2word(w) != '<PAD>'
                    ]
                    key = ' '.join(txt)
                    key = self.lm_dict.get(key, None)
                    if key is None:
                        for sidx in range(len(self.lm_sentences)):
                            line = self.lm_sentences[sidx]
                            if len(line) != len(txt):
                                continue
                            found = True
                            for mdx in range(len(line)):
                                if line[mdx] != txt[mdx] and txt[
                                        mdx] != '<UNK>':
                                    found = False
                                    break
                            if found:
                                key = str(sidx)
                                self.lm_dict[' '.join(txt)] = key
                                break
                    assert key is not None
                    lm_embs[idx, 1:1 + len(txt), :] = self.lm_data[key][...]
                else:
                    txt = [
                        self._vocab.id2word(w) for w in word_inputs[:, idx]
                        if self._vocab.id2word(w) != '<PAD>'
                    ]
                    key = ' '.join(txt)
                    key = self.lm_dict.get(key, None)
                    if key is None:
                        for sidx in range(len(self.lm_sentences)):
                            line = self.lm_sentences[sidx]
                            if len(line) != len(txt):
                                continue
                            found = True
                            for mdx in range(len(line)):
                                if line[mdx] != txt[mdx] and txt[
                                        mdx] != '<UNK>':
                                    found = False
                                    break
                            if found:
                                key = str(sidx)
                                self.lm_dict[' '.join(txt)] = key
                                break
                    assert key is not None
                    lm_embs[idx, :len(txt), :] = self.lm_data[key][...]
            lm_embs = lm_embs.transpose(1, 2, 0)
            lm_embs = [dy.inputTensor(e, batched=True) for e in list(lm_embs)]

        pre_embs = [
            dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs
        ]
        flag_embs = [
            dy.lookup_batch(self.flag_embs, np.array(w == i + 1, dtype=np.int))
            for i, w in enumerate(pred_golds)
        ]
        if self.use_lemma:
            lemma_embs = [
                dy.lookup_batch(self.lemma_embs, lemma)
                for lemma in lemma_inputs
            ]
        if self.use_pos:
            tag_embs = [
                dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs
            ]

        if self.use_lm:
            if isTrain:
                emb_masks = self.generate_emb_mask(seq_len, batch_size)
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm),
                            dy.cmult(lme, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, lemma, pos, lme, (wm, posm) in
                        zip(word_embs, pre_embs, flag_embs, lemma_embs,
                            tag_embs, lm_embs, emb_masks)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm),
                            dy.cmult(lme, wm)
                        ]) for word, pre, flag, lemma, pos, lme, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             lemma_embs, lm_embs, emb_masks)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lme, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, pos, lme, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             tag_embs, lm_embs, emb_masks)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lme, wm)
                        ]) for word, pre, flag, lme, (wm, posm) in zip(
                            word_embs, pre_embs, flag_embs, lm_embs, emb_masks)
                    ]

            else:
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma, lme, pos])
                        for word, pre, flag, lemma, lme, pos in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs,
                            lm_embs, tag_embs)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lme, pos])
                        for word, pre, flag, lemma, lme, pos in zip(
                            word_embs, pre_embs, flag_embs, lm_embs, tag_embs)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma, lme])
                        for word, pre, flag, lemma, lme in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs,
                            lm_embs)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lme])
                        for word, pre, flag, lme in zip(
                            word_embs, pre_embs, flag_embs, lm_embs)
                    ]
        else:
            if isTrain:
                emb_masks = self.generate_emb_mask(seq_len, batch_size)
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, lemma, pos, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             lemma_embs, tag_embs, emb_masks)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(lemma, wm)
                        ]) for word, pre, flag, lemma, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             lemma_embs, emb_masks)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm),
                            dy.cmult(pos, posm)
                        ]) for word, pre, flag, pos, (
                            wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                             tag_embs, emb_masks)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([
                            dy.cmult(word, wm),
                            dy.cmult(pre, wm),
                            dy.cmult(flag, wm)
                        ]) for word, pre, flag, (wm, posm) in zip(
                            word_embs, pre_embs, flag_embs, emb_masks)
                    ]

            else:
                if self.use_lemma and self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma, pos])
                        for word, pre, flag, lemma, pos in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs,
                            tag_embs)
                    ]
                elif self.use_lemma:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, lemma])
                        for word, pre, flag, lemma in zip(
                            word_embs, pre_embs, flag_embs, lemma_embs)
                    ]
                elif self.use_pos:
                    emb_inputs = [
                        dy.concatenate([word, pre, flag, pos])
                        for word, pre, flag, pos in zip(
                            word_embs, pre_embs, flag_embs, tag_embs)
                    ]
                else:
                    emb_inputs = [
                        dy.concatenate([word, pre,
                                        flag]) for word, pre, flag in zip(
                                            word_embs, pre_embs, flag_embs)
                    ]

        if self.encoder_type == 'rnn':
            top_recur = dy.concatenate_cols(
                biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                       self.dropout_lstm_input if isTrain else 0.,
                       self.dropout_lstm_hidden if isTrain else 0.))
        else:

            emb_inputs = dy.concatenate_cols(emb_inputs)

            emb_inputs = emb_inputs * math.sqrt(self.input_dims)

            emb_inputs = emb_inputs + dy.transpose(
                dy.inputTensor(self.pe[:seq_len]))

            emb_inputs = dy.transpose(emb_inputs)

            encoder_outputs = self.transformer(emb_inputs,
                                               src_len=seq_lens,
                                               train=isTrain)

            top_recur = encoder_outputs.output

            top_recur = dy.concatenate_cols(top_recur)

            #print(top_recur.dim())

        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_arg, b_arg = self.mlp_arg_W.expr(), self.mlp_arg_b.expr(
        )  #dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b)
        W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(
            self.mlp_pred_b)
        arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur]))
        # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur]))
        predicates_1D = pred_golds[0]
        pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1)
        pred_hidden = leaky_relu(
            dy.affine_transform([b_pred, W_pred, pred_recur]))
        if isTrain:
            arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp)
            # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp)
            pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp)

        W_rel = dy.parameter(self.rel_W)

        # rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, batch_size,
        # 						num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
        # # (#pred x rel_size x #arg) x batch_size

        # flat_rel_logits = dy.reshape(rel_logits, (seq_len, self._vocab.rel_size), seq_len * batch_size)
        # # (#pred x rel_size) x (#arg x batch_size)

        # predicates_1D = dynet_flatten_numpy(pred_golds)
        # partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # # (rel_size) x (#arg x batch_size)

        if self.use_si_droput and syn_mask is not None:
            syn_mask = np.expand_dims(syn_mask,
                                      axis=0)  # (1, seq_len, batch_size)
            arg_hidden = dy.cmult(arg_hidden,
                                  dy.inputTensor(syn_mask, batched=True))

        rel_logits = bilinear(arg_hidden,
                              W_rel,
                              pred_hidden,
                              self.mlp_size,
                              seq_len,
                              1,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # if self.use_biaffine:
        # 	rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size, seq_len, 1, batch_size,
        # 							num_outputs = self._vocab.rel_size, bias_x = True, bias_y = True)
        # else:
        # 	pred_hidden = dy.reshape(pred_hidden, (self.mlp_size, 1), batch_size)
        # 	preds_hidden = [pred_hidden for _ in xrange(seq_len)]
        # 	preds_hidden = dy.concatenate(preds_hidden, d=1)
        # 	rel_hidden = dy.concatenate([preds_hidden, arg_hidden], d=0)  # (2*mlp_size x seq_len) x batch_size
        # 	flat_rel_hidden = dy.reshape(rel_hidden, (self.mlp_size*2, ), seq_len * batch_size)

        # 	W_ffn_layer1 = dy.parameter(self.ffn_layer1_W)
        # 	b_ffn_layer1 = dy.parameter(self.ffn_layer1_b)
        # 	W_ffn_layer2 = dy.parameter(self.ffn_layer2_W)
        # 	b_ffn_layer2 = dy.parameter(self.ffn_layer2_b)

        # 	flat_rel_hidden = leaky_relu(dy.affine_transform([b_ffn_layer1, W_ffn_layer1, flat_rel_hidden]))
        # 	flat_rel_hidden = leaky_relu(dy.affine_transform([b_ffn_layer2, W_ffn_layer2, flat_rel_hidden]))
        # 	flat_rel_hidden = W_rel * flat_rel_hidden
        # 	rel_logits = dy.reshape(flat_rel_hidden, (1, self._vocab.rel_size, seq_len), batch_size)

        # (1 x rel_size x #arg) x batch_size
        flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (1 x rel_size) x (#arg x batch_size)

        predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0])
        partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
        # (1 x rel_size) x (#arg x batch_size)

        if isTrain:
            mask_1D = dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
            return rel_accuracy, rel_loss

        # rel_probs = np.transpose(np.reshape(dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
        # 									(self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))

        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, 1, seq_len, batch_size), 'F'))
        outputs = []

        # for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T, rel_probs):
        # 	msk[0] = 1.
        # 	sent_len = int(np.sum(msk))
        # 	rel_prob = rel_prob[np.arange(len(pred_gold)), pred_gold]
        # 	rel_pred = rel_argmax(rel_prob)
        # 	outputs.append(rel_pred[:sent_len])

        for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T,
                                            rel_probs):
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            rel_prob = rel_prob[np.arange(len(pred_gold)), 0]
            rel_pred = rel_argmax(rel_prob)
            outputs.append(rel_pred[:sent_len])

        return outputs
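The flatten-and-mask pattern used above recurs in nearly every example on this page: reshape the score tensor so each token becomes its own batch element, take the batched negative log softmax against the gold indices, zero out padded positions with a mask, and normalize by the number of real tokens. Below is a minimal self-contained sketch of that pattern on synthetic scores and labels; only numpy and dynet are assumed, and the all-ones mask stands in for a real padding mask.

import dynet as dy
import numpy as np

seq_len, batch_size, n_classes = 5, 3, 7
num_tokens = seq_len * batch_size  # synthetic: every position counts as a real token

dy.renew_cg()
# synthetic scores: (n_classes x seq_len) per sentence, batched over sentences
scores = dy.inputTensor(np.random.rand(n_classes, seq_len, batch_size), batched=True)
# one gold class per token, flattened the same way as dynet_flatten_numpy above
gold_1D = [int(g) for g in np.random.randint(0, n_classes, size=num_tokens)]
# 1.0 for real tokens, 0.0 for padding (all real in this toy case)
mask_1D = np.ones(num_tokens, dtype=np.float32)

# ((n_classes,), seq_len * batch_size): each token becomes its own batch element
flat_scores = dy.reshape(scores, (n_classes,), batch_size=seq_len * batch_size)
losses = dy.pickneglogsoftmax_batch(flat_scores, gold_1D)
loss = dy.sum_batches(losses * dy.inputTensor(mask_1D, batched=True)) / num_tokens
print(loss.value())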
Esempio n. 17
0
    def __call__(self, inputs, masks, truth, is_train=True, is_tree=True):
        sent_len = len(inputs)
        batch_size = inputs[0].dim()[1]
        flat_len = sent_len * batch_size

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        X = dy.concatenate_cols(inputs)
        if is_train:
            X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
        # M_H -> MLP hidden size
        # ((M_H, L), B)
        # head_mat = leaky_relu(self.head_MLP(X, is_train))
        head_mat = self.head_MLP(X, is_train)
        # ((M_H, L), B)
        dept_mat = self.dept_MLP(X, is_train)
        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_mat = dy.dropout_dim(head_mat, 1, self.cfg.MLP_DROP)
            dept_mat = dy.dropout_dim(dept_mat, 1, self.cfg.MLP_DROP)

        # A_H -> Arc hidden size, R_H -> Label hidden size, A_H + R_H = M_H
        head_arc = head_mat[:self.arc_size]  # ((A_H, L), B)
        dept_arc = dept_mat[:self.arc_size]  # ((A_H, L), B)
        head_rel = head_mat[self.arc_size:]  # ((R_H, L), B)
        dept_rel = dept_mat[self.arc_size:]  # ((R_H, L), B)

        # ((L, L), B)
        masks_2D = dy.inputTensor(masks['2D'], True)
        # (1, L*B)
        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        for k in range(self.cfg.GRAPH_LAYERS):
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc,
                                           dept_arc) - 1e9 * (1 - masks_2D)
            arc_prob = dy.softmax(arc_mat)

            # Layer-wise Loss
            if is_train:
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
                # ((1,), L*B)
                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                # (1,)
                arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
                gnn_losses.append(arc_loss)

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX

            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc,
                                        dept_arc) - 1e9 * (1 - masks_2D)
        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)
        if is_train:
            # ((1,), L*B)
            arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
            # ((R_H,), L*B)
            truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
            # R -> Relation Set Size
            # ((R,), L*B)
            rel_mat = self.rel_attn(dept_rel, truth_rel)
        else:
            if is_tree:
                # MST Inference, Achieve Tree Edge.
                arc_probs = dy.softmax(arc_mat).npvalue()
                arc_probs = np.reshape(arc_probs,
                                       (sent_len, sent_len, batch_size), 'F')
                arc_probs = np.transpose(arc_probs)
                # Mask PAD
                arc_masks = [
                    np.array(masks['flat'][i:i + sent_len])
                    for i in range(0, flat_len, sent_len)
                ]
                arc_pred = []
                # Inference One By One.
                for msk, arc_prob in zip(arc_masks, arc_probs):
                    msk[0] = 1
                    seq_len = int(np.sum(msk))
                    tmp_pred = MST_inference(arc_prob, seq_len, msk)
                    tmp_pred[0] = 0
                    arc_pred.extend(tmp_pred)
            else:
                # Greedy Inference (argmax)
                arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # Pick Predicted Edge's <Head, Dept> pair.
            flat_pred = [
                j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
            ]
            pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
            # Predict Relation (mask ROOT)
            rel_mat = self.rel_attn(dept_rel, pred_rel)
            rel_mask = dy.inputTensor(self.rel_mask)
            rel_mat = rel_mat - 1e9 * rel_mask
        if is_train:
            # Calculate Relation Classification Loss
            # ((1,), L*B)
            rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
            # (1,)
            rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
            # Final Total Loss with Layer-wise
            losses = (rel_loss + arc_loss) * self.cfg.LAMBDA2
            if gnn_losses:
                losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
            losses_list = gnn_losses + [arc_loss, rel_loss]
            return losses, losses_list
        else:
            rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
            rel_pred = np.argmax(rel_mat, 0)
            pred = {}
            pred['head'], pred['rel'] = arc_pred, rel_pred
            return pred
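A detail worth isolating from the example above is how padded positions are removed from the arc softmax: a large constant (1e9) is subtracted wherever the 2D mask is zero, so padding ends up with near-zero probability. The sketch below reproduces just that step; the way the 2D mask is built from sentence lengths is an assumption, since the class above receives masks['2D'] precomputed.

import dynet as dy
import numpy as np

sent_len, batch_size = 4, 2
lengths = [4, 2]  # real tokens per sentence (ROOT included), the rest is padding

dy.renew_cg()
# raw arc scores: ((L, L), B), rows = candidate heads, columns = dependents
scores = dy.inputTensor(np.random.rand(sent_len, sent_len, batch_size), batched=True)

# 2D mask: 1 where both positions are real tokens, 0 where either is padding
mask_2D = np.zeros((sent_len, sent_len, batch_size), dtype=np.float32)
for b, n in enumerate(lengths):
    mask_2D[:n, :n, b] = 1.0
masks_2D = dy.inputTensor(mask_2D, batched=True)

# padded cells get a score near -1e9 and therefore ~0 probability after the softmax
arc_mat = scores - 1e9 * (1 - masks_2D)
arc_prob = dy.softmax(arc_mat)  # column-wise softmax over candidate heads
print(arc_prob.npvalue()[:, :, 1])  # padded heads of the short sentence get ~0 mass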
Esempio n. 18
0
    def __call__(self, inputs, masks, truth, iters, is_train=True, is_tree=True):
        if type(inputs) == list:
            sent_len = len(inputs)
            batch_size = inputs[0].dim()[1]
            X = dy.concatenate_cols(inputs)

        else:
            sent_len = inputs.dim()[0][0]
            batch_size = inputs.dim()[1]
            X = dy.transpose(inputs, [1, 0])

        flat_len = sent_len * batch_size

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        if is_train:
            X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
        # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
        # ((A_H, L), B)
        head_arc = self.head_arc_MLP(X, is_train)
        dept_arc = self.dept_arc_MLP(X, is_train)
        # ((R_H, L), B)
        head_rel = self.head_rel_MLP(X, is_train)
        dept_rel = self.dept_rel_MLP(X, is_train)

        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
            head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
            dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
            dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)

        # ((L, L), B)
        masks_2D = 1e9*(1-dy.inputTensor(masks['2D'], True))
        # (1, L*B)
        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        arc_norm = math.sqrt(self.arc_size)
        rel_norm = math.sqrt(self.rel_size)
        for k in range(self.cfg.GRAPH_LAYERS):
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc, dept_arc)/arc_norm-masks_2D
            arc_prob = dy.softmax(arc_mat)
            # Layer-wise Loss
            if is_train:
                arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)
                # ((1,), L*B)
                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                # (1,)
                arc_loss = dy.sum_batches(arc_loss*masks_flat)/total_token
                gnn_losses.append(arc_loss)

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX
            
            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

            # Relation Aggregation Function
            # Sync update 
            # ((R_H, L), B)
            HR = head_rel * arc_prob
            DR = dept_rel * dy.transpose(arc_prob)
            FX = HR+DR
            head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
            dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc)/arc_norm-masks_2D

        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size,), flat_len)
        if is_train:
            
            # print(arc_mat.dim()) # ((3,), 300)
            # arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # print(arc_pred.shape) # (300,)
            # print(arc_pred) # all 0's and 1's

            # ((1,), L*B)
            arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_losses*masks_flat)/total_token
            # ((R_H,), L*B)
            truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
            # R -> Relation Set Size
            # ((R,), L*B)
            rel_mask = 1e9*dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, truth_rel)/rel_norm - rel_mask

            # Calculate Relation Classification Loss
            # ((1,), L*B)
            rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
            # (1,)
            rel_loss = dy.sum_batches(rel_losses*masks_flat) / total_token
            # Final Total Loss with Layer-wise
            warm = [int(iters>=x) for x in self.warm_list]
            losses = rel_loss*self.cfg.LAMBDA2*warm[-1]+arc_loss*self.cfg.LAMBDA2*warm[-1]
            if gnn_losses:
                for i in range(self.cfg.GRAPH_LAYERS):
                    gnn_losses[i] *= warm[i]
                losses += dy.esum(gnn_losses)*self.cfg.LAMBDA1
            losses_list = gnn_losses + [arc_loss, rel_loss]
            return losses, losses_list
        else:
            if is_tree:
                # MST Inference, Achieve Tree Edge.
                arc_probs = dy.softmax(arc_mat).npvalue()
                arc_probs = np.reshape(arc_probs, (sent_len, sent_len, batch_size), 'F')
                arc_probs = np.transpose(arc_probs)
                # Mask PAD
                arc_masks = [np.array(masks['flat'][i:i+sent_len])
                             for i in range(0, flat_len, sent_len)]
                arc_pred = []
                # Inference One By One.
                for msk, arc_prob in zip(arc_masks, arc_probs):
                    msk[0] = 1
                    seq_len = int(np.sum(msk))
                    tmp_pred = MST_inference(arc_prob, seq_len, msk)
                    tmp_pred[0] = 0
                    arc_pred.extend(tmp_pred)
            else:
                # Greedy Inference (argmax)
                arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # Pick Predicted Edge's <Head, Dept> pair.
            flat_pred = [j+(i//sent_len)*sent_len for i, j in enumerate(arc_pred)]
            pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
            # Predict Relation (mask ROOT)
            rel_mask = 1e9*dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, pred_rel)/rel_norm-rel_mask
            rel_mat = dy.reshape(rel_mat, (self.rel_num,)).npvalue()
            rel_pred = np.argmax(rel_mat, 0)
            pred = {}
            pred['head'], pred['rel'] = arc_pred, rel_pred
            return pred
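Compared with the previous example, the main addition above is the warm-up gating of the layer-wise losses: each GNN layer's arc loss only starts contributing once the iteration counter passes its threshold in warm_list, and the final arc and relation losses are gated by the last threshold. A toy sketch of that gating with made-up thresholds and scalar stand-ins for the losses:

import dynet as dy

dy.renew_cg()
iters = 1500
warm_list = [500, 1000, 2000]  # one threshold per GNN layer; the last one also gates the final losses
LAMBDA1, LAMBDA2 = 0.5, 1.0

gnn_losses = [dy.scalarInput(0.9), dy.scalarInput(0.7)]  # stand-ins for per-layer arc losses
arc_loss, rel_loss = dy.scalarInput(0.6), dy.scalarInput(0.8)

warm = [int(iters >= x) for x in warm_list]
losses = rel_loss * LAMBDA2 * warm[-1] + arc_loss * LAMBDA2 * warm[-1]
for i in range(len(gnn_losses)):
    gnn_losses[i] = gnn_losses[i] * warm[i]
losses = losses + dy.esum(gnn_losses) * LAMBDA1
# at iters=1500 only the first two thresholds are passed, so the final arc/rel
# losses are still switched off and only the layer-wise losses contribute
print(losses.value())  # 0.5 * (0.9 + 0.7) = 0.8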
Esempio n. 19
0
 def loss(self, input_, y):
     if self.batched:
         return dy.pickneglogsoftmax_batch(input_, y)
     return dy.pickneglogsoftmax(input_, y)
Esempio n. 20
0
import dynet as dy
import numpy as np

# assumed setup, not shown in the original snippet: a parameter collection and a
# lookup table of 10-dimensional vectors, so that the (5, 10) matrix W below fits
m = dy.ParameterCollection()
lp = m.add_lookup_parameters((20, 10))
dy.renew_cg()

# regular lookup
a = lp[1].npvalue()
b = lp[2].npvalue()
c = lp[3].npvalue()

# batch lookup instead of single elements.
# two ways of doing this.
abc1 = dy.lookup_batch(lp, [1,2,3])
print(abc1.npvalue())

abc2 = lp.batch([1,2,3])
print(abc2.npvalue())

print(np.hstack([a,b,c]))


# use pick and pickneglogsoftmax in batch mode
# (must be used in conjunction with lookup_batch):
print("\nPick")
W = dy.parameter( m.add_parameters((5, 10)) )
h = W * lp.batch([1,2,3])
print(h.npvalue())
print(dy.pick_batch(h,[1,2,3]).npvalue())
print(dy.pick(W*lp[1],1).value(), dy.pick(W*lp[2],2).value(), dy.pick(W*lp[3],3).value())

# using pickneglogsoftmax_batch
print("\nPick neg log softmax")
print((-dy.log(dy.softmax(h))).npvalue())
print(dy.pickneglogsoftmax_batch(h,[1,2,3]).npvalue())
Esempio n. 21
0
    def run_parser(self,
                   word_inputs,
                   common_top_recur,
                   private_top_recur,
                   arc_targets=None,
                   rel_targets=None,
                   isTrain=True):
        # inputs, targets: seq_len x batch_size

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))
        top_recur = dy.concatenate([common_top_recur, private_top_recur])

        if isTrain or arc_targets is not None:
            mask_1D = self.dynet_flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
        head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if isTrain:
            dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
            head = dy.dropout_dim(head, 1, self.dropout_mlp)

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        # seq_len x batch_size

        if isTrain or arc_targets is not None:
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = self.dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = self.dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
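The least obvious step in the biaffine parser above is how relation scores are restricted to each dependent's head: the (#head x rel_size x #dep) x batch_size tensor from the bilinear layer is reshaped so that every dependent becomes a batch element of shape (#head, rel_size), and dy.pick_batch then selects one head row per dependent (gold heads at training time, predicted heads at test time). A standalone sketch of just that reshape-and-pick step, with random scores standing in for the bilinear output:

import dynet as dy
import numpy as np

seq_len, rel_size, batch_size = 4, 6, 2

dy.renew_cg()
# stand-in for rel_logits: ((#head, rel_size, #dep), batch_size)
rel_logits = dy.inputTensor(
    np.random.rand(seq_len, rel_size, seq_len, batch_size), batched=True)

# ((#head, rel_size), #dep * batch_size): every dependent becomes its own batch element
flat_rel_logits = dy.reshape(rel_logits, (seq_len, rel_size),
                             batch_size=seq_len * batch_size)

# one head index per dependent, flattened over (dependent, sentence)
heads_1D = [int(h) for h in np.random.randint(0, seq_len, size=seq_len * batch_size)]

# ((rel_size,), #dep * batch_size): relation scores of each dependent w.r.t. its chosen head
partial_rel_logits = dy.pick_batch(flat_rel_logits, heads_1D)
print(partial_rel_logits.dim())  # ((6,), 8)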
Esempio n. 22
0
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the current state into the 
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
Esempio n. 23
0
    def run(self,
            char_vocab,
            cased_word_inputs,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            is_train=True):
        """
        Train or test
        :param char_vocab:
        :param cased_word_inputs: seq_len x batch_size
        :param word_inputs: seq_len x batch_size
        :param tag_inputs: seq_len x batch_size
        :param arc_targets: seq_len x batch_size
        :param rel_targets: seq_len x batch_size
        :param is_train: is training or test
        :return:
        """
        def flatten_numpy(ndarray):
            """
            Flatten nd-array to 1-d column vector
            :param ndarray:
            :return:
            """
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))  # non padding, non root token number

        if is_train or arc_targets is not None:
            mask_1D = flatten_numpy(mask)
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
            #  if batched=True, the last dimension is used as a batch dimension if arr is a list of numpy ndarrays

        if self.char_lstm:
            # Subword model
            char_w = dy.parameter(self.char_w)

            def LSTM_attention(lstm, inputs, dropout_x=0., dropout_h=0.):
                ss = LSTM(lstm, inputs, None, dropout_x, dropout_h)
                hs = [s.h()[0] for s in ss]
                return dy.concatenate([attention(hs, char_w), ss[-1].s()[0]])

            subword_embs = []
            for char_ids in char_vocab:
                char_inputs = [
                    dy.lookup(self.char_embs, char) for char in char_ids
                ]
                subword_embs.append(
                    LSTM_attention(
                        self.char_lstm, char_inputs,
                        self.dropout_lstm_input if is_train else 0.,
                        self.dropout_lstm_hidden if is_train else 0.))
            subword_embs = dy.concatenate_cols(subword_embs)

            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) + subword_embs *
                dy.inputTensor(one_hot(cw, len(char_vocab)).T, batched=True) +
                0 if self.pret_word_embs is None else dy.lookup_batch(
                    self.pret_word_embs, w, update=False)
                for cw, w in zip(cased_word_inputs, word_inputs)
            ]
        else:
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) +
                0 if self.pret_word_embs is None else dy.lookup_batch(
                    self.pret_word_embs, w, update=False) for w in word_inputs
            ]

        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        # Dropout
        if is_train:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]  # seq_len x batch_size

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if is_train else 0.,
                   self.dropout_lstm_hidden if is_train else 0.))
        if is_train:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep, head = leaky_relu(dy.affine_transform([
            b_dep, W_dep, top_recur
        ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if is_train:
            dep, head = dy.dropout_dim(dep, 1,
                                       self.dropout_mlp), dy.dropout_dim(
                                           head, 1, self.dropout_mlp)

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        if len(arc_preds.shape) == 1:  # with batch_size == 1 the batch dimension gets dropped, so restore it
            arc_preds = np.expand_dims(arc_preds, axis=1)
        # seq_len x batch_size

        if is_train or arc_targets is not None:
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if is_train else flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if is_train or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        if is_train or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if is_train:
            return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., outputs
        return outputs
Esempio n. 24
0
 def create_network_return_loss(self, inputs, expected_output, dropout=False):
     out = self(inputs, dropout)
     loss = dy.pickneglogsoftmax_batch(out, expected_output)
     # loss = -dy.log(dy.pick(out, expected_output))
     return loss
Esempio n. 25
0
    def batch_predict_next_best_action(self,config_batched,prev_action_batched,sentence_batch):
        """
        Greedily predicts the next transition for a batch of configurations,
        given the actions leading to those configurations and the related sentences.
        @param config_batched: a list of configurations
        @param prev_action_batched: a list of actions (or None if no prev actions)
        @param sentence_batch: a list of sentences
        @return a list of new configurations, a list of actions generating these new configs
        """
    
        B = len(config_batched)
        idxes = list(range(B))
        new_configs = [None] * B
        new_actions = [None] * B

        if prev_action_batched is None:
            prev_action_batched = [None]*B
                
        #(1) sort out the lexical and structural batches
        def is_lexical(config):
            S,F,B,A,prefix_score = config
            return F is None and len(B) > 0

        lexical_idxes    = [idx for idx in idxes if     is_lexical(config_batched[idx])]
        structural_idxes = [idx for idx in idxes if not is_lexical(config_batched[idx])]

        #(2) lexical predictions
        if len(lexical_idxes) > 0:

            def make_ref_lex_action(config,sentence):
                S,F,B,A,prefix_score = config
                return (ArcEagerGenerativeParser.GENERATE,sentence[B[0]])

            X = []
            Y = []
            for idx in lexical_idxes:
                x,y = self.make_representation(config_batched[idx],make_ref_lex_action(config_batched[idx],sentence_batch[idx]),sentence_batch[idx],structural=False)
                X.append(x)
                Y.append(y)

            Xt = zip(*X)    #transpose
        
            if self.tied:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
                xdense     = dy.concatenate(embeddings)
                preds      = dy.pickneglogsoftmax_batch(E * dy.tanh( W * xdense ),Y).npvalue()[0]
            else:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                O = dy.parameter(self.output_embeddings)
                embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
                xdense     = dy.concatenate(embeddings)
                preds      = dy.pickneglogsoftmax_batch(O * dy.tanh( W * xdense ),Y).npvalue()[0]

            preds = np.atleast_1d(preds)
                
            for pred_score,idx in zip(preds,lexical_idxes): 
                new_configs[idx] = self.generate(config_batched[idx],local_score= -pred_score)# execs the actions  
                new_actions[idx] = (ArcEagerGenerativeParser.GENERATE,sentence_batch[idx][config_batched[idx][2][0]])

        #(3) structural predictions
        if len(structural_idxes) > 0 :
            action_masks = np.array([self.mask_actions(config_batched[idx],prev_action_batched[idx],len(sentence_batch[idx])) for idx in structural_idxes])
            X = [self.make_representation(config_batched[idx],None,sentence_batch[idx],structural=True) for idx in structural_idxes]
            Xt = zip(*X)    #transpose
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            A = dy.parameter(self.action_weights)
            embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
            xdense     = dy.concatenate(embeddings)
            preds      = dy.softmax(A * dy.tanh( W * xdense )).npvalue().transpose()

            max_idxes      = np.argmax(preds * action_masks,axis=1) 
            max_scores     = np.log(preds[np.arange(preds.shape[0]),max_idxes])
            for argmax_idx,max_score,idx in zip(max_idxes,max_scores,structural_idxes): 
                new_configs[idx] = self.actions[argmax_idx](config_batched[idx],local_score=max_score)  #execs the actions  
                new_actions[idx] = self.rev_action_codes[argmax_idx]
        return (new_configs, new_actions)
Esempio n. 26
0
    def compute_decoder_batch_loss(self, encoded_inputs, input_masks, output_word_ids, output_masks, batch_size):
        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # initial "input feeding" vectors to feed decoder - 3*h
        init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)

        # initial feedback embeddings for the decoder, use begin seq symbol embedding
        init_feedback = dn.lookup_batch(self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

        # init decoder rnn
        decoder_init = dn.concatenate([init_feedback, init_input_feeding])
        s = s_0.add_input(decoder_init)

        # loss per timestep
        losses = []

        # run the decoder through the output sequences and aggregate loss
        for i, step_word_ids in enumerate(output_word_ids):

            # returns h x batch size matrix
            decoder_rnn_output = s.output()

            # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
            attention_output_vector, alphas = self.attend(encoded_inputs, decoder_rnn_output, input_masks)

            # compute output scores (returns vocab_size x batch size matrix)
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform([self.bias, self.readout, attention_output_vector])

            # encourage diversity by punishing highly confident predictions
            # TODO: support batching - esp. w.r.t. scalar inputs
            if self.diverse:
                soft = dn.softmax(dn.tanh(h))
                batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                    - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) - dn.log(dn.scalarInput(4))
            else:
                # get batch loss for this timestep
                batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

            # mask the loss if at least one sentence is shorter
            if output_masks and output_masks[i][-1] != 1:
                mask_expr = dn.inputVector(output_masks[i])
                # noinspection PyArgumentList
                mask_expr = dn.reshape(mask_expr, (1,), batch_size)
                batch_loss = batch_loss * mask_expr

            # input feeding approach - input h (attention_output_vector) to the decoder
            # prepare for the next iteration - "feedback"
            feedback_embeddings = dn.lookup_batch(self.output_lookup, step_word_ids)
            decoder_input = dn.concatenate([feedback_embeddings, attention_output_vector])
            s = s.add_input(decoder_input)

            losses.append(batch_loss)

        # sum the loss over the time steps and batch
        total_batch_loss = dn.sum_batches(dn.esum(losses))

        return total_batch_loss
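The decoder above applies a mask to a timestep's batch loss only when at least one target sequence has already ended (output_masks[i][-1] != 1, which assumes the batch is sorted by target length). The sketch below shows one way such per-timestep masks can be built from target lengths and applied to a batched loss; the step_masks helper is illustrative and not part of the class above.

import dynet as dy
import numpy as np

def step_masks(target_lengths, max_len):
    """1 while a sequence still produces a symbol at step i, else 0."""
    return [[1 if length > i else 0 for length in target_lengths]
            for i in range(max_len)]

dy.renew_cg()
target_lengths = [5, 3, 2]  # assumed sorted, longest first
batch_size = len(target_lengths)
vocab_size, max_len = 11, max(target_lengths)
masks = step_masks(target_lengths, max_len)

total = []
for i in range(max_len):
    scores = dy.inputTensor(np.random.rand(vocab_size, batch_size), batched=True)
    word_ids = [int(w) for w in np.random.randint(0, vocab_size, size=batch_size)]
    batch_loss = dy.pickneglogsoftmax_batch(scores, word_ids)
    if masks[i][-1] != 1:  # at least one sequence has already finished
        mask_expr = dy.reshape(dy.inputVector(masks[i]), (1,), batch_size)
        batch_loss = batch_loss * mask_expr
    total.append(batch_loss)
print(dy.sum_batches(dy.esum(total)).value())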
Esempio n. 27
0
    def run(self,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            isTrain=True):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))

        if isTrain or arc_targets is not None:
            mask_1D = dynet_flatten_numpy(mask)
            # batched here means that the last dim is treated as batch dimension, both in input and output
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        # TODO: note _words_in_train
        # sum of the two embeddings, [Expression of dim=((embedding_dim,), batch_size)] * seq_len
        if self.pre_train_emb:
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) +
                dy.lookup_batch(self.pret_word_embs, w, update=False)
                for w in word_inputs
            ]  # sum of the two embeddings, [Expression] * seq_len
        else:
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) for w in word_inputs
            ]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep, head = leaky_relu(dy.affine_transform([
            b_dep, W_dep, top_recur
        ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if isTrain:
            dep, head = dy.dropout_dim(dep, 1,
                                       self.dropout_mlp), dy.dropout_dim(
                                           head, 1, self.dropout_mlp)
            # dim=1 means entire rows can be zeroed out: dim=0 drops columns, dim=1 drops rows, and the third dimension is the batch

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                     seq_len * batch_size)  # flattened this way to compute the loss
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        # seq_len x batch_size

        if isTrain or arc_targets is not None:
            # the loss is computed against the highest-scoring arcs; they are not
            # necessarily kept as the decoding result, but they must be penalized
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask  # the mask still applies here
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
            # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        #dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        #head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D  # the shapes here require the 1-D mask
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
            # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            # agreed: this part of the parser's decoding simply cannot be batched
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len],
                            rel_pred[1:sent_len]))  # index 0 really is the ROOT

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
Esempio n. 28
0
def decode_sentences(dec_lstm, vectors, outputs):
# Takes in [l*(2hE+e)*n] as input and returns [l*2hD*n] as output
    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    w1_array = []
    # Concatenate the columns of the BiLSTM encodings
    bidirectional_vectors = []
    for (i,v) in enumerate(vectors):
         bidirectional_vectors.append(dy.concatenate_cols(v))
         w1_array.append(w1)  
    # Repeat w1 and make it a tensor    
    w1_repeated = [[w1]] * len(vectors)

    if debug_dimensions:
        print "In Decoder"
        print "  The dimensions of w1: ", get_tensor_size(w1.value())
        print "  The dimensions of w1 array: " , get_tensor_size(w1_array)
        print "  The dimensions of v are: ", get_tensor_size(v)
        print "  The dimensions of the bidirectional encodings is ", get_tensor_size(bidirectional_vectors)
        print "  The dimensions of w1_repeated: ", get_tensor_size(w1_repeated)
        print "  The dimensions of the first dimension of the w1 is : ", get_matrix_size(w1_repeated[2])

    dots = char2int[EOS]
    dots_batch = [[dots]]*len(bidirectional_vectors)
    last_output_embeddings = dy.lookup(output_lookup, dots)
    last_output_embeddings_batch = [[last_output_embeddings]]*len(outputs[0])       
    concatenated_stuff = [[dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings])]] * len(outputs[0])
    dec_state_array = dec_lstm.initial_state().add_inputs((k[0] for k in concatenated_stuff)) 

    if debug_dimensions:
       print "  The dimensions of last output embeddings batch: ", get_tensor_size(last_output_embeddings_batch) 
       print "  The dimensions of the concatenated stuff: ", get_tensor_size(concatenated_stuff)
    loss = 0

    decodings_transposed = []
    masks = []
    for i in range(len(outputs[0])):
       decodings_transposed.append([(sent[i] if len(sent)> i else output_lookup[0]) for sent in outputs])
       mask = [(1 if len(sent)>i else 0) for sent in outputs]
       masks.append(mask)
    if debug:
        print "Transposed Decodings: ", decodings_transposed

    # Get w1dt
    w1dt_array = []
    for (w,b) in zip(w1_array, bidirectional_vectors):
         w1dt_array.append(w*b)

    batch_loss = []
    for (y_batch, mask) in zip(decodings_transposed, masks):
      attention_output = attend(bidirectional_vectors, dec_state_array, w1dt_array)
      if debug_dimensions:
           print "Back in Decoder"
           print "  The dimensions of the concatenated stuff: ", get_tensor_size(concatenated_stuff)
           print "  The dimensions of the attention output: ", get_tensor_size(attention_output)
           #print "  The dimensions of Dec state : ", get_tensor_size(dec_state)
           print dec_state_array
      vector_array = []
      for (a,b) in zip(attention_output, concatenated_stuff):
            vector = dy.concatenate([a,b[0]])
            vector_array.append(vector)
      for (dec_state, vector) in zip(dec_state_array, vector_array):
           dec_state.add_input(vector)
      out_vectors_array = []
      for dec_state in dec_state_array:
           out_vectors = w * dec_state.output()
           out_vectors_array.append(out_vectors)
      print out_vectors_array
      loss = dy.pickneglogsoftmax_batch(out_vectors, y_batch)
      batch_loss.append(loss)
    return dy.esum(batch_loss)
Esempio n. 29
0
    def train_nn_lm(self,\
                    train_sentences,\
                    validation_sentences,\
                    lr=0.001,\
                    hidden_dropout=0.1,\
                    batch_size=100,\
                    max_epochs=100,\
                    glove_file=None):
        """
        Locally trains a model with a static oracle and a standard feedforward NN.  
        @param train_sentences        : a list of sentences
        @param validation_sentences   : a list of sentences
        @return learning curves for various metrics as a pandas dataframe
        """
        #(1) build dictionaries
        self.code_symbols(train_sentences)
        print("Dictionaries built.")

        #(2) read off treebank and builds data set
        print("Encoding dataset from %d sentences." % len(train_sentences))
        training_generator = self.make_data_generator(train_sentences,
                                                      batch_size)
        validation_generator = self.make_data_generator(
            validation_sentences, batch_size)

        print(self, flush=True)
        print(
            "max_epochs = %d\ntraining examples [N] = %d\nBatch size = %d\nDropout = %f\nlearning rate = %f"
            %
            (max_epochs, training_generator.N, batch_size, hidden_dropout, lr),
            flush=True)

        #(3) Model structure
        self.model = dy.ParameterCollection()
        self.hidden_weights = self.model.add_parameters(
            (self.hidden_size, self.embedding_size * self.input_length))
        if glove_file is None:
            self.embedding_matrix = self.model.add_parameters(
                (self.lexicon_size, self.embedding_size))
        else:
            self.embedding_matrix = self.model.parameters_from_numpy(
                self.read_glove_embeddings(glove_file))
        if not self.tied:
            self.output_weights = self.model.add_parameters(
                (self.lexicon_size, self.hidden_size))

        #fitting
        xgen = training_generator.next_batch()
        trainer = dy.AdamTrainer(self.model, alpha=lr)
        min_nll = float('inf')
        history_log = []
        for e in range(max_epochs):
            L = 0
            N = 0
            start_t = time.time()
            for b in range(training_generator.get_num_batches()):
                X, Y = next(xgen)
                if self.tied:
                    dy.renew_cg()
                    W = dy.parameter(self.hidden_weights)
                    E = dy.parameter(self.embedding_matrix)
                    batched_X = zip(*X)  #transposes the X matrix
                    lookups = [
                        dy.pick_batch(E, xcolumn) for xcolumn in batched_X
                    ]
                    xdense = dy.concatenate(lookups)
                    ybatch_preds = dy.pickneglogsoftmax_batch(
                        E * dy.dropout(dy.tanh(W * xdense), hidden_dropout), Y)
                    loss = dy.sum_batches(ybatch_preds)
                else:
                    dy.renew_cg()
                    O = dy.parameter(self.output_weights)
                    W = dy.parameter(self.hidden_weights)
                    E = dy.parameter(self.embedding_matrix)
                    batched_X = zip(*X)  #transposes the X matrix
                    lookups = [
                        dy.pick_batch(E, xcolumn) for xcolumn in batched_X
                    ]
                    xdense = dy.concatenate(lookups)
                    ybatch_preds = dy.pickneglogsoftmax_batch(
                        O * dy.dropout(dy.tanh(W * xdense), hidden_dropout), Y)
                    loss = dy.sum_batches(ybatch_preds)

                N += len(Y)
                L += loss.value()
                loss.backward()
                trainer.update()

            end_t = time.time()

            #validation and auto-saving
            Xvalid, Yvalid = validation_generator.batch_all()
            valid_nll = -sum(self.predict_logprobs(Xvalid, Yvalid))
            valid_ppl = exp(valid_nll / len(Yvalid))
            history_log.append(
                (e, end_t - start_t, L, exp(L / N), valid_nll, valid_ppl))
            print(
                'Epoch %d (%.2f sec.) NLL (train) = %f, PPL (train) = %f, NLL(valid) = %f, PPL(valid) = %f'
                % tuple(history_log[-1]),
                flush=True)

            if valid_nll <= min_nll:
                min_nll = valid_nll
                lc = pd.DataFrame(history_log,
                                  columns=[
                                      'epoch', 'wall_time', 'NLL(train)',
                                      'PPL(train)', 'NLL(dev)', 'PPL(dev)'
                                  ])
                self.save_model('best_model_dump', epoch=e, learning_curve=lc)

        return pd.DataFrame(history_log,
                            columns=[
                                'epoch', 'wall_time', 'NLL(train)',
                                'PPL(train)', 'NLL(dev)', 'PPL(dev)'
                            ])
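A self-contained sketch (toy sizes, and a lookup table instead of pick_batch on a dense embedding matrix) of one minibatch step of the feed-forward language model trained above: the context words of every example in the batch are embedded, concatenated, passed through a tanh layer, and scored against the gold next words with pickneglogsoftmax_batch.

import dynet as dy

model = dy.ParameterCollection()
vocab, embed, hidden, n_ctx = 50, 8, 16, 3
E = model.add_lookup_parameters((vocab, embed))
W = model.add_parameters((hidden, embed * n_ctx))
O = model.add_parameters((vocab, hidden))
trainer = dy.AdamTrainer(model, alpha=0.001)

dy.renew_cg()
X = [[1, 2, 3], [4, 5, 6]]                 # two examples, n_ctx context words each
Y = [7, 8]                                 # gold next word per example
lookups = [dy.lookup_batch(E, list(col)) for col in zip(*X)]   # one ((embed,), 2) per position
xdense = dy.concatenate(lookups)                               # ((embed * n_ctx,), 2)
scores = dy.parameter(O) * dy.tanh(dy.parameter(W) * xdense)
loss = dy.sum_batches(dy.pickneglogsoftmax_batch(scores, Y))
print(loss.value())
loss.backward()
trainer.update()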
Esempio n. 30
0
def _loss(outputs, labels):
    losses = [dy.pickneglogsoftmax_batch(out, label) for out, label in zip(outputs, labels)]
    loss = dy.mean_batches(dy.average(losses))
    return loss
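A minimal sketch (toy vocabulary and shapes, all assumed) of how a helper like _loss can be driven: every element of outputs is a batched score vector for one time step, and the matching element of labels holds the gold class index for each batch element.

import dynet as dy

model = dy.ParameterCollection()
E = model.add_lookup_parameters((20, 8))   # toy vocabulary of 20, embedding size 8
W = model.add_parameters((4, 8))           # 4 output classes

dy.renew_cg()
step_ids = [[1, 2, 3], [4, 5, 6]]          # two time steps, batch of three
outputs = [dy.parameter(W) * dy.lookup_batch(E, ids) for ids in step_ids]
labels = [[0, 1, 2], [3, 0, 1]]            # gold class per batch element, per step
losses = [dy.pickneglogsoftmax_batch(out, label) for out, label in zip(outputs, labels)]
loss = dy.mean_batches(dy.average(losses))  # average over steps, then over the batch
print(loss.value())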
Esempio n. 31
0
    def BuildLMGraph_batch(self, sents, sent_args=None):
        dynet.renew_cg()
        init_state = self.rnn.initial_state()
        mb_size = len(sents)
        #MASK SENTENCES
        wids = []  # Dimension: maxSentLength * minibatch_size

        # List of lists to store whether an input is
        # present(1)/absent(0) for an example at a time step
        masks = []  # Dimension: maxSentLength * minibatch_size

        #No of words processed in this batch
        tot_words = 0
        maxSentLength = max([len(sent) for sent in sents])

        for k in range(maxSentLength):
            wids.append([(self.vocab.s2t[sent[k]]
                          if len(sent) > k else self.vocab.END_TOK)
                         for sent in sents])
            mask = [(1 if len(sent) > k else 0) for sent in sents]
            masks.append(mask)
            tot_words += sum(mask)

        R = dynet.parameter(self.R)
        bias = dynet.parameter(self.bias)
        losses = []  # will hold losses
        state = init_state

        for (mask, curr_words, next_words) in zip(masks, wids, wids[1:]):
            # print curr_words
            # print next_words
            maxWordLen = max([len(word.s) for word in curr_words])
            wordLengths = [len(word.s) for word in curr_words]

            # rebuilt at every time step: one row of character ids per position,
            # padded with END_TOK for words shorter than maxWordLen
            spellings = []
            for k in range(maxWordLen):
                spellings.append([
                    (self.s2s.src_vocab[word.s[k].upper()].i
                     if len(word.s) > k else self.s2s.src_vocab.END_TOK.i)
                    for word in curr_words
                ])

            spellings_rev = list(reversed(spellings))
            embedded_spellings = self.s2s.embed_batch_seq(spellings)
            embedded_spellings_rev = self.s2s.embed_batch_seq(spellings_rev)

            pron_vectors = self.s2s.encode_batch_seq(embedded_spellings,
                                                     embedded_spellings_rev,
                                                     wordLengths)[-1]

            fpv = dynet.nobackprop(pron_vectors)

            curr_words_idx = [word.i for word in curr_words]
            curr_words_lookup = dynet.lookup_batch(self.lookup, curr_words_idx)

            temp = dynet.concatenate([curr_words_lookup, fpv])
            x_t = temp
            state = state.add_input(x_t)
            y_t = state.output()
            r_t = bias + (R * y_t)
            next_words_idx = [word.i for word in next_words]
            loss = dynet.pickneglogsoftmax_batch(r_t, next_words_idx)
            # loss is a list of losses
            # mask the loss if at least one sentence is shorter
            if 0 in mask:
                mask_expr = dynet.inputVector(mask)
                mask_expr = dynet.reshape(mask_expr, (1, ), mb_size)
                loss = loss * mask_expr
            losses.append(loss)

        netloss = dynet.sum_batches(dynet.esum(losses))
        return netloss
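A minimal sketch (toy vocabulary and values) of the masking trick above: a ((1,), batch) loss from pickneglogsoftmax_batch is multiplied by a mask reshaped to ((1,), batch), so positions past the end of a shorter sentence contribute zero before the batch is summed.

import dynet as dy

model = dy.ParameterCollection()
R = model.add_parameters((6, 4))           # toy vocabulary of 6, hidden size 4
bias = model.add_parameters((6,))
E = model.add_lookup_parameters((6, 4))

dy.renew_cg()
mb_size = 3
h = dy.lookup_batch(E, [1, 2, 3])          # stand-in for the RNN output, ((4,), 3)
r_t = dy.parameter(bias) + dy.parameter(R) * h
next_words = [2, 4, 0]
loss = dy.pickneglogsoftmax_batch(r_t, next_words)            # ((1,), 3)
mask = [1, 1, 0]                           # the third sentence has already ended
mask_expr = dy.reshape(dy.inputVector(mask), (1,), mb_size)
print(dy.sum_batches(loss * mask_expr).value())               # padded position adds nothing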
Esempio n. 32
0
def train_model(
    net,
    model_name,
    max_sentence_length,
    parsed = False,
    restart = None,
):
    print("Maximum sentence length is set to "+str(max_sentence_length))
    # load pre-parsed data
    if parsed:
        with open("data/snli2/training_parsed.pkl", "rb") as fin:
            training_data = pickle.load(fin)
            training_total = len(training_data)
            training_data = [(l, s1, s2) for l, s1, s2, ls1, ls2 in training_data
                             if 1 < ls1 <= max_sentence_length and 1 < ls2 <= max_sentence_length]
        with open("data/snli2/dev_parsed.pkl", "rb") as fin:
            dev_data = pickle.load(fin)
        num_batches = len(training_data)
        print("Training data contains "+str(num_batches) + " batches (originally "+str(training_total)+") of size 1")
    # or load raw data
    else:
        with open("data/snli2/training.pkl", "rb") as fin:
            training_data = pickle.load(fin)
            training_total = len(training_data)
            training_data = [(l, s1, s2) for l, s1, s2 in training_data
                             if 1 < len(s1) <= max_sentence_length and 1 < len(s2) <= max_sentence_length]
        with open("data/snli2/dev.pkl", "rb") as fin:
            dev_data = pickle.load(fin)
            dev_data = [(l, s1, s2) for l, s1, s2 in dev_data
                        if 1 < len(s1) <= max_sentence_length and 1 < len(s2) <= max_sentence_length]
        num_batches = len(training_data)
        batch_size = len(training_data[0][0])
        print("Training data contains "+str(num_batches) + " batches (originally "+str(training_total)+") of size "+str(batch_size))

    classifier = networks.SNLIClassifier(model, net.hidden_dim)
    trainer = dy.SimpleSGDTrainer(model, e0=0.01)

    # hyperparameters
    report_frequency = 500
    validate_frequency = num_batches // 10
    if parsed:
        report_frequency = 500 * 16

    start_time = time()
    last_validated = None
    last_reported = None
    best_validation = 0
    validations = []
    validation_means = []
    avg_window_size = 5
    patience = 12
    frustration = 0
    early_stop = False
    epoch = 0
    batches_seen = 0
    if isinstance(restart, int):
        model.load(model_name)
        epoch = restart
        batches_seen = epoch * num_batches
        print("Restarting interrupted training from epoch "+str(epoch))
    while True:
        print("Start of epoch #"+str(epoch))
        for batch_num, data in enumerate(training_data):
            dy.renew_cg()
            ls, s1, s2 = data
            if parsed:
                output1 = net.do_parse_tree(s1)
                output2 = net.do_parse_tree(s2)
            else:
                output1, _ = net(s1)
                output2, _ = net(s2)

            predicted_labels = classifier(output1, output2)
            if parsed:
                loss = dy.pickneglogsoftmax(predicted_labels, ls)
            else:
                loss = dy.sum_batches(dy.pickneglogsoftmax_batch(predicted_labels, ls))

            # optimise
            loss.forward()
            loss.backward()
            trainer.update()

            # Evaluate on development data
            if batches_seen % validate_frequency == 0 and last_validated != batches_seen:
                last_validated = batches_seen
                acc = eval_nli_dataset(net, classifier, dev_data, parsed)
                validations.append(acc)
                validation_means.append(np.mean(validations[-avg_window_size:]))
                print("Validation: accuracy "+str(acc)+", moving average "+str(validation_means[-1]))
                if acc >= best_validation:
                    best_validation = acc
                    model.save(model_name)
                    print("(model saved)")
                    frustration = 0

                # Write to log file
                with open(model_name+".log", "a") as flog:
                    prog = batches_seen
                    if parsed:
                        prog = batches_seen / 16
                    flog.write(str(prog)+"\t"+str(acc)+"\n")

                # Decide if it's time to stop
                if len(validation_means) > patience and validation_means[-1] <= np.array(validation_means[:-patience]).max():
                    frustration += 1
                    if frustration > patience:
                        print("Early stop!")
                        early_stop = True
                        break
                else:
                    frustration = 0

            # Report progress
            if batches_seen % report_frequency == 0 and last_reported != batches_seen:
                last_reported = batches_seen
                fraction_done = batch_num / num_batches
                elapsed_minutes = (time() - start_time)/60.0
                # Update temperature
                if isinstance(net, networks.CYK):
                    net.inv_temp = (float(epoch) + fraction_done)*100.0 + 1.0 # max(1.0 / pow(2.0, float(epoch) + fraction_done), 0.005)
                print(
                    "Processed "+str(round(fraction_done*100,2))+"% "+
                    "of epoch #"+str(epoch)+
                    " after "+str(round(elapsed_minutes))+" mins"+
                    (", inv. temp. "+str(net.inv_temp) if isinstance(net, networks.CYK) else "")
                )

            batches_seen += 1
        if early_stop:
            break
        epoch += 1
    print("Training "+str(model_name)+" finished.")
Esempio n. 33
0
    def step(self, instances):
        dy.renew_cg()

        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        W1_att_f = dy.parameter(self.W1_att_f)
        W1_att_e = dy.parameter(self.W1_att_e)
        w2_att = dy.parameter(self.w2_att)

        #instances : a list [(src0,tgt0),(src1,tgt1),(src2,tgt2)]
        maxLen = max(map(lambda x: len(x[1]), instances))
        src_sents = []
        src_sents_rev = []
        tgt_sents = []
        srcSenLen = len(
            instances[0][0]) + 2  #src length (+2 for start/end symbols); assumed equal for all instances
        tgtSenLen = maxLen + 1
        masks = [
            [] for i in range(tgtSenLen)
        ]  #mask for each position. each item in this list is a list with length=batchsize
        num_words = 0

        for item in instances:
            #item[0]:src ; item[1]:tgt
            num_words += (len(item[1]) + 1)
            padNum = maxLen - len(item[1])
            for i in range(len(item[1]) + 1):
                masks[i].append(1)
            for i in range(len(item[1]) + 1, tgtSenLen):
                masks[i].append(0)
            thisSrc = [startSymbol] + item[0] + [endSymbol]
            src_sents.append(thisSrc)
            src_sents_rev.append(list(reversed(thisSrc)))
            thisTgt = item[1] + [endSymbol for i in range(padNum + 1)]
            tgt_sents.append(thisTgt)

        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for i in range(srcSenLen):
            batchSrc = dy.lookup_batch(
                self.src_lookup,
                [self.src_token_to_id[x[i]] for x in src_sents])
            batchSrc_rev = dy.lookup_batch(
                self.src_lookup,
                [self.src_token_to_id[x[i]] for x in src_sents_rev])
            l2r_state = l2r_state.add_input(batchSrc)
            r2l_state = r2l_state.add_input(batchSrc_rev)
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())

        r2l_contexts.reverse()

        # Combine the left and right representations for every word
        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)

        losses = []

        # Decoder
        c_t = dy.vecInput(self.hidden_size * 2)
        start = dy.concatenate([
            dy.lookup_batch(self.tgt_lookup,
                            [self.tgt_token_to_id['<S>'] for i in tgt_sents]),
            c_t
        ])
        dec_state = self.dec_builder.initial_state().add_input(start)
        loss = dy.pickneglogsoftmax_batch(
            W_y * dec_state.output() + b_y,
            [self.tgt_token_to_id[tgt_sent[0]] for tgt_sent in tgt_sents])
        losses.append(loss)

        for i in range(tgtSenLen - 1):
            #cw : item[i] nw:item[i+1]
            h_e = dec_state.output()
            c_t = self.__attention_mlp(h_fs_matrix, h_e)[0]
            # Get the embedding for the current target word
            embed_t = dy.lookup_batch(
                self.tgt_lookup,
                [self.tgt_token_to_id[tgt_sent[i]] for tgt_sent in tgt_sents])
            # Create input vector to the decoder
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            loss = dy.pickneglogsoftmax_batch(W_y * dec_state.output() + b_y, [
                self.tgt_token_to_id[tgt_sent[i + 1]] for tgt_sent in tgt_sents
            ])
            thisMask = dy.inputVector(masks[i + 1])
            thisMask = dy.reshape(thisMask, (1, ), len(instances))
            losses.append(loss * thisMask)

        return dy.sum_batches(dy.esum(losses)), num_words
    def __step_batch(self, batch):
        dy.renew_cg()

        W_s = dy.parameter(self.W_s)
        b_s = dy.parameter(self.b_s)
        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        W_m = dy.parameter(self.W_m)
        b_m = dy.parameter(self.b_m)
        W1_att_f = dy.parameter(self.W1_att_f)
        w2_att = dy.parameter(self.w2_att)

        src_batch = [x[0] for x in batch]
        tgt_batch = [x[1] for x in batch]
        batch_size = len(src_batch)

        attended_batch = []
        for src_sent in src_batch:
            attended = []
            c_t_sense = dy.vecInput(self.embed_size)
            sense_start = dy.concatenate([
                self.lookup_frozen(self.src_lookup,
                                   self.src_token_to_id['<S>'][0]),
                dy.tanh(c_t_sense)
            ])
            sense_state = self.sense_builder.initial_state().add_input(
                sense_start)

            for cw in src_sent:
                cw_sense_ids = self.src_token_to_id[cw]
                cw_senses = [
                    self.lookup_frozen(self.src_lookup, sense_id)
                    for sense_id in cw_sense_ids
                ]
                h_senses = dy.concatenate_cols(cw_senses)
                h_m = sense_state.output()
                c_t_sense = self.__sense_attention_mlp(h_senses, h_m)
                sense_state = sense_state.add_input(
                    dy.concatenate([c_t_sense, dy.tanh(c_t_sense)]))
                attended.append(c_t_sense)

            attended_batch.append(attended)
        attended_batch_rev = [list(reversed(sent)) for sent in attended_batch]

        # Encoder
        src_cws_l2r = []
        src_cws_r2l = []
        src_len = [len(sent) for sent in attended_batch]
        max_src_len = np.max(src_len)

        for i in range(max_src_len):
            src_cws_l2r.append([sent[i] for sent in attended_batch])
            src_cws_r2l.append([sent[i] for sent in attended_batch_rev])

        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for i, (cws_l2r, cws_r2l) in enumerate(zip(src_cws_l2r, src_cws_r2l)):
            l2r_batch = dy.reshape(dy.concatenate_cols(cws_l2r),
                                   (self.embed_size, ),
                                   batch_size=batch_size)
            l2r_state = l2r_state.add_input(l2r_batch)
            r2l_batch = dy.reshape(dy.concatenate_cols(cws_r2l),
                                   (self.embed_size, ),
                                   batch_size=batch_size)
            r2l_state = r2l_state.add_input(r2l_batch)
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())
        r2l_contexts.reverse()

        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)
        fixed_attentional_component = W1_att_f * h_fs_matrix

        losses = []
        num_words = 0

        # Decoder
        tgt_cws = []
        tgt_len = [len(sent) for sent in tgt_batch]
        max_tgt_len = np.max(tgt_len)
        masks = []

        for i in range(max_tgt_len):
            tgt_cws.append([
                self.tgt_token_to_id[sent[i]]
                if len(sent) > i else self.tgt_token_to_id['</S>']
                for sent in tgt_batch
            ])
            mask = [(1 if len(sent) > i else 0) for sent in tgt_batch]
            masks.append(mask)
            num_words += sum(mask)

        c_t = dy.vecInput(self.hidden_size * 2)
        start_state = dy.affine_transform([b_s, W_s, h_fs[-1]])
        dec_state = self.word_dec_builder.initial_state().set_s(
            [start_state, dy.tanh(start_state)])
        for i, (cws, nws, mask) in enumerate(zip(tgt_cws, tgt_cws[1:], masks)):
            embed_t = dy.lookup_batch(self.tgt_lookup, cws)
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            h_e = dec_state.output()
            c_t = self.__word_attention_mlp(h_fs_matrix, h_e,
                                            fixed_attentional_component)
            m_t = dy.tanh(
                dy.affine_transform([b_m, W_m,
                                     dy.concatenate([h_e, c_t])]))
            y_star = dy.affine_transform([b_y, W_y, m_t])
            loss = dy.pickneglogsoftmax_batch(y_star, nws)
            mask_expr = dy.inputVector(mask)
            mask_expr = dy.reshape(mask_expr, (1, ), len(batch))
            mask_loss = loss * mask_expr
            losses.append(mask_loss)

        return dy.sum_batches(dy.esum(losses)), num_words
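One common way to implement the batched attention MLP that __word_attention_mlp presumably computes above, reusing the fixed component W1_att_f * h_fs_matrix across decoder steps; the helper name, toy dimensions, and random inputs here are assumptions, not taken from the original repository.

import numpy as np
import dynet as dy

model = dy.ParameterCollection()
att_hid, enc_dim, dec_dim, src_len, batch = 16, 8, 6, 5, 3
W1_att_f = model.add_parameters((att_hid, enc_dim))
W1_att_e = model.add_parameters((att_hid, dec_dim))
w2_att = model.add_parameters((1, att_hid))

def word_attention(h_fs_matrix, h_e, fixed_component):
    # fixed_component = W1_att_f * h_fs_matrix, computed once per batch
    hidden = dy.tanh(dy.colwise_add(fixed_component, dy.parameter(W1_att_e) * h_e))
    scores = dy.transpose(dy.parameter(w2_att) * hidden)    # ((src_len, 1), batch)
    alignment = dy.softmax(scores)                          # attention weights over source positions
    return h_fs_matrix * alignment                          # context vector ((enc_dim, 1), batch)

dy.renew_cg()
h_fs_matrix = dy.inputTensor(np.random.rand(enc_dim, src_len, batch), batched=True)
h_e = dy.inputTensor(np.random.rand(dec_dim, batch), batched=True)
fixed = dy.parameter(W1_att_f) * h_fs_matrix
print(word_attention(h_fs_matrix, h_e, fixed).dim())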
Esempio n. 35
0
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    #get the outputs of the first LSTM
    src_outputs = [
        dy.concatenate([x.output(), y.output()])
        for x, y in LSTM_SRC.add_inputs(
            [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])
    ]
    src_output = src_outputs[-1]

    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append(
            [sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s(
        [src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the previous target word's embedding into the decoder
        current_state = current_state.add_input(
            dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding,
                                       fixed_attentional_component)
        middle_expr = dy.tanh(
            dy.affine_transform(
                [b_m, W_m,
                 dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1, ), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
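The (loss, num_words) pair returned by calc_loss is typically turned into a per-word perplexity over the corpus; a tiny self-contained sketch with made-up per-batch numbers:

import math

total_loss, total_words = 0.0, 0
for batch_loss, batch_words in [(210.4, 96), (188.7, 90), (175.2, 88)]:  # assumed values
    total_loss += batch_loss      # dy.sum_batches(...) gives the summed NLL over the batch
    total_words += batch_words    # number of unmasked target words in the batch
print("per-word perplexity: %.2f" % math.exp(total_loss / total_words))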
Esempio n. 36
0
    def __call__(self,
                 inputs,
                 masks,
                 truth,
                 iters,
                 is_train=True,
                 is_tree=True):
        sent_len = len(inputs)
        batch_size = inputs[0].dim()[1]
        flat_len = sent_len * batch_size

        print('=== entering __call__ ===')
        print('input length: ', inputs.__len__())  # input length:  46
        print('input dim: ', inputs[1].dim())  # input dim:  ((400,), 2)
        print('sent_len', sent_len)  # sent_len 46
        print('batch_size', batch_size)  # batch_size 2
        print('flat_len', flat_len)  # flat_len 92

        # H -> hidden size, L -> sentence length, B -> batch size
        # ((H, L), B)
        X = dy.concatenate_cols(inputs)
        print('X dim: ', X.dim())  # X dim:  ((400, 46), 2)
        if is_train:
            X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)

        # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
        # ((A_H, L), B)
        head_arc = self.head_arc_MLP(X, is_train)
        dept_arc = self.dept_arc_MLP(X, is_train)
        print('head_arc dim: ', head_arc.dim())
        print('dept_arc dim: ', dept_arc.dim())
        # head_arc dim:  ((300, 46), 2)
        # dept_arc dim:  ((300, 46), 2)

        # ((R_H, L), B)
        head_rel = self.head_rel_MLP(X, is_train)
        dept_rel = self.dept_rel_MLP(X, is_train)
        print('head_rel dim: ', head_rel.dim())
        print('dept_rel dim: ', dept_rel.dim())
        # head_rel dim:  ((100, 46), 2)
        # dept_rel dim:  ((100, 46), 2)

        if is_train:
            total_token = sum(masks['flat'].tolist())
            head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
            head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
            dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
            dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)

        # ((L, L), B)

        masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))

        masks_flat = dy.inputTensor(masks['flat'], True)

        gnn_losses = []
        arc_norm = math.sqrt(self.arc_size)
        rel_norm = math.sqrt(self.rel_size)
        for k in range(self.cfg.GRAPH_LAYERS):
            print('----layer-----', k)
            # Graph Weights
            # ((L, L), B)
            arc_mat = self.arc_attn_mat[k](head_arc,
                                           dept_arc) / arc_norm - masks_2D
            arc_prob = dy.softmax(arc_mat)

            # arc_mat dim:  ((46, 46), 2)
            # arc_prob dim:  ((46, 46), 2)

            # Layer-wise Loss
            if is_train:
                arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
                # ((L,), L*B)
                arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
                # ((1,), L*B)
                print('arc_mat val', arc_mat.value())
                print('arc_mat dim', arc_mat.dim())
                print("truth['head'] value", truth['head'])
                print("truth['head'] length", len(truth['head']))

                arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
                print('arc_loss', arc_loss.value())
                print('arc_loss', arc_loss.dim())

                # (1,)

                arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
                print('arc_loss', arc_loss.value())
                print('arc_loss', arc_loss.dim())

                # keep the expression (not its float value): the layer-wise losses
                # are rescaled and summed with dy.esum further below
                gnn_losses.append(arc_loss)

            # Aggregation Function
            # Fusion head and dept representation
            # ((A_H, L), B)
            HX = head_arc * arc_prob
            DX = dept_arc * dy.transpose(arc_prob)
            FX = HX + DX

            print('HX dim: ', HX.dim())
            print('DX dim: ', DX.dim())
            print('FX dim: ', FX.dim())
            # HX dim:  ((300, 46), 2)
            # DX dim:  ((300, 46), 2)
            # FX dim:  ((300, 46), 2)

            # Async Update Function
            # Head-first
            # ((A_H, L), B)
            head_arc = self.head_gnn(FX, head_arc)
            FX_new = head_arc * arc_prob + DX
            dept_arc = self.dept_gnn(FX_new, dept_arc)

            print('head_arc dim: ', head_arc.dim())
            print('FX_new dim: ', FX_new.dim())
            print('dept_arc dim: ', dept_arc.dim())
            # head_arc dim:  ((300, 46), 2)
            # FX_new dim:  ((300, 46), 2)
            # dept_arc dim:  ((300, 46), 2)

            # Relation Aggregation Function
            # Sync update
            # ((R_H, L), B)
            HR = head_rel * arc_prob
            DR = dept_rel * dy.transpose(arc_prob)
            FX = HR + DR
            head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
            dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel

            print('HR dim: ', HR.dim())
            print('DR dim: ', DR.dim())
            print('FX dim: ', FX.dim())
            # HR dim:  ((100, 46), 2)
            # DR dim:  ((100, 46), 2)
            # FX dim:  ((100, 46), 2)

            print('head_rel dim: ', head_rel.dim())
            print('dept_rel dim: ', dept_rel.dim())
            # head_rel dim:  ((100, 46), 2)
            # dept_rel dim:  ((100, 46), 2)

        # ((L, L), B)
        arc_mat = self.arc_attn_mat[-1](head_arc,
                                        dept_arc) / arc_norm - masks_2D
        # ((L,), L*B)
        arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
        # Predict Relation
        # (R_H, L*B)
        head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
        # ((R_H,), L*B)
        dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)

        print('arc_mat dim: ', arc_mat.dim())
        print('head_rel dim: ', head_rel.dim())
        print('dept_rel dim: ', dept_rel.dim())
        # arc_mat dim:  ((46,), 92)
        # head_rel dim:  ((100, 92), 1)
        # dept_rel dim:  ((100,), 92)

        if is_train:
            # ((1,), L*B)
            arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
            # ((R_H,), L*B)
            truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
            # R -> Relation Set Size
            # ((R,), L*B)
            rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, truth_rel) / rel_norm - rel_mask
            # Calculate Relation Classification Loss
            # ((1,), L*B)
            rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
            # (1,)
            rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
            # Final Total Loss with Layer-wise
            warm = [int(iters >= x) for x in self.warm_list]
            losses = (rel_loss + arc_loss) * self.cfg.LAMBDA2 * warm[-1]
            if gnn_losses:
                for i in range(self.cfg.GRAPH_LAYERS):
                    gnn_losses[i] *= warm[i]
                losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
            losses_list = gnn_losses + [arc_loss, rel_loss]
            return losses, losses_list
        else:
            if is_tree:
                # MST Inference, Achieve Tree Edge.
                arc_probs = dy.softmax(arc_mat).npvalue()
                arc_probs = np.reshape(arc_probs,
                                       (sent_len, sent_len, batch_size), 'F')
                arc_probs = np.transpose(arc_probs)
                # Mask PAD
                arc_masks = [
                    np.array(masks['flat'][i:i + sent_len])
                    for i in range(0, flat_len, sent_len)
                ]
                arc_pred = []
                # Inference One By One.
                for msk, arc_prob in zip(arc_masks, arc_probs):
                    msk[0] = 1
                    seq_len = int(np.sum(msk))
                    tmp_pred = MST_inference(arc_prob, seq_len, msk)
                    tmp_pred[0] = 0
                    arc_pred.extend(tmp_pred)
            else:
                # Greedy Inference (argmax)
                arc_pred = np.argmax(arc_mat.npvalue(), 0)
            # Pick Predicted Edge's <Head, Dept> pair.
            flat_pred = [
                j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
            ]
            pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
            # Predict Relation (mask ROOT)
            rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
            rel_mat = self.rel_attn(dept_rel, pred_rel) / rel_norm - rel_mask
            rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
            rel_pred = np.argmax(rel_mat, 0)
            pred = {}
            pred['head'], pred['rel'] = arc_pred, rel_pred
            return pred
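A minimal sketch (toy sizes, random scores) of the flattening used throughout this example: an ((L, L), B) batch of arc-score matrices is reshaped to ((L,), L*B) so that dy.pickneglogsoftmax_batch can score one gold head per token in a single call, with padded tokens zeroed out by a flat mask.

import numpy as np
import dynet as dy

dy.renew_cg()
sent_len, batch_size = 4, 2
flat_len = sent_len * batch_size
scores = dy.inputTensor(np.random.rand(sent_len, sent_len, batch_size), batched=True)
flat_scores = dy.reshape(scores, (sent_len,), flat_len)             # ((4,), 8)
gold_heads = [0, 2, 0, 1, 0, 3, 3, 2]                               # one head index per token
arc_losses = dy.pickneglogsoftmax_batch(flat_scores, gold_heads)    # ((1,), 8)
masks_flat = dy.inputTensor(np.array([0., 1., 1., 1., 0., 1., 1., 1.]), batched=True)
total_token = 6.0                                                   # number of unmasked tokens
arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
print(arc_loss.value())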
Esempio n. 37
0
    def step_batch(self, batch, lang):
        dy.renew_cg()

        W_y = dy.parameter(self.W_y[lang])
        b_y = dy.parameter(self.b_y[lang])
        W1_att_e = dy.parameter(self.W1_att_e)
        W1_att_f = dy.parameter(self.W1_att_f)
        w2_att = dy.parameter(self.w2_att)

        M_s = self.src_lookup
        M_t = self.tgt_lookup[lang]
        src_sent, tgt_sent = zip(*batch)
        src_sent = list(zip(*src_sent))
        tgt_sent = list(zip(*tgt_sent))
        src_sent_rev = list(reversed(src_sent))

        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()

        l2r_contexts = []
        r2l_contexts = []
        for (cw_l2r, cw_r2l) in zip(src_sent, src_sent_rev):
            l2r_state = l2r_state.add_input(dy.lookup_batch(M_s, cw_l2r))
            r2l_state = r2l_state.add_input(dy.lookup_batch(M_s, cw_r2l))
            l2r_contexts.append(l2r_state.output())  # [<S>, x_1, x_2, ..., </S>]
            r2l_contexts.append(r2l_state.output())  # [</S> x_n, x_{n-1}, ... <S>]

        # encoded_h1 = l2r_state.output()
        # tem1 = encoded_h1.npvalue()

        r2l_contexts.reverse()  # [<S>, x_1, x_2, ..., </S>]

        # Combine the left and right representations for every word
        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))

        encoded_h = h_fs[-1]

        h_fs_matrix = dy.concatenate_cols(h_fs)
        # h_fs_matrix_t = dy.transpose(h_fs_matrix)

        losses = []
        num_words = 0

        # Decoder
        c_t = dy.vecInput(self.hidden_size * 2)
        c_t.set([0 for i in range(self.contextsize)])
        encoded_h = dy.concatenate([encoded_h])
        dec_state = self.dec_builder[lang].initial_state([encoded_h])
        for (cw, nw) in zip(tgt_sent[0:-1], tgt_sent[1:]):
            embed = dy.lookup_batch(M_t, cw)
            dec_state = dec_state.add_input(dy.concatenate([embed, c_t]))
            h_e = dec_state.output()
            #calculate attention
            '''
            a_t = h_fs_matrix_t * h_e
            alignment = dy.softmax(a_t)
            c_t = h_fs_matrix * alignment'''
            c_t = self.__attention_mlp_batch(h_fs_matrix, h_e, W1_att_e, W1_att_f, w2_att)
            ind_tem = dy.concatenate([h_e, c_t])
            ind_tem1 = W_y * ind_tem
            ind_tem2 = ind_tem1 + b_y
            loss = dy.pickneglogsoftmax_batch(ind_tem2, nw)  # to modify
            losses.append(loss)
            num_words += 1
        return dy.sum_batches(dy.esum(losses)), num_words
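A compact sketch (toy vocabulary, dimensions, and a pre-padded batch, all assumed) of the bidirectional encoding pattern shared by these seq2seq examples: one LSTM reads the batched source left-to-right, another right-to-left, and the two hidden states are concatenated per position.

import dynet as dy

model = dy.ParameterCollection()
vocab, embed, hid = 30, 8, 12
src_lookup = model.add_lookup_parameters((vocab, embed))
l2r = dy.LSTMBuilder(1, embed, hid, model)
r2l = dy.LSTMBuilder(1, embed, hid, model)

dy.renew_cg()
src_sents = [[2, 5, 7, 3], [2, 6, 8, 3]]                  # already padded to equal length
cols = list(zip(*src_sents))                              # one tuple of word ids per position
l2r_state, r2l_state = l2r.initial_state(), r2l.initial_state()
l2r_ctx, r2l_ctx = [], []
for fwd_ids, bwd_ids in zip(cols, reversed(cols)):
    l2r_state = l2r_state.add_input(dy.lookup_batch(src_lookup, list(fwd_ids)))
    r2l_state = r2l_state.add_input(dy.lookup_batch(src_lookup, list(bwd_ids)))
    l2r_ctx.append(l2r_state.output())
    r2l_ctx.append(r2l_state.output())
r2l_ctx.reverse()
h_fs = [dy.concatenate([f, b]) for f, b in zip(l2r_ctx, r2l_ctx)]
print(dy.concatenate_cols(h_fs).dim())                    # ((2 * hid, src_len), batch)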