Example #1
 def score_sentence(self, score_vecs, tags):
     assert len(score_vecs) == len(tags)
     tags.insert(0, START_TAG)  # prepend the start tag (note: mutates the caller's list)
     total = dynet.scalarInput(0.0)
     for i, obs in enumerate(score_vecs):
         # transition to next from i and emission
         next_tag = tags[i + 1]
         total += dynet.pick(self.trans_mat[next_tag], tags[i]) + dynet.pick(obs, next_tag)
     total += dynet.pick(self.trans_mat[END_TAG], tags[-1])
     return total
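For readers skimming these examples: dynet.pick selects one element of a vector expression while staying inside the computation graph, so gradients flow through the chosen entry. A minimal self-contained sketch with toy values (nothing here comes from the example above):

    import dynet
    dynet.renew_cg()
    v = dynet.inputVector([0.1, 0.7, 0.2])
    picked = dynet.pick(v, 1)   # scalar expression holding v[1]
    print(picked.value())       # 0.7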
Example #2
    def forward(self, observations):
        # calculate forward pass
        def log_sum_exp(scores):
            npval = scores.npvalue()
            argmax_score = np.argmax(npval)
            max_score_expr = dynet.pick(scores, argmax_score)
            max_score_expr_broadcast = dynet.concatenate([max_score_expr] *
                                                         self.num_tags)
            return max_score_expr + dynet.logsumexp_dim(
                (scores - max_score_expr_broadcast), 0)

        init_alphas = [-1e10] * self.num_tags
        init_alphas[START_TAG] = 0
        for_expr = dynet.inputVector(init_alphas)
        for obs in observations:
            alphas_t = []
            for next_tag in range(self.num_tags):
                obs_broadcast = dynet.concatenate([dynet.pick(obs, next_tag)] *
                                                  self.num_tags)
                next_tag_expr = for_expr + self.trans_mat[
                    next_tag] + obs_broadcast
                alphas_t.append(log_sum_exp(next_tag_expr))
            for_expr = dynet.concatenate(alphas_t)
        terminal_expr = for_expr + self.trans_mat[END_TAG]
        alpha = log_sum_exp(terminal_expr)
        return alpha
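The recursion above is the standard CRF forward algorithm. A pure-NumPy sketch of the same computation, with hypothetical toy sizes and random scores (tag 0 standing in for START_TAG, tag 1 for END_TAG), may make the data flow easier to follow:

    import numpy as np

    def logsumexp(a):
        m = a.max()                  # max-shift for numerical stability
        return m + np.log(np.exp(a - m).sum())

    T, K = 4, 3                      # sequence length, number of tags
    obs = np.random.randn(T, K)      # emission scores
    trans = np.random.randn(K, K)    # trans[j, i]: score of moving from tag i to tag j
    alpha = np.full(K, -1e10)
    alpha[0] = 0.0                   # all mass starts on the start tag
    for t in range(T):
        alpha = np.array([logsumexp(alpha + trans[j] + obs[t, j]) for j in range(K)])
    log_Z = logsumexp(alpha + trans[1])  # final transition into the end tag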
Example #3
    def _get_loss(self, input, targets, epsilon=1e-10):
        layers = self.compute_output_layer(input)

        log_out = dy.log(layers[-1] + epsilon)

        loss = dy.zeros(1)
        for t in targets:
            loss += dy.pick(log_out, t)

        # sample one negative class that is not among the gold targets
        r = np.random.randint(self.dim_out)
        while r in targets:
            r = np.random.randint(self.dim_out)
        loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
        #loss -= dy.pick(log_out, r)

        return -loss
Example #4
 def pick_neg_log(self, pred, gold):
     # TODO make this a static function in both classes
     if not isinstance(gold, int) and not isinstance(gold, np.int64):
         # calculate cross-entropy loss against the whole vector
         dy_gold = dynet.inputVector(gold)
         return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
     return -dynet.log(dynet.pick(pred, gold))
Example #5
 def log_sum_exp(scores):
     # excerpted from a class method: self.num_tags comes from the enclosing scope
     npval = scores.npvalue()
     argmax_score = np.argmax(npval)
     max_score_expr = dynet.pick(scores, argmax_score)
     max_score_expr_broadcast = dynet.concatenate([max_score_expr] *
                                                  self.num_tags)
     return max_score_expr + dynet.logsumexp_dim(
         (scores - max_score_expr_broadcast), 0)
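The max-shift in this helper is the usual trick for evaluating log-sum-exp without overflow. The identity is easy to check in plain NumPy with toy values:

    import numpy as np
    scores = np.array([1.0, 2.0, 3.0])
    m = scores.max()
    print(m + np.log(np.exp(scores - m).sum()))  # stable log-sum-exp
    print(np.logaddexp.reduce(scores))           # same value, for comparison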
Example #6
    def _get_loss_and_prediction(self, input, targets, epsilon=1e-10):
        layers = self.compute_output_layer(input)
        output = layers[-1].value()
        res = {i for i, p in enumerate(output) if p > 0.5}  # indices predicted positive

        log_out = dy.log(layers[-1] + epsilon)

        loss = dy.zeros(1)
        for t in targets:
            loss += dy.pick(log_out, t)

        r = np.random.randint(self.dim_out)
        while r in targets:
            r = np.random.randint(self.dim_out)
        loss += dy.log(1 - dy.pick(layers[-1], r) + epsilon)
        #loss -= dy.pick(log_out, r)

        return -loss, res
Example #7
 def compute_loss_multilabel(self, task, seq, multi_y):
     """
     computes the loss for multi-label instances by summing over the negative log probabilities of all correct labels
     """
     out_probs = self(task, seq)
     losses = []
     for y in multi_y:
         assigned_prob = dn.pick(out_probs, y)
         losses.append(-dn.log(assigned_prob) / len(multi_y))
     return dn.esum(losses)
Example #8
    def viterbi(self, observations, unk_tag=None, dictionary=None):
        #if dictionary:
        #    raise NotImplementedError("type constraints not yet implemented for CRF")
        backpointers = []
        init_vvars = [-1e10] * self.num_tags
        init_vvars[START_TAG] = 0  # <Start> has all the probability
        for_expr = dynet.inputVector(init_vvars)
        trans_exprs = [self.trans_mat[idx] for idx in range(self.num_tags)]
        for obs in observations:
            bptrs_t = []
            vvars_t = []
            for next_tag in range(self.num_tags):
                next_tag_expr = for_expr + trans_exprs[next_tag]
                next_tag_arr = next_tag_expr.npvalue()
                best_tag_id = np.argmax(next_tag_arr)
                if unk_tag:
                    best_tag = self.index2tag[best_tag_id]
                    if best_tag == unk_tag:
                        next_tag_arr[best_tag_id] = -np.inf  # mask out the current best
                        best_tag_id = np.argmax(next_tag_arr)  # fall back to second best

                bptrs_t.append(best_tag_id)
                vvars_t.append(dynet.pick(next_tag_expr, best_tag_id))
            for_expr = dynet.concatenate(vvars_t) + obs
            backpointers.append(bptrs_t)
        # Perform final transition to terminal
        terminal_expr = for_expr + trans_exprs[END_TAG]
        terminal_arr = terminal_expr.npvalue()
        best_tag_id = np.argmax(terminal_arr)
        path_score = dynet.pick(terminal_expr, best_tag_id)
        # Reverse over the backpointers to get the best path
        best_path = [best_tag_id]  # Start with the tag that was best for terminal
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()  # Remove the start symbol
        best_path.reverse()
        assert start == START_TAG
        # Return best path and best path's score
        return best_path, path_score
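The backpointer bookkeeping is easier to see stripped of DyNet expressions. A pure-NumPy sketch of the same Viterbi recursion, with hypothetical toy sizes (tag 0 stands in for START_TAG; the final transition into END_TAG is omitted for brevity):

    import numpy as np
    T, K = 4, 3                              # sequence length, number of tags
    obs = np.random.randn(T, K)              # emission scores
    trans = np.random.randn(K, K)            # trans[j, i]: score of moving from tag i to tag j
    v = np.full(K, -1e10)
    v[0] = 0.0                               # start tag has all the probability
    backpointers = []
    for t in range(T):
        scores = v[None, :] + trans          # scores[j, i] = v[i] + trans[j, i]
        backpointers.append(scores.argmax(axis=1))
        v = scores.max(axis=1) + obs[t]
    best = int(v.argmax())
    path = [best]
    for bp in reversed(backpointers):
        best = int(bp[best])
        path.append(best)
    path.reverse()   # path[0] is the start tag; drop it, as the example does with pop()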
Example #9
    def __call__(self, x_embs):
        x_len = len(x_embs)

        # BiGRU
        hf = dy.concatenate_cols(
            self.fGRUBuilder.initial_state().transduce(x_embs))
        hb = dy.concatenate_cols(self.bGRUBuilder.initial_state().transduce(
            x_embs[::-1])[::-1])
        h = dy.concatenate([hf, hb])

        # Selective Gate
        hb_1 = dy.pick(hb, index=0, dim=1)          # first column: backward state at position 1
        hf_n = dy.pick(hf, index=x_len - 1, dim=1)  # last column: forward state at position n
        s = dy.concatenate([hb_1, hf_n])

        # Selection
        sGate = dy.logistic(dy.colwise_add(self.Ws * h, self.Us * s + self.bs))
        hp = dy.cmult(h, sGate)

        return hp, hb_1
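dy.pick with dim=1, as used for hb_1 and hf_n above, selects a column of a matrix expression rather than a row. A minimal sketch with toy values:

    import dynet as dy
    dy.renew_cg()
    M = dy.inputTensor([[1, 2, 3],
                        [4, 5, 6]])       # 2x3 matrix
    col0 = dy.pick(M, index=0, dim=1)     # first column -> [1, 4]
    print(col0.npvalue())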
Example #10
    def decode_loss(self, src1, src2, tgt):
        src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(
            src1, src2
        )
        _, prev_coverage = self.get_coverage(
            a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1))
        )

        loss = []
        cov_loss = []
        diag_loss = []

        embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
        last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]

        for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
            a_t, c1_t = self.attend(
                src1_mat,
                decoder_state,
                src1_w1dt,
                self.att1_w2,
                self.att1_v,
                prev_coverage,
            )
            if not self.single_source:
                _, c2_t = self.attend(
                    src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None
                )
            else:
                c2_t = dy.vecInput(2 * HIDDEN_DIM)

            x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
            decoder_state = decoder_state.add_input(x_t)

            out_vector = self.dec_w * decoder_state.output() + self.dec_b
            probs = dy.softmax(out_vector)
            probs, _ = self.get_pointergen_probs(
                c1_t, decoder_state, x_t, a_t, probs, src1
            )

            loss.append(-dy.log(dy.pick(probs, char)))
            cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
            cov_loss.append(cov_loss_cur)
            diag_loss.append(self.get_diag_loss(a_t, t))

            last_output_embeddings = embedded_char

        loss = dy.esum(loss)
        cov_loss = dy.esum(cov_loss)
        diag_loss = dy.esum(diag_loss)
        return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss
Example #11
 def pick_neg_log(self, pred, gold):
     if hasattr(gold, "__len__"):
         # calculate cross-entropy loss against the whole vector
         dy_gold = dynet.inputVector(gold)
         return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
     return -dynet.log(dynet.pick(pred, gold))
Example #12
 def pick_neg_log(self, pred, gold):
     if not isinstance(gold, int):
         # calculate cross-entropy loss against the whole vector
         dy_gold = dynet.inputVector(gold)
         return -dynet.sum_elems(dynet.cmult(dy_gold, dynet.log(pred)))
     return -dynet.log(dynet.pick(pred, gold))
Example #13
def do_one_sentence(encoder, decoder, params_encoder, params_decoder, sentence,
                    output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc0 = char_encoder.initial_state()
        sc = sc0
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s0 = encoder.initial_state()
    s = s0
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]
    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s0 = decoder.initial_state(s.s())
    _s = _s0
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])

        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))

        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if vocab_out[prediction] == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
Example #14
 def get_loss_and_prediction(self, input, target, epsilon=1e-10):
     layers = self.compute_output_layer(input)
     return (-dy.log(dy.pick(layers[-1], target) + epsilon),
             np.argmax(layers[-1].value()))
Example #15
 def get_loss(self, input, target, epsilon=1e-10):
     layers = self.compute_output_layer(input)
     return -dy.log(dy.pick(layers[-1], target) + epsilon)
Example #16
def pick_neg_log(pred, gold):
    return -dynet.log(dynet.pick(pred, gold))
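When pred comes from dy.softmax over raw scores, the same loss can be computed in one fused and numerically stabler call with dynet.pickneglogsoftmax; note that it applies the softmax itself, so it takes the logits, not the probabilities. A quick check with toy values:

    import dynet
    dynet.renew_cg()
    logits = dynet.inputVector([1.0, 2.0, 0.5])
    gold = 1
    a = -dynet.log(dynet.pick(dynet.softmax(logits), gold))
    b = dynet.pickneglogsoftmax(logits, gold)
    print(a.value(), b.value())   # equal up to floating-point error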
Example #17
    def __call__(self, x, tm1s=None, test=False):
        if test:
            # Initial states
            s_tm1 = tm1s[0]
            c_tm1 = tm1s[1]
            w_tm1 = x

            # GRU
            s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
                dy.concatenate([w_tm1, c_tm1])).output()

            # Attention
            e_t = dy.pick(
                self.va *
                dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([
                dy.cmult(a_t_i, h_i)
                for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
            ])
            #c_t = self.hp*a_t # memory error?

            # Output
            r_t = dy.concatenate_cols([
                Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
            ])  # Maxout
            m_t = dy.max_dim(r_t, d=1)
            y_t = dy.softmax(self.Wo * m_t)

            return s_t, c_t, y_t

        else:
            w_embs = x
            # Initial states
            s_tm1 = self.s_0
            c_tm1 = self.c_0
            GRU = self.GRUBuilder.initial_state().set_s([s_tm1])

            y = []
            for w_tm1 in w_embs:
                # GRU
                GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
                s_t = GRU.output()

                # Attention
                e_t = dy.pick(
                    self.va * dy.tanh(
                        dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
                a_t = dy.softmax(e_t)
                c_t = dy.esum([
                    dy.cmult(a_t_i, h_i)
                    for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
                ])
                #c_t = self.hp*a_t # memory error?

                # Output
                r_t = dy.concatenate_cols([
                    Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                    for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
                ])  # Maxout
                m_t = dy.max_dim(r_t, d=1)

                y_t = self.Wo * m_t
                y.append(y_t)

                # t -> tm1
                s_tm1 = s_t
                c_tm1 = c_t

            return y
Example #18
def train(builder,
          model,
          model_parameters,
          X_train,
          y_train,
          nepochs,
          alpha=0.01,
          update=True,
          dropout=0.0,
          x_y_vectors=None,
          num_hidden_layers=0):
    """
    Train the LSTM
    :param builder: the LSTM builder
    :param model: LSTM RNN model
    :param model_parameters: the model parameters
    :param X_train: the lstm instances
    :param y_train: the lstm labels
    :param nepochs: number of epochs
    :param alpha: the learning rate (only for SGD)
    :param update: whether to update the lemma embeddings
    :param dropout: dropout probability for all component embeddings
    :param x_y_vectors: the word vectors of x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    """
    trainer = dy.AdamTrainer(model, alpha=alpha)
    minibatch_size = min(MINIBATCH_SIZE, len(y_train))
    nminibatches = int(math.ceil(len(y_train) / minibatch_size))
    previous_loss = 1000

    for epoch in range(nepochs):

        total_loss = 0.0

        epoch_indices = np.random.permutation(len(y_train))

        for minibatch in range(nminibatches):

            path_cache = {}
            batch_indices = epoch_indices[minibatch *
                                          minibatch_size:(minibatch + 1) *
                                          minibatch_size]

            dy.renew_cg()

            loss = dy.esum([
                -dy.log(
                    dy.pick(
                        process_one_instance(
                            builder,
                            model,
                            model_parameters,
                            X_train[batch_indices[i]],
                            path_cache,
                            update,
                            dropout,
                            x_y_vectors=x_y_vectors[batch_indices[i]]
                            if x_y_vectors is not None else None,
                            num_hidden_layers=num_hidden_layers),
                        y_train[batch_indices[i]]))
                for i in range(minibatch_size)
            ])
            total_loss += loss.value()  # forward computation
            loss.backward()
            trainer.update()

        # deprecated http://dynet.readthedocs.io/en/latest/python_ref.html#optimizers GB
        # and requires an argument (would be epoch i guess...)
        # trainer.update_epoch()
        trainer.update()
        total_loss /= len(y_train)
        print('Epoch {}/{} Loss = {}'.format(epoch + 1, nepochs, total_loss))

        # Early stopping
        if math.fabs(previous_loss - total_loss) < LOSS_EPSILON:
            break

        previous_loss = total_loss
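The esum-over-pick pattern in the minibatch loss above is the usual way to batch per-instance losses in DyNet: build one loss expression per instance, sum them with dy.esum, then run a single forward and backward pass. A stripped-down sketch with a hypothetical one-layer model (assumes a recent DyNet where Parameters convert to expressions automatically):

    import dynet as dy
    import numpy as np

    m = dy.ParameterCollection()
    W = m.add_parameters((3, 4))     # toy 3-class model over 4-dim inputs
    trainer = dy.AdamTrainer(m)

    dy.renew_cg()
    xs = [np.random.randn(4) for _ in range(8)]
    golds = [np.random.randint(3) for _ in range(8)]
    losses = [dy.pickneglogsoftmax(W * dy.inputTensor(x), y)
              for x, y in zip(xs, golds)]
    loss = dy.esum(losses)
    loss.value()     # one forward pass for the whole minibatch
    loss.backward()
    trainer.update()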