Example #1
File: model_RL.py Project: xcgfth/TaxoRL
    def selection_by_tree(self, tree, mode, idx=0):
        input_layers, pairs = self._select_by_tree(tree, mode, True)
        if len(pairs) == 0:
            if not self.opt['allow_partial']:
                input_layers, pairs = self._select_by_tree(tree, mode, False)
            else:
                print('early stop! discard {} / {}.'.format(
                    len(tree.V), len(tree.terms)))
                return None, None
        W1_rl = dy.parameter(self.model_parameters['W1_rl'])
        b1_rl = dy.parameter(self.model_parameters['b1_rl'])
        if not self.opt['one_layer']:
            W2_rl = dy.parameter(self.model_parameters['W2_rl'])
            b2_rl = dy.parameter(self.model_parameters['b2_rl'])

        # pr = W2_rl * dy.rectify(W1_rl * dy.concatenate_to_batch(input_layers) + b1_rl) + b2_rl
        # (V x N)x160 160x50 50x60 60x1
        input_layers = dy.concatenate_cols(input_layers)
        input_layers = dy.transpose(input_layers)

        if not self.opt['one_layer']:
            if self.opt['use_history']:
                pr = input_layers * dy.rectify(W2_rl * dy.rectify(
                    W1_rl * self.history[idx].output() + b1_rl) + b2_rl)
            else:
                pr = dy.rectify(input_layers * W2_rl + b2_rl) * W1_rl + b1_rl
        else:
            if self.opt['use_history']:
                pr = input_layers * dy.rectify(
                    W1_rl * self.history[idx].output() + b1_rl)
            else:
                pr = input_layers * W1_rl + b1_rl
        # (#actions, )
        pr = dy.reshape(pr, (len(pairs), ))
        return dy.softmax(pr), pairs
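
A minimal standalone sketch (not from the TaxoRL source) of the pattern above: dy.softmax turns unnormalized per-action scores into a probability distribution that a policy can sample from.

import dynet as dy
import numpy as np

dy.renew_cg()
scores = dy.inputVector([1.5, 0.2, -0.7])  # one unnormalized score per candidate pair
policy = dy.softmax(scores).npvalue()      # one probability per action, sums to 1
action = int(np.random.choice(len(policy), p=policy / policy.sum()))  # sampled action index
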
Example #2
 def attend(self, input_mat, state, w1dt, w2, v, coverage):
     w2dt = w2 * dy.concatenate(list(state.s()))
     if coverage:
         w1dt = w1dt + self.w_cov * dy.transpose(coverage)
     a_t = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
     a_t = dy.softmax(a_t)
     return a_t, (input_mat * a_t)
Example #3
    def generate(self, num, limit=40, beam=3):
        dy.renew_cg()

        generated = []

        W = dy.parameter(self.W)
        b = dy.parameter(self.b)

        for wordi in range(num):

            # Initialize the LSTM state with EOW token.
            start_state = self.lstm.initial_state()
            start_state = start_state.add_input(self.lookup[self.c2i[EOW]])
            best_states = [('', start_state, 0)]

            final_hypotheses = []

            # Perform beam search.
            while len(final_hypotheses) < beam and len(best_states) > 0:
                new_states = []

                for hyp, s, p in best_states:

                    # Cutoff when we exceed the character limit.
                    if len(hyp) >= limit:
                        final_hypotheses.append((hyp, p))
                        continue

                    # Get the prediction from the current LSTM state.
                    unnormalized = dy.affine_transform([b, W, s.output()])
                    softmax = dy.softmax(unnormalized).npvalue()

                    # Sample beam number of times.
                    for beami in range(beam):
                        ci = sample_softmax(softmax)
                        c = self.i2c[ci]
                        next_p = softmax[ci]
                        logp = p - np.log(next_p)

                        if c == EOW:
                            # Add final hypothesis if we reach end of word.
                            final_hypotheses.append((hyp, logp))
                        else:
                            # Else add to states to search next time step.
                            new_states.append((hyp + c,
                                               s.add_input(self.lookup[ci]),
                                               logp))

                # Sort and prune the states to within the beam.
                new_states.sort(key=lambda t: t[-1])
                best_states = new_states[:beam]

            final_hypotheses.sort(key=lambda t: t[-1])

            generated.append(final_hypotheses[0][0])

        return generated
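
sample_softmax is called above but not defined in this snippet; a sketch consistent with the call site (it takes the numpy output of dy.softmax and returns a sampled index) might look like:

import numpy as np

def sample_softmax(probs):
    # probs: 1-D numpy array from dy.softmax(...).npvalue()
    probs = probs / probs.sum()  # guard against floating-point drift
    return int(np.random.choice(len(probs), p=probs))
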
Example #4
 def compute_output_layer(self, input):
     res = [input]
     for i, p in enumerate(self.parameters):
         W, b = dy.parameter(p[0]), dy.parameter(p[1])
         if i == len(self.parameters) - 1:
             res.append(dy.softmax(W * res[-1] + b))
         else:
             res.append(self.activation(W * res[-1] + b))
     return res
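
A hedged usage sketch for compute_output_layer, assuming self.parameters holds (W, b) pairs created with add_parameters and self.activation is something like dy.tanh; every layer's output is kept, with a softmax only on the last:

import dynet as dy

model = dy.ParameterCollection()
parameters = [(model.add_parameters((8, 4)), model.add_parameters(8)),
              (model.add_parameters((3, 8)), model.add_parameters(3))]

dy.renew_cg()
x = dy.inputVector([0.1, -0.2, 0.3, 0.4])
res = [x]
for i, (pW, pb) in enumerate(parameters):
    W, b = dy.parameter(pW), dy.parameter(pb)
    out = W * res[-1] + b
    res.append(dy.softmax(out) if i == len(parameters) - 1 else dy.tanh(out))
print(res[-1].npvalue())  # three class probabilities from the final softmax layer
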
Example #5
    def __call__(self, x, h_matrix, noprob=False):
        s_t = x
        for i in range(self.layers - 1):
            e_t = self.V[i] * dy.tanh(self.W1[i] * h_matrix + self.W2[i] * s_t)
            a_t = dy.softmax(dy.transpose(e_t))
            c_t = h_matrix * a_t
            s_t = dy.concatenate([x, c_t])

        e_t = self.V[-1] * dy.tanh(self.W1[-1] * h_matrix + self.W2[-1] *
                                   s_t) + self.B1 * h_matrix + self.B2 * s_t

        if len(h_matrix.dim()[0]) > 1:
            e_t = dy.reshape(e_t,
                             (self.V[-1].dim()[0][0] * h_matrix.dim()[0][1], ))
        if not noprob:
            p_t = dy.softmax(e_t)
            return p_t
        else:
            return e_t
Example #6
    def get_top_k_paths(self, all_paths, relation_index, threshold):
        """
        Get the top k scoring paths
        """
        builder = self.builder
        model = self.model
        model_parameters = self.model_parameters
        lemma_lookup = model_parameters['lemma_lookup']
        pos_lookup = model_parameters['pos_lookup']
        dep_lookup = model_parameters['dep_lookup']
        dir_lookup = model_parameters['dir_lookup']

        path_scores = []

        for i, path in enumerate(all_paths):

            if i % 1000 == 0:
                cg = dy.renew_cg()
                W1 = dy.parameter(model_parameters['W1'])
                b1 = dy.parameter(model_parameters['b1'])
                W2 = None
                b2 = None

                if self.num_hidden_layers == 1:
                    W2 = dy.parameter(model_parameters['W2'])
                    b2 = dy.parameter(model_parameters['b2'])

            path_embedding = get_path_embedding(builder, lemma_lookup,
                                                pos_lookup, dep_lookup,
                                                dir_lookup, path)

            if self.use_xy_embeddings:
                zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
                path_embedding = dy.concatenate(
                    [zero_word, path_embedding, zero_word])

            h = W1 * path_embedding + b1

            if self.num_hidden_layers == 1:
                h = W2 * dy.tanh(h) + b2

            path_score = dy.softmax(h).npvalue().T
            path_scores.append(path_score)

        path_scores = np.vstack(path_scores)

        top_paths = []
        for i in range(len(relation_index)):
            indices = np.argsort(-path_scores[:, i])
            top_paths.append([
                (all_paths[index], path_scores[index, i]) for index in indices
                if threshold is None or path_scores[index, i] >= threshold
            ])

        return top_paths
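
The final ranking step above is plain numpy; a standalone mini-example with made-up scores shows the argsort-plus-threshold pattern:

import numpy as np

path_scores = np.array([[0.10, 0.70],
                        [0.60, 0.20],
                        [0.30, 0.90]])   # (num_paths, num_relations), hypothetical values
threshold = 0.25
i = 1                                    # relation column being ranked
indices = np.argsort(-path_scores[:, i])            # path indices, descending by score
top = [(int(idx), path_scores[idx, i]) for idx in indices
       if path_scores[idx, i] >= threshold]         # [(2, 0.9), (0, 0.7)]
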
Example #7
 def __call__(self, sent, n, caches):
     caches = self._restart_caches(sent, caches)
     # s: list(len==steps) of {(n_s,), batch_size}, n: {(n_h,), batch_size}
     wn_t = dy.reshape(n, (1, self.n_h), batch_size=bs(n))
     att_e = dy.reshape(wn_t * caches["V"], (BK.dims(caches["V"])[1], ),
                        batch_size=bs(n))
     att_alpha = dy.softmax(att_e)
     ctx = caches["S"] * att_alpha
     # append and return
     caches["ctx"] = ctx
     caches["att"] = att_alpha
     return caches
Example #8
    def decode_loss(self, src1, src2, tgt):
        src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(
            src1, src2
        )
        _, prev_coverage = self.get_coverage(
            a_t=dy.vecInput(len(src1)), prev_coverage=dy.vecInput(len(src1))
        )

        loss = []
        cov_loss = []
        diag_loss = []

        embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
        last_output_embeddings = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]

        for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
            a_t, c1_t = self.attend(
                src1_mat,
                decoder_state,
                src1_w1dt,
                self.att1_w2,
                self.att1_v,
                prev_coverage,
            )
            if not self.single_source:
                _, c2_t = self.attend(
                    src2_mat, decoder_state, src2_w1dt, self.att2_w2, self.att2_v, None
                )
            else:
                c2_t = dy.vecInput(2 * HIDDEN_DIM)

            x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
            decoder_state = decoder_state.add_input(x_t)

            out_vector = self.dec_w * decoder_state.output() + self.dec_b
            probs = dy.softmax(out_vector)
            probs, _ = self.get_pointergen_probs(
                c1_t, decoder_state, x_t, a_t, probs, src1
            )

            loss.append(-dy.log(dy.pick(probs, char)))
            cov_loss_cur, prev_coverage = self.get_coverage(a_t, prev_coverage)
            cov_loss.append(cov_loss_cur)
            diag_loss.append(self.get_diag_loss(a_t, t))

            last_output_embeddings = embedded_char

        loss = dy.esum(loss)
        cov_loss = dy.esum(cov_loss)
        diag_loss = dy.esum(diag_loss)
        return loss + COV_LOSS_WEIGHT * cov_loss + DIAG_LOSS_WEIGHT * diag_loss
Example #9
    def calc_attend(self, a_vecs, b_vecs, dropout):
        l_a = a_vecs.dim()[1]
        l_b = b_vecs.dim()[1]

        fa = self.attend.evaluate_network(a_vecs, True, dropout)
        fb = self.attend.evaluate_network(b_vecs, True, dropout)

        e_ij = list()
        for i in range(l_a):
            e_ij.append(list())
            for j in range(l_b):
                e_ij[i].append(
                    dy.dot_product(dy.pick_batch_elem(fa, i),
                                   dy.pick_batch_elem(fb, j)))

        beta_softmaxes = [
            dy.softmax(dy.concatenate(e_ij[i])) for i in range(l_a)
        ]
        # normalize over the a-side (i) for each j, mirroring beta_softmaxes over the b-side
        alpha_softmaxes = [
            dy.softmax(dy.concatenate([e_ij[i][j] for i in range(l_a)]))
            for j in range(l_b)
        ]

        betas = [
            dy.esum([
                dy.pick_batch_elem(b_vecs, j) * beta_softmaxes[i][j]
                for j in range(l_b)
            ]) for i in range(l_a)
        ]
        alphas = [
            dy.esum([
                dy.pick_batch_elem(a_vecs, i) * alpha_softmaxes[j][i]
                for i in range(l_a)
            ]) for j in range(l_b)
        ]
        return alphas, betas
Example #10
 def __call__(self, sent, n, caches):
     # s: list(len==steps) of {(n_s,), batch_size}, n: {(n_h,), batch_size}
     caches = self._restart_caches(sent, caches)
     val_h = self.iparams["h2e"] * n  # {(n_hidden,), batch_size}
     att_hidden_bef = dy.colwise_add(
         caches["V"], val_h)  # {(n_didden, steps), batch_size}
     att_hidden = dy.tanh(att_hidden_bef)
     # if self.hdrop > 0:     # save some space
     #     att_hidden = dy.dropout(att_hidden, self.hdrop)
     att_e = dy.reshape(self.iparams["v"] * att_hidden,
                        (BK.dims(caches["V"])[1], ),
                        batch_size=bs(att_hidden))
     att_alpha = dy.softmax(att_e)
     ctx = caches["S"] * att_alpha  # {(n_s, sent_len), batch_size}
     # append and return
     caches["ctx"] = ctx
     caches["att"] = att_alpha
     return caches
Example #11
    def attend(self, encoded_inputs, h_t, input_masks=None):
        # encoded_inputs dimension is: seq len x 2*h x batch size, h_t dimension is h x batch size (for bilstm encoder)
        if len(encoded_inputs) == 1:
            # no need to attend if only one input state, compute output directly
            h_output = dn.tanh(self.w_c *
                               dn.concatenate([h_t, encoded_inputs[0]]))
            # return trivial alphas (all 1's since one input gets all attention)
            if input_masks:
                # if batching
                alphas = dn.inputTensor([1] * len(input_masks[0]),
                                        batched=True)
            else:
                alphas = dn.inputTensor([1], batched=True)
            return h_output, alphas

        # iterate through input states to compute attention scores
        # scores = [v_a * dn.tanh(w_a * h_t + u_a * h_input) for h_input in blstm_outputs]
        w_a_h_t = self.w_a * h_t
        scores = [
            self.v_a *
            dn.tanh(dn.affine_transform([w_a_h_t, self.u_a, h_input]))
            for h_input in encoded_inputs
        ]

        concatenated = dn.concatenate(scores)
        if input_masks:
            # if batching, multiply attention scores with input masks to zero-out scores for padded inputs
            dn_masks = dn.inputTensor(input_masks, batched=True)
            concatenated = dn.cmult(concatenated, dn_masks)

        # normalize scores
        alphas = dn.softmax(concatenated)

        # compute context vector with weighted sum for each seq in batch
        bo = dn.concatenate_cols(encoded_inputs)
        c = bo * alphas
        # c = dn.esum([h_input * dn.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

        # compute output vector using current decoder state and context vector
        h_output = dn.tanh(self.w_c * dn.concatenate([h_t, c]))

        return h_output, alphas
Example #12
def build_network(params, x_data):
    _, E, b, U, W, bp = params
    if type(x_data) == dict:
        # print("DICT")
        prefix_ordinals = x_data['prefix']
        suffix_ordinals = x_data['suffix']
        x_ordinals = x_data['fullwords']
    else:
        prefix_ordinals = None
        suffix_ordinals = None
        x_ordinals = x_data
    x = dy.concatenate([E[ord] for ord in x_ordinals])
    if prefix_ordinals:
        x_pre = dy.concatenate([E[ord] for ord in prefix_ordinals])
        x = x + x_pre
    if suffix_ordinals:
        x_suf = dy.concatenate([E[ord] for ord in suffix_ordinals])
        x = x + x_suf
    output = dy.softmax(U * (dy.tanh(W * x + b)) + bp)
    return output
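
A sketch of how build_network might be driven; the shapes are assumptions, not taken from the source. E is a lookup table, and x_data is either a flat list of word ids or a dict with 'prefix', 'suffix' and 'fullwords' id lists of equal length:

import dynet as dy

m = dy.ParameterCollection()
E = m.add_lookup_parameters((100, 16))   # vocab size x embedding dim (assumed)
W = m.add_parameters((32, 16 * 3))       # expects a window of 3 concatenated embeddings
b = m.add_parameters(32)
U = m.add_parameters((5, 32))
bp = m.add_parameters(5)
params = (None, E, b, U, W, bp)          # first slot is unused by build_network

dy.renew_cg()
probs = build_network(params, [3, 17, 42])                  # plain word-id input
probs = build_network(params, {'prefix': [1, 2, 3],
                               'suffix': [4, 5, 6],
                               'fullwords': [3, 17, 42]})   # dict input path
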
Example #13
def generator(encoder, decoder, params_encoder, params_decoder, sentence, env,
              first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc0 = char_encoder.initial_state()
        sc = sc0
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s0 = encoder.initial_state()
    s = s0
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split()
    ]
    s_vector = []
    generate = []
    for word in (sentence):
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s0 = decoder.initial_state(s.s())
    _s = _s0
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    repeat = 0
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        repeat += 1
        word = vocab_out.index(input_word)
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))
        encode_state = dy_sc_vector * weight_char
        encode_output = dy_s_vector * weight
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        top = 0
        while True:
            top += 1
            if top == 50:
                top = 1
                break
            prediction = np.argsort(probs.vec_value())[-top]
            if (vocab_out[prediction] == '<end>'): break
            if (vocab_out[prediction] == '<start>'): continue
            new_env = str(execute(env, [vocab_out[prediction]]))
            if new_env == 'None': continue
            break
        prediction = np.argsort(probs.vec_value())[-top]
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            break
        if repeat >= 10:
            break
        generate.append(input_word)
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return generate, previous
Example #14
    def __call__(self, x, tm1s=None, test=False):
        if test:
            # Initial states
            s_tm1 = tm1s[0]
            c_tm1 = tm1s[1]
            w_tm1 = x

            # GRU
            s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
                dy.concatenate([w_tm1, c_tm1])).output()

            # Attention
            e_t = dy.pick(
                self.va *
                dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([
                dy.cmult(a_t_i, h_i)
                for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
            ])
            #c_t = self.hp*a_t # memory error?

            # Output
            r_t = dy.concatenate_cols([
                Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
            ])  # Maxout
            m_t = dy.max_dim(r_t, d=1)
            y_t = dy.softmax(self.Wo * m_t)

            return s_t, c_t, y_t

        else:
            w_embs = x
            # Initial states
            s_tm1 = self.s_0
            c_tm1 = self.c_0
            GRU = self.GRUBuilder.initial_state().set_s([s_tm1])

            y = []
            for w_tm1 in w_embs:
                # GRU
                GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
                s_t = GRU.output()

                # Attention
                e_t = dy.pick(
                    self.va * dy.tanh(
                        dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
                a_t = dy.softmax(e_t)
                c_t = dy.esum([
                    dy.cmult(a_t_i, h_i)
                    for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
                ])
                #c_t = self.hp*a_t # memory error?

                # Output
                r_t = dy.concatenate_cols([
                    Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                    for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
                ])  # Maxout
                m_t = dy.max_dim(r_t, d=1)

                y_t = self.Wo * m_t
                y.append(y_t)

                # t -> tm1
                s_tm1 = s_t
                c_tm1 = c_t

            return y
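
The maxout readout in this example takes an elementwise max across candidate columns; in miniature:

import dynet as dy

dy.renew_cg()
cols = [dy.inputVector([1.0, -2.0]), dy.inputVector([0.5, 3.0])]
r_t = dy.concatenate_cols(cols)   # (2, 2): one column per maxout piece
m_t = dy.max_dim(r_t, d=1)        # (2,): elementwise max over columns -> [1.0, 3.0]
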
Example #15
    def generate_beam(self, src1, src2):
        src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = self.encoder_forward(
            src1, src2
        )

        hypothesis_list = [
            Hypothesis(
                text_list=[self.tgt_vocab.str2int(EOS)],
                decoder_state=decoder_state,
                c1_t=dy.vecInput(2 * HIDDEN_DIM),
                c2_t=dy.vecInput(2 * HIDDEN_DIM),
                prev_coverage=self.get_coverage(
                    a_t=dy.vecInput(len(src1)),
                    training=False,
                    prev_coverage=dy.vecInput(len(src1)),
                ),
                score=0.0,
                p_gens=[],
            )
        ]
        completed_list = []

        for t in range(int(len(src1) * 1.1)):
            new_hyp_list = []
            new_hyp_scores = []
            for hyp in hypothesis_list:
                last_output_embeddings = self.tgt_lookup[hyp.text_list[-1]]

                a_t, c1_t = self.attend(
                    src1_mat,
                    hyp.decoder_state,
                    src1_w1dt,
                    self.att1_w2,
                    self.att1_v,
                    hyp.prev_coverage,
                )
                if not self.single_source:
                    _, c2_t = self.attend(
                        src2_mat,
                        hyp.decoder_state,
                        src2_w1dt,
                        self.att2_w2,
                        self.att2_v,
                        None,
                    )
                else:
                    c2_t = dy.vecInput(2 * HIDDEN_DIM)

                x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
                decoder_state = hyp.decoder_state.add_input(x_t)

                probs = dy.softmax(self.dec_w * decoder_state.output() + self.dec_b)
                probs, cur_p_gen = self.get_pointergen_probs(
                    c1_t, decoder_state, x_t, a_t, probs, src1
                )
                probs = probs.npvalue()

                for ind in range(len(probs)):
                    text_list = hyp.text_list + [ind]
                    p_gens = hyp.p_gens + [cur_p_gen]
                    score = (hyp.score + math.log(probs[ind])) / (len(text_list) ** 0.0)  # length-norm exponent 0.0: penalty disabled
                    coverage = self.get_coverage(a_t, hyp.prev_coverage, training=False)
                    new_hyp_list.append(
                        Hypothesis(
                            text_list=text_list,
                            decoder_state=decoder_state,
                            c1_t=c1_t,
                            c2_t=c2_t,
                            prev_coverage=coverage,
                            score=score,
                            p_gens=p_gens,
                        )
                    )
                    new_hyp_scores.append(score)

            top_inds = np.argpartition(np.array(new_hyp_scores), -self.beam_size)[
                -self.beam_size :
            ]
            new_hyp_list = np.array(new_hyp_list)[top_inds]

            hypothesis_list = []

            for new_hyp in new_hyp_list:
                if new_hyp.text_list[-1] == self.tgt_vocab.str2int(EOS) and t > 0:
                    completed_list.append(new_hyp)
                else:
                    hypothesis_list.append(new_hyp)

            if len(completed_list) >= self.beam_size:
                break

        if len(completed_list) == 0:
            hypothesis_list = sorted(hypothesis_list, key=lambda x: x.score, reverse=True)
            completed_list = [hypothesis_list[0]]

        for hyp in completed_list:
            hyp.text_list = [self.tgt_vocab.int2str(i) for i in hyp.text_list]

        top_hyp = sorted(completed_list, key=lambda x: x.score, reverse=True)[0]
        return "".join(top_hyp.text_list).replace(EOS, "").strip(), top_hyp.p_gens[1:-1]
Example #16
 def __call__(self, s_t, h_matrix):
     e_t = self.v * dy.tanh(self.W1*h_matrix + self.W2 * s_t)
     a_t = dy.softmax(dy.transpose(e_t))
     c_t = h_matrix * a_t
     return c_t
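
A self-contained sketch of the same attention pattern with assumed shapes: scores over the encoder columns are softmax-normalized and used to mix h_matrix into a context vector (dy.colwise_add makes the vector-plus-matrix addition explicit):

import dynet as dy

m = dy.ParameterCollection()
H, T, S = 4, 5, 3                              # hidden dim, source length, state dim (assumed)
v = m.add_parameters((1, H))
W1 = m.add_parameters((H, H))
W2 = m.add_parameters((H, S))

dy.renew_cg()
h_matrix = dy.inputTensor([[0.1] * T] * H)     # (H, T): encoder states as columns
s_t = dy.inputVector([0.2] * S)                # current decoder state
e_t = v * dy.tanh(dy.colwise_add(W1 * h_matrix, W2 * s_t))  # (1, T) scores
a_t = dy.softmax(dy.transpose(e_t))            # (T, 1) attention weights
c_t = h_matrix * a_t                           # (H, 1) context vector
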
Example #17
def process_one_instance(builder,
                         model,
                         model_parameters,
                         instance,
                         path_cache,
                         update=True,
                         dropout=0.0,
                         x_y_vectors=None,
                         num_hidden_layers=0):
    """
    Return the LSTM output vector of a single term-pair - the average path embedding
    :param builder: the LSTM builder
    :param model: the LSTM model
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair classification network
    :return: the LSTM output vector of a single term-pair
    """
    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    W2 = None
    b2 = None

    if num_hidden_layers == 1:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])

    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    # Use the LSTM output vector and feed it to the MLP

    # Add the empty path
    paths = instance

    if len(paths) == 0:
        paths[EMPTY_PATH] = 1

    # Compute the averaged path embedding
    num_paths = sum(instance.values())
    path_embeddings = [
        get_path_embedding_from_cache(path_cache, builder, lemma_lookup,
                                      pos_lookup, dep_lookup, dir_lookup, path,
                                      update, dropout) * count
        for path, count in instance.items()
    ]
    input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

    # Concatenate x and y embeddings
    if x_y_vectors is not None:
        x_vector, y_vector = dy.lookup(lemma_lookup,
                                       x_y_vectors[0]), dy.lookup(
                                           lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])

    h = W1 * input_vec + b1

    if num_hidden_layers == 1:
        h = W2 * dy.tanh(h) + b2

    output = dy.softmax(h)

    return output
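
A hedged sketch of consuming the softmax above during training; gold_relation and the trainer are assumptions, and builder, model, model_parameters, instance and path_cache are the same objects the function expects:

import dynet as dy

trainer = dy.SimpleSGDTrainer(model)
dy.renew_cg()
output = process_one_instance(builder, model, model_parameters, instance,
                              path_cache, num_hidden_layers=1)
loss = -dy.log(dy.pick(output, gold_relation))  # gold_relation: assumed gold label id
loss.backward()
trainer.update()
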
Example #18
def do_one_sentence(encoder, decoder, params_encoder, params_decoder, sentence,
                    output, env, first, previous):
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    sc_vector = []
    for i, world in enumerate(_state(env)):
        sc0 = char_encoder.initial_state()
        sc = sc0
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s0 = encoder.initial_state()
    s = s0
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]
    loss = []
    generate = []
    s_vector = []
    for word in (sentence):
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s0 = decoder.initial_state(s.s())
    _s = _s0
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])

        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))

        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if (vocab_out[prediction]) == '<start>':
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
Example #19
    def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                                   output_word_ids, output_masks, batch_size):
        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # initial "input feeding" vectors to feed decoder - 3*h
        init_input_feeding = dn.lookup_batch(self.init_lookup,
                                             [0] * batch_size)

        # initial feedback embeddings for the decoder, use begin seq symbol embedding
        init_feedback = dn.lookup_batch(
            self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)

        # init decoder rnn
        decoder_init = dn.concatenate([init_feedback, init_input_feeding])
        s = s_0.add_input(decoder_init)

        # loss per timestep
        losses = []

        # run the decoder through the output sequences and aggregate loss
        for i, step_word_ids in enumerate(output_word_ids):

            # returns h x batch size matrix
            decoder_rnn_output = s.output()

            # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
            attention_output_vector, alphas = self.attend(
                encoded_inputs, decoder_rnn_output, input_masks)

            # compute output scores (returns vocab_size x batch size matrix)
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])

            # encourage diversity by punishing highly confident predictions
            # TODO: support batching - esp. w.r.t. scalar inputs
            if self.diverse:
                soft = dn.softmax(dn.tanh(h))
                batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                    - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) - dn.log(dn.scalarInput(4))
            else:
                # get batch loss for this timestep
                batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

            # mask the loss if at least one sentence is shorter
            if output_masks and output_masks[i][-1] != 1:
                mask_expr = dn.inputVector(output_masks[i])
                # noinspection PyArgumentList
                mask_expr = dn.reshape(mask_expr, (1, ), batch_size)
                batch_loss = batch_loss * mask_expr

            # input feeding approach - input h (attention_output_vector) to the decoder
            # prepare for the next iteration - "feedback"
            feedback_embeddings = dn.lookup_batch(self.output_lookup,
                                                  step_word_ids)
            decoder_input = dn.concatenate(
                [feedback_embeddings, attention_output_vector])
            s = s.add_input(decoder_input)

            losses.append(batch_loss)

        # sum the loss over the time steps and batch
        total_batch_loss = dn.sum_batches(dn.esum(losses))

        return total_batch_loss
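
The masking above relies on reshaping a per-timestep mask into one scalar per batch element; the same pattern in miniature (values are made up):

import dynet as dy

dy.renew_cg()
batch_loss = dy.inputTensor([0.7, 0.3], batched=True)  # one loss per sequence, batch of 2
mask = dy.inputVector([1.0, 0.0])                      # second sequence already ended
mask = dy.reshape(mask, (1,), batch_size=2)            # one scalar mask per batch element
total = dy.sum_batches(batch_loss * mask)              # padded positions contribute zero
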
Example #20
    def predict_beamsearch(self, encoder, input_seq):
        if len(input_seq) == 0:
            return []

        dn.renew_cg()

        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        alphas_mtx = []

        # encode input sequence
        blstm_outputs, input_masks = encoder.encode_batch([input_seq])

        # complete sequences and their probabilities
        final_states = []

        # initialize the decoder rnn
        s_0 = self.decoder_rnn.initial_state()

        # holds beam step index mapped to (sequence, probability, decoder state, attn_vector) tuples
        beam = {-1: [([common.BEGIN_SEQ], 1.0, s_0, self.init_lookup[0])]}
        i = 0

        # expand another step if didn't reach max length and there's still beams to expand
        #while i < self.max_prediction_len and len(beam[i - 1]) > 0:
        while ((self.max_prediction_len is None) or
               (i < self.max_prediction_len)) and len(beam[i - 1]) > 0:

            # create all expansions from the previous beam:
            new_hypos = []
            for hypothesis in beam[i - 1]:
                prefix_seq, prefix_prob, prefix_decoder, prefix_attn = hypothesis
                last_hypo_symbol = prefix_seq[-1]

                # cant expand finished sequences
                if last_hypo_symbol == common.END_SEQ:
                    continue

                # expand from the last symbol of the hypothesis
                try:
                    prev_output_vec = self.output_lookup[
                        self.y2int[last_hypo_symbol]]
                except KeyError:
                    # not a known symbol
                    print('impossible to expand, key error: ' +
                          str(last_hypo_symbol))
                    continue

                decoder_input = dn.concatenate([prev_output_vec, prefix_attn])
                s = prefix_decoder.add_input(decoder_input)
                decoder_rnn_output = s.output()

                # perform attention step
                attention_output_vector, alphas = self.attend(
                    blstm_outputs, decoder_rnn_output)

                # save attention weights for plotting
                # TODO: add attention weights properly to allow building the attention matrix for the best path
                if self.plot:
                    val = alphas.vec_value()
                    alphas_mtx.append(val)

                # compute output probabilities
                # h = readout * attention_output_vector + bias
                h = dn.affine_transform(
                    [self.bias, self.readout, attention_output_vector])

                # TODO: understand why diverse needs tanh before softmax
                if self.diverse:
                    h = dn.tanh(h)
                probs = dn.softmax(h)
                probs_val = probs.npvalue()

                # TODO: maybe should choose nbest from all expansions and not only from nbest of each hypothesis?
                # find best candidate outputs
                n_best_indices = common.argmax(probs_val, self.beam_size)
                for index in n_best_indices:
                    p = probs_val[index]
                    new_seq = prefix_seq + [self.int2y[index]]
                    new_prob = prefix_prob * p
                    #if new_seq[-1] == common.END_SEQ or i == self.max_prediction_len - 1:
                    if new_seq[-1] == common.END_SEQ or (
                        (self.max_prediction_len is not None) and
                        (i == self.max_prediction_len - 1)):
                        # TODO: add to final states only if fits in k best?
                        # if found a complete sequence or max length - add to final states
                        final_states.append((new_seq[1:-1], new_prob))
                    else:
                        new_hypos.append(
                            (new_seq, new_prob, s, attention_output_vector))

            # add the most probable expansions from all hypotheses to the beam
            new_probs = np.array([p for (s, p, r, a) in new_hypos])
            argmax_indices = common.argmax(new_probs, self.beam_size)
            beam[i] = [new_hypos[l] for l in argmax_indices]
            i += 1

        # get nbest results from final states found in search
        final_probs = np.array([p for (s, p) in final_states])
        argmax_indices = common.argmax(final_probs, self.beam_size)
        nbest_seqs = [final_states[l] for l in argmax_indices]

        return nbest_seqs, alphas_mtx
Example #21
    def predict_greedy(self, encoder, input_seq):
        dn.renew_cg()

        self.readout = dn.parameter(self.params['readout'])
        self.bias = dn.parameter(self.params['bias'])
        self.w_c = dn.parameter(self.params['w_c'])
        self.u_a = dn.parameter(self.params['u_a'])
        self.v_a = dn.parameter(self.params['v_a'])
        self.w_a = dn.parameter(self.params['w_a'])

        alphas_mtx = []

        if len(input_seq) == 0:
            return []

        # encode input sequence
        blstm_outputs, input_masks = encoder.encode_batch([input_seq])

        # initialize the decoder rnn
        s = self.decoder_rnn.initial_state()

        # set prev_output_vec for first lstm step as BEGIN_WORD concatenated with special padding vector
        prev_output_vec = dn.concatenate([
            self.output_lookup[self.y2int[common.BEGIN_SEQ]],
            self.init_lookup[0]
        ])
        predicted_sequence = []
        i = 0

        # run the decoder through the sequence and predict output symbols
        while (self.max_prediction_len is None) or (i <
                                                    self.max_prediction_len):

            # get current h of the decoder
            s = s.add_input(prev_output_vec)
            decoder_rnn_output = s.output()

            # perform attention step
            attention_output_vector, alphas = self.attend(
                blstm_outputs, decoder_rnn_output)

            if self.plot:
                val = alphas.vec_value()
                alphas_mtx.append(val)

            # compute output probabilities
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])

            # TODO: understand why diverse needs tanh before softmax
            if self.diverse:
                h = dn.tanh(h)
            probs = dn.softmax(h)

            # find best candidate output - greedy
            next_element_index = np.argmax(probs.npvalue())
            predicted_sequence.append(self.int2y[next_element_index])

            # check if reached end of word
            if predicted_sequence[-1] == common.END_SEQ:
                break

            # prepare for the next iteration - "feedback"
            prev_output_vec = dn.concatenate([
                self.output_lookup[next_element_index], attention_output_vector
            ])
            i += 1

        # remove the end seq symbol
        return predicted_sequence[0:-1], alphas_mtx
Example #22
    def __call__(self, x=None, t=None, test=False):
        if test:
            tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

            if self.encoder_type == 'bow':
                # Neural language model
                tt_c = dy.concatenate(tt_embs)
                h = dy.tanh(self.U * tt_c)

                # Output with softmax
                y_t = dy.softmax(self.V * h + self.W_enc)

            elif self.encoder_type == 'attention':
                ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

                # Neural language model
                tt_c = dy.concatenate(tt_embs)
                h = dy.tanh(self.U * tt_c)

                # Attention
                ttp_c = dy.concatenate(ttp_embs)
                p = dy.softmax(self.xt * self.P * ttp_c)  # Attention weight
                enc = self.xb * p  # Context vector

                # Output with softmax
                y_t = dy.softmax(self.V * h + self.W * enc)

            return y_t

        else:
            xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
            tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

            y = []
            if self.encoder_type == 'bow':
                # BoW
                enc = dy.average(xt_embs)
                W_enc = self.W * enc
                for i in range(len(t) - self.c + 1):
                    # Neural language model
                    tt_c = dy.concatenate(tt_embs[i:i + self.c])
                    h = dy.tanh(self.U * tt_c)

                    # Output without softmax
                    y_t = self.V * h + W_enc
                    y.append(y_t)

            elif self.encoder_type == 'attention':
                xb = dy.concatenate([
                    dy.esum(xt_embs[max(i - self.q, 0):min(len(x), i + self.q + 1)])
                    / self.q
                    for i in range(len(x))
                ], d=1)
                xt = dy.transpose(dy.concatenate(xt_embs, d=1))
                ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

                for i in range(len(t) - self.c + 1):
                    # Neural language model
                    tt_c = dy.concatenate(tt_embs[i:i + self.c])
                    h = dy.tanh(self.U * tt_c)

                    # Attention
                    ttp_c = dy.concatenate(
                        ttp_embs[i:i + self.c])  # Window-sized embedding
                    p = dy.softmax(xt * self.P * ttp_c)  # Attention weight
                    enc = xb * p  # Context vector

                    # Output without softmax
                    y_t = self.V * h + self.W * enc
                    y.append(y_t)

            return y
Example #23
 def classify_single(self, embedding):
     return vocab.getw(
         np.argmax(dy.softmax(self.get_graph(embedding)).npvalue()))
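
Because softmax is monotone, the greedy readout above is equivalent to an argmax over the raw scores; a tiny sketch with a stand-in score vector (vocab.getw is assumed to map an index back to a word):

import dynet as dy
import numpy as np

dy.renew_cg()
scores = dy.inputVector([0.2, 1.3, -0.5])            # stand-in for self.get_graph(embedding)
best = int(np.argmax(dy.softmax(scores).npvalue()))  # 1; same index as np.argmax on scores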