def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    # MLP attention: score each encoder state against the current decoder state h_t
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]
    # normalize the scores to attention weights using softmax
    alphas = pc.softmax(pc.concatenate(scores))
    # compute the context vector c as the alpha-weighted sum of the encoder states
    # (an equivalent vectorized version stacks the states with pc.concatenate_cols
    # and uses pc.cwise_multiply)
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # compute output state h~ using c and the decoder's h_t
    # (global attention variant from Luong et al., 2015)
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))

    return h_output, alphas, W__a.value()
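For context, a minimal sketch of how the arguments to attend might be built and the function called; all names and dimensions below are assumptions for illustration, not part of the original project:

# hypothetical setup, assuming DyNet is imported as `pc` like the snippet above
import dynet as pc

ENC_DIM, DEC_DIM, ATT_DIM, OUT_DIM = 64, 32, 20, 32
model = pc.ParameterCollection()
p_W_c = model.add_parameters((OUT_DIM, DEC_DIM + ENC_DIM))
p_v_a = model.add_parameters((1, ATT_DIM))
p_W_a = model.add_parameters((ATT_DIM, DEC_DIM))
p_U_a = model.add_parameters((ATT_DIM, ENC_DIM))

pc.renew_cg()
# stand-ins for real biLSTM outputs and a real decoder state
blstm_outputs = [pc.inputVector([0.1] * ENC_DIM) for _ in range(5)]
h_t = pc.inputVector([0.1] * DEC_DIM)
h_out, alphas, _ = attend(blstm_outputs, h_t, pc.parameter(p_W_c),
                          pc.parameter(p_v_a), pc.parameter(p_W_a), pc.parameter(p_U_a))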
Example #2
def generate(in_seq, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    embedded = embed_sentence(in_seq)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(encoded)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))

    out = ''
    count_EOS = 0
    for i in range(len(in_seq)*2):
        if count_EOS == 2: break
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector).vec_value()
        next_char = probs.index(max(probs))
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue

        out += int2char[next_char]
    return out
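The attend(input_mat, s, w1dt) helper is not shown in this snippet; in the standard DyNet attention tutorial it looks roughly like the sketch below (assuming global attention_w2 and attention_v parameters):

def attend(input_mat, state, w1dt):
    # a sketch of the assumed companion helper
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    # w2dt depends on the decoder state, so it must be recomputed every step
    w2dt = w2 * dy.concatenate(list(state.s()))
    # one unnormalized score per encoded input column
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)
    # context vector: weighted sum of the encoded input columns
    context = input_mat * att_weights
    return context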
Example #3
File: dy_model.py Project: jcyk/CWS
    def word_repr(self, char_seq):
        # obtain the word representation when given its character sequence
        wlen = len(char_seq)
        if 'rgW%d'%wlen not in self.param_exprs:
            self.param_exprs['rgW%d'%wlen] = dy.parameter(self.params['reset_gate_W'][wlen-1])
            self.param_exprs['rgb%d'%wlen] = dy.parameter(self.params['reset_gate_b'][wlen-1])
            self.param_exprs['cW%d'%wlen] = dy.parameter(self.params['com_W'][wlen-1])
            self.param_exprs['cb%d'%wlen] = dy.parameter(self.params['com_b'][wlen-1])
            self.param_exprs['ugW%d'%wlen] = dy.parameter(self.params['update_gate_W'][wlen-1])
            self.param_exprs['ugb%d'%wlen] = dy.parameter(self.params['update_gate_b'][wlen-1])
          
        chars = dy.concatenate(char_seq)
        reset_gate = dy.logistic(self.param_exprs['rgW%d'%wlen] * chars + self.param_exprs['rgb%d'%wlen])
        comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d'%wlen] * dy.cmult(reset_gate,chars) + self.param_exprs['cb%d'%wlen]),chars])
        update_logits = self.param_exprs['ugW%d'%wlen] * comb + self.param_exprs['ugb%d'%wlen]
        
        update_gate = dy.transpose(dy.concatenate_cols([dy.softmax(dy.pickrange(update_logits,i*(wlen+1),(i+1)*(wlen+1))) for i in xrange(self.options['ndims'])]))
        
        # The following implementation of the softmax function is not safe, but faster...
        #exp_update_logits = dy.exp(dy.reshape(update_logits,(self.options['ndims'],wlen+1)))
        #update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] *(wlen+1)))
        #assert (not np.isnan(update_gate.npvalue()).any())

        word = dy.sum_cols(dy.cmult(update_gate,dy.reshape(comb,(self.options['ndims'],wlen+1))))
        return word
Example #4
def generate(input, enc_fwd_lstm, enc_bwd_lstm, dec_lstm):
    def sample(probs):
        rnd = random.random()
        for i, p in enumerate(probs):
            rnd -= p
            if rnd <= 0: break
        return i

    embedded = embed_sentence(input)
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embedded)

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE * 2), last_output_embeddings]))
    out = ''
    count_EOS = 0
    for i in range(len(input)*2):
        if count_EOS == 2: break
        vector = dy.concatenate([attend(encoded, s), last_output_embeddings])

        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        probs = probs.vec_value()
        next_char = sample(probs)
        last_output_embeddings = output_lookup[next_char]
        if int2char[next_char] == EOS:
            count_EOS += 1
            continue

        out += int2char[next_char]
    return out
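As an aside, the hand-rolled inverse-CDF sampler above can be swapped for NumPy's categorical sampler; a sketch (renormalizing to guard against floating-point drift in the softmax output):

import numpy as np

def sample(probs):
    # equivalent categorical draw over the softmax probabilities
    p = np.asarray(probs)
    return int(np.random.choice(len(p), p=p / p.sum()))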
Example #5
    def learn(self, batch_size):
        exps = self.memory.sample(batch_size)
        obss, actions, rewards, obs_nexts, dones = self._process(exps)

        # Update critic
        dy.renew_cg()
        target_actions = self.actor_target(obs_nexts, batched=True)
        target_values = self.critic_target(dy.concatenate([dy.inputTensor(obs_nexts, batched=True), target_actions]),
                                           batched=True)
        target_values = rewards + 0.99 * target_values.npvalue() * (1 - dones)

        dy.renew_cg()
        values = self.critic(np.concatenate([obss, actions]), batched=True)
        loss = dy.mean_batches((values - dy.inputTensor(target_values, batched=True)) ** 2)
        loss_value_critic = loss.npvalue()
        loss.backward()
        self.trainer_critic.update()

        # update actor
        dy.renew_cg()
        actions = self.actor(obss, batched=True)
        obs_and_actions = dy.concatenate([dy.inputTensor(obss, batched=True), actions])
        loss = -dy.mean_batches(self.critic(obs_and_actions, batched=True))
        loss_value_actor = loss.npvalue()
        loss.backward()
        self.trainer_actor.update()

        self.noise_stddev = (
                    self.noise_stddev - self.noise_stddev_decrease) if self.noise_stddev > self.noise_stddev_lower else self.noise_stddev_lower

        self.actor_target.update(self.actor, soft=True)
        self.critic_target.update(self.critic, soft=True)

        return loss_value_actor + loss_value_critic
Example #6
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)
    w1 = dy.parameter(attention_w1)
    input_mat = dy.concatenate_cols(vectors)
    w1dt = None

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or w1 * input_mat
        vector = dy.concatenate([attend(input_mat, s, w1dt), last_output_embeddings])
        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
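A typical training step around this decode would look something like the sketch below (the trainer and the encode/embed helpers are assumed to exist as in the surrounding examples):

# hypothetical training loop; `model`, `training_data`, and the LSTMs are assumed
trainer = dy.SimpleSGDTrainer(model)
for sentence in training_data:
    dy.renew_cg()
    encoded = encode_sentence(enc_fwd_lstm, enc_bwd_lstm, embed_sentence(sentence))
    loss = decode(dec_lstm, encoded, sentence)
    loss.value()     # run the forward pass
    loss.backward()  # run the backward pass
    trainer.update()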
Example #7
def generate(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent

    # get the output of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x, y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]

    src_output = src_outputs[-1]

    # get the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix



    # generate until an EOS tag or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)



    for i in range(MAX_SENT_SIZE):
        #feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmax(probs)

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
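The calc_attention helper used above is not included in this snippet; it presumably implements MLP attention along these lines (a sketch assuming w1_att_tgt_p and w2_att_p parameters exist):

def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    # scores: broadcast the target term over the precomputed source term
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component,
                                              w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    # context: weighted sum of the source states
    att_output = src_output_matrix * alignment
    return att_output, alignment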
Example #8
 def evaluate(self, inputs, train=False):
     """
     Apply all MLP layers to concatenated input
     :param inputs: (key, vector) per feature type
     :param train: are we training now?
     :return: output vector of size self.output_dim
     """
     input_keys, inputs = list(map(list, zip(*list(inputs))))
     if self.input_keys:
         assert input_keys == self.input_keys, "Got:     %s\nBut expected input keys: %s" % (
             self.input_keys_str(self.input_keys), self.input_keys_str(input_keys))
     else:
         self.input_keys = input_keys
     if self.gated:
         gates = self.params.get("gates")
         if gates is None:  # FIXME attention weights should not be just parameters, but based on biaffine product?
             gates = self.params["gates"] = self.model.add_parameters((len(inputs), self.gated),
                                                                      init=dy.UniformInitializer(1))
         input_dims = [i.dim()[0][0] for i in inputs]
         max_dim = max(input_dims)
         x = dy.concatenate_cols([dy.concatenate([i, dy.zeroes(max_dim - d)])  # Pad with zeros to get uniform dim
                                  if d < max_dim else i for i, d in zip(inputs, input_dims)]) * gates
         # Possibly multiple "attention heads" -- concatenate outputs to one vector
         inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1],))]
     x = dy.concatenate(inputs)
     assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0])
     dim = x.dim()[0][0]
     if self.input_dim:
         assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (dim, self.input_dim)
     else:
         self.init_params(dim)
     self.config.print(self, level=4)
     if self.total_layers:
         if self.weights is None:
             self.weights = [[self.params[prefix + str(i)] for prefix in ("W", "b")]
                             for i in range(self.total_layers)]
             if self.weights[0][0].dim()[0][1] < dim:  # number of columns in W0
                 self.weights[0][0] = dy.concatenate_cols([self.weights[0][0], self.params["W0+"]])
         for i, (W, b) in enumerate(self.weights):
             self.config.print(lambda: x.npvalue().tolist(), level=4)
             try:
                 if train and self.dropout:
                     x = dy.dropout(x, self.dropout)
                 x = self.activation()(W * x + b)
             except ValueError as e:
                 raise ValueError("Error in evaluating layer %d of %d" % (i + 1, self.total_layers)) from e
     self.config.print(lambda: x.npvalue().tolist(), level=4)
     return x
Example #9
def calc_scores_with_previous_tag(words, referent_tags=None):
    """
    Calculate scores using the previous tag as input. If referent tags are provided, the previous tag fed
    into the model is sampled from either the previous referent tag or the previous system prediction.
    :param words:
    :param referent_tags:
    :return:
    """
    dy.renew_cg()

    word_embs = [LOOKUP[x] for x in words]

    # Transduce all batch elements for the backward LSTM, using the original word embeddings.
    bwd_init = bwdLSTM.initial_state()
    bwd_word_reps = bwd_init.transduce(reversed(word_embs))

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    scores = []
    # Transduce one by one for the forward LSTM
    fwd_init = fwdLSTM.initial_state()
    s_fwd = fwd_init

    prev_tag = start_tag

    index = 0
    for word, bwd_word_rep in zip(word_embs, reversed(bwd_word_reps)):
        # Concatenate word and tag representation just as training.
        fwd_input = dy.concatenate([word, TAG_LOOKUP[prev_tag]])
        s_fwd = s_fwd.add_input(fwd_input)
        combined_rep = dy.concatenate([s_fwd.output(), bwd_word_rep])
        score = dy.affine_transform([b, W, combined_rep])
        prediction = np.argmax(score.npvalue())

        if referent_tags:
            if sampler.sample_true():
                prev_tag = referent_tags[index]
            else:
                prev_tag = prediction
            index += 1
        else:
            prev_tag = prediction

        scores.append(score)

    return scores
Example #10
def calc_predict_and_activations(wids, tag, words):
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE-len(wids))

    cnn_in = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)
    filters = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue()
    activations = filters.argmax(axis=0)

    pool_out = dy.max_dim(cnn_out, d=1)
    pool_out = dy.reshape(pool_out, (FILTER_SIZE,))
    pool_out = dy.rectify(pool_out)

    scores = (W_sm * pool_out + b_sm).npvalue()
    print ('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print (display_activations(words, activations))
    print ('scores=%s, predict: %d' % (scores, predict))
    features = pool_out.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print ('  bias=%s' % bias)
    contributions = W * features
    print (' very bad (%.4f): %s' % (scores[0], contributions[0]))
    print ('      bad (%.4f): %s' % (scores[1], contributions[1]))
    print ('  neutral (%.4f): %s' % (scores[2], contributions[2]))
    print ('     good (%.4f): %s' % (scores[3], contributions[3]))
    print ('very good (%.4f): %s' % (scores[4], contributions[4]))
Example #11
 def expr_for_tree(self, tree):
     if tree.isleaf():
         return self.E[self.w2i.get(tree.label,0)]
     if len(tree.children) == 1:
         assert(tree.children[0].isleaf())
         emb = self.expr_for_tree(tree.children[0])
         Wi,Wo,Wu   = [dy.parameter(w) for w in self.WS]
         bi,bo,bu,_ = [dy.parameter(b) for b in self.BS]
         i = dy.logistic(Wi*emb + bi)
         o = dy.logistic(Wo*emb + bo)
         u = dy.tanh(    Wu*emb + bu)
         c = dy.cmult(i,u)
         expr = dy.cmult(o,dy.tanh(c))
         return expr
     assert(len(tree.children) == 2),tree.children[0]
     e1 = self.expr_for_tree(tree.children[0])
     e2 = self.expr_for_tree(tree.children[1])
     Ui,Uo,Uu = [dy.parameter(u) for u in self.US]
     Uf1,Uf2 = [dy.parameter(u) for u in self.UFS]
     bi,bo,bu,bf = [dy.parameter(b) for b in self.BS]
     e = dy.concatenate([e1,e2])
     i = dy.logistic(Ui*e + bi)
     o = dy.logistic(Uo*e + bo)
     f1 = dy.logistic(Uf1*e1 + bf)
     f2 = dy.logistic(Uf2*e2 + bf)
     u = dy.tanh(    Uu*e + bu)
     c = dy.cmult(i,u) + dy.cmult(f1,e1) + dy.cmult(f2,e2)
     h = dy.cmult(o,dy.tanh(c))
     expr = h
     return expr
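For reference, a parameter layout consistent with the shapes used in expr_for_tree might look like the sketch below (dimensions are assumptions; in the original these lists would be stored on the class as self.E, self.WS, self.US, self.UFS, and self.BS):

# hypothetical Tree-LSTM parameter setup
import dynet as dy
EMB_DIM, HID_DIM, VOCAB_SIZE = 100, 150, 10000
model = dy.ParameterCollection()
E = model.add_lookup_parameters((VOCAB_SIZE, EMB_DIM))               # leaf embeddings
WS = [model.add_parameters((HID_DIM, EMB_DIM)) for _ in "iou"]       # unary (leaf) gates
US = [model.add_parameters((HID_DIM, 2 * HID_DIM)) for _ in "iou"]   # binary gates over [e1; e2]
UFS = [model.add_parameters((HID_DIM, HID_DIM)) for _ in "ff"]       # per-child forget gates
BS = [model.add_parameters((HID_DIM,)) for _ in "iouf"]              # biases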
Example #12
def calc_score_of_history(words):
  # Lookup the embeddings and concatenate them
  emb = dy.concatenate([W_emb[x] for x in words])
  # Create the hidden layer
  h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
  # Calculate the score and return
  return dy.affine_transform([b_sm, W_sm, h])
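To train this feed-forward language model, the returned score would typically feed a softmax cross-entropy loss; a minimal sketch (assuming the caller has already renewed the computation graph):

# hypothetical loss for one next-word prediction
def calc_loss_of_history(words, next_word):
    score = calc_score_of_history(words)
    return dy.pickneglogsoftmax(score, next_word)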
Example #13
    def decode_loss(self, src_encodings, tgt_seqs):
        """
        :param tgt_seqs: (tgt_heads, tgt_labels): list (length=batch_size) of (src_len)
        """

        # todo(NOTE): Sentences should start with empty token (as root of dependency tree)!

        tgt_heads, tgt_labels = tgt_seqs

        src_len = len(tgt_heads[0])
        batch_size = len(tgt_heads)
        np_tgt_heads = np.array(tgt_heads).flatten()  # (src_len * batch_size)
        np_tgt_labels = np.array(tgt_labels).flatten()
        s_arc, s_label = self.cal_scores(src_encodings)  # (src_len, src_len, bs), ([(src_len, src_len, bs)])

        s_arc_value = s_arc.npvalue()
        s_arc_choice = np.argmax(s_arc_value, axis=0).transpose().flatten()  # (src_len * batch_size)

        s_pick_labels = [dy.pick_batch(dy.reshape(score, (src_len,), batch_size=src_len * batch_size), s_arc_choice)
                     for score in s_label]
        s_argmax_labels = dy.concatenate(s_pick_labels, d=0)  # n_labels, src_len * batch_size

        reshape_s_arc = dy.reshape(s_arc, (src_len,), batch_size=src_len * batch_size)
        arc_loss = dy.pickneglogsoftmax_batch(reshape_s_arc, np_tgt_heads)
        label_loss = dy.pickneglogsoftmax_batch(s_argmax_labels, np_tgt_labels)

        loss = dy.sum_batches(arc_loss + label_loss) / batch_size
        return loss
Example #14
    def embed(self, batch_dict):
        all_embeddings_lists = []
        for k, embedding in self.embeddings.items():
            all_embeddings_lists.append(embedding.encode(batch_dict[k]))

        embedded = dy.concatenate(all_embeddings_lists, d=1)
        return embedded
Example #15
    def build_graph(self, features):
        # extract word and tags ids
        word_ids = [self.vocab.word2id(word_feat) for word_feat in features[0:20]]
        tag_ids = [self.vocab.tag2id(tag_feat) for tag_feat in features[20:40]]
        dep_ids = [self.vocab.dep2id(tag_feat) for tag_feat in features[40:]]

        # extract word embeddings and tag embeddings from features
        word_embeds = [self.word_embedding[wid] for wid in word_ids]
        tag_embeds = [self.tag_embedding[tid] for tid in tag_ids]
        dep_embeds = [self.dep_embedding[tid] for tid in dep_ids]

        # concatenate all features (recall that '+' on lists concatenates them)
        embedding_layer = dynet.concatenate(word_embeds + tag_embeds + dep_embeds)

        # calculating the hidden layer
        # .expr() converts a parameter to a matrix expression in dynet (it's dynet-specific syntax).
        hidden1 = self.transfer(self.hidden_layer1 * embedding_layer + self.hidden_layer_bias1)
        dropout1 = dynet.dropout(hidden1, 0.1)
        hidden2 = self.transfer(self.hidden_layer2 * dropout1 + self.hidden_layer_bias2)
	
        # To implement the network without dropout, remove the line with dropout1 and change hidden2 to:
        # hidden2 = self.transfer(self.hidden_layer2 * hidden1 + self.hidden_layer_bias2)

        # calculating the output layer
        output = self.output_layer * hidden2 + self.output_bias

        # return the output as a dynet vector (expression)
        return output
Example #16
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    attention_weights = []

    w2dt = w2*dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v*dy.tanh(w1*input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))
    output_vectors = dy.esum([vector*attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
Example #17
 def conv(input_, _=None):
     dims = tuple([1] + list(input_.dim()[0]))
     input_ = dy.reshape(input_, dims)
     mots = []
     for conv in convs:
         mots.append(mot_pool(conv(input_)))
     return dy.concatenate(mots)
Example #18
    def embed(self, batch_dict):
        all_embeddings_lists = []
        for k, embedding in self.embeddings.items():
            all_embeddings_lists.append(embedding.encode(batch_dict[k], self.train))

        embedded = dy.concatenate(all_embeddings_lists, d=1)
        embed_list = [self.dropout(e) for e in embedded]
        return embed_list
Example #19
def word_rep(w, cf_init, cb_init):
    pad_char = vc.w2i['<*>']
    char_ids = [pad_char] + [vc.w2i[c] for c in w] + [pad_char]
    char_embs = [CHARS_LOOKUP[cid] for cid in char_ids]
    fw_exps = cf_init.transduce(char_embs)
    bw_exps = cb_init.transduce(reversed(char_embs))

    return dy.concatenate([ WORDS_LOOKUP[vw.w2i[w]], fw_exps[-1], bw_exps[-1] ])
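The globals this word_rep relies on are not shown; a plausible setup (dimensions assumed, and vw/vc are assumed word/char vocabularies) would be:

# hypothetical setup for word_rep
import dynet as dy
model = dy.ParameterCollection()
WORDS_LOOKUP = model.add_lookup_parameters((10000, 128))  # word embeddings
CHARS_LOOKUP = model.add_lookup_parameters((100, 20))     # char embeddings
cFwdRNN = dy.LSTMBuilder(1, 20, 64, model)
cBwdRNN = dy.LSTMBuilder(1, 20, 64, model)

dy.renew_cg()
cf_init = cFwdRNN.initial_state()
cb_init = cBwdRNN.initial_state()
rep = word_rep("cat", cf_init, cb_init)  # 128 + 64 + 64 = 256-dim expression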
Example #20
 def transduce(self, inputs, train):
     xs = inputs[:self.max_length]
     if not xs:
         return []
     for i in range(self.lstm_layers):
         for n, d in ("f", 1), ("b", -1):
             Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)] for p in ("Wr", "br", "Wh")]
             hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
             hs = [hs_[0]]
             for t in range(1, len(hs_)):
                 r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                 hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
             xs = hs
             if train:
                 x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
                 xs = [dy.pick(x, i, 1) for i in range(len(xs))]
     return xs
Example #21
def calc_scores(words):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    return W_sm * dy.concatenate([fwd_embs[-1], bwd_embs[-1]]) + b_sm
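A corresponding training objective for this BiLSTM classifier would be a softmax cross-entropy over the returned scores; a sketch:

# hypothetical objective; calc_scores already renews the computation graph
def calc_tag_loss(words, tag):
    scores = calc_scores(words)
    return dy.pickneglogsoftmax(scores, tag)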
Example #22
 def __call__(self, seq):
     """
     seq is a list of vectors (either character embeddings or bilstm outputs)
     """
     fw = self.lstmF.initial_state()
     bw = self.lstmB.initial_state()
     outf = fw.transduce(seq)
     outb = list(reversed(bw.transduce(reversed(seq))))
     return [dy.concatenate([f, b]) for f, b in zip(outf, outb)]
Example #23
def calc_loss(words, labels, heads):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1)) for f, b in zip(fwd_embs, reversed(bwd_embs))]
    return biaffineParser.decode_loss(src_encodings, ([heads], [labels]))
Example #24
def encode_sentence(enc_fwd_lstm, enc_bwd_lstm, sentence):
    sentence_rev = list(reversed(sentence))

    fwd_vectors = run_lstm(enc_fwd_lstm.initial_state(), sentence)
    bwd_vectors = run_lstm(enc_bwd_lstm.initial_state(), sentence_rev)
    bwd_vectors = list(reversed(bwd_vectors))
    vectors = [dy.concatenate(list(p)) for p in zip(fwd_vectors, bwd_vectors)]

    return vectors
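run_lstm is not defined in this snippet; it presumably just feeds the sequence through the LSTM one state at a time, along these lines:

def run_lstm(init_state, input_vecs):
    # a sketch of the assumed helper: collect the output at every time step
    s = init_state
    out_vectors = []
    for vector in input_vecs:
        s = s.add_input(vector)
        out_vectors.append(s.output())
    return out_vectors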
Example #25
    def __call__(self, embed_in, src_len, train=False, **kwargs):
        """Input Shape: ((T, H), B). Output Shape: [((H,), B)] * T"""
        embed_in = list(embed_in)
        self.dropout(train)
        forward, forward_state = rnn_forward_with_state(self.lstm_forward, embed_in, src_len)
        if self.lstm_backward is not None:

            backward, backward_state = rnn_forward_with_state(self.lstm_backward, embed_in)
            output = [dy.concatenate([f, b]) for f, b in zip(forward, backward)]
            hidden = [dy.concatenate([f, b]) for f, b in zip(forward_state, backward_state)]
        else:
            output = forward
            hidden = forward_state
        return RNNEncoderOutput(
            output=[o + e for o, e in zip(output, embed_in)] if self.residual else output,
            hidden=hidden,
            src_mask=self.src_mask_fn(src_len, len(output))
        )
Example #26
def calc_score_of_history(words, dropout=0.0):
  # Lookup the embeddings and concatenate them
  emb = dy.concatenate([W_emb[x] for x in words])
  # Create the hidden layer
  h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
  # CHANGE 2: perform dropout
  if dropout != 0.0:
    h = dy.dropout(h, dropout)
  # Calculate the score and return
  return dy.affine_transform([b_sm, W_sm, h])
Example #27
def calc_acc(words, labels, heads):
    dy.renew_cg()
    word_embs = [dy.lookup(W_emb, x) for x in words]
    fwd_init = fwdLSTM.initial_state()
    fwd_embs = fwd_init.transduce(word_embs)
    bwd_init = bwdLSTM.initial_state()
    bwd_embs = bwd_init.transduce(reversed(word_embs))
    src_encodings = [dy.reshape(dy.concatenate([f, b]), (HID_SIZE * 2, 1)) for f, b in zip(fwd_embs, reversed(bwd_embs))]
    pred_heads, pred_labels = biaffineParser.decoding(src_encodings)
    return biaffineParser.cal_accuracy(pred_heads, pred_labels, heads, labels)
Example #28
    def _get_expr(self, sentence, i, j):
        # pylint: disable=missing-docstring
        if sentence[i].headfov is None:
            sentence[i].headfov = self.hid_layer_foh.expr() * concatenate(
                [sentence[i].lstms[0], sentence[i].lstms[1]])
        if sentence[j].modfov is None:
            sentence[j].modfov = self.hid_layer_fom.expr() * concatenate(
                [sentence[j].lstms[0], sentence[j].lstms[1]])

        if self.hidden2_units > 0:
            output = \
                self.out_layer.expr() * self.activation(
                    self.hid2_bias.expr() + self.hid2_layer.expr() * self.activation(
                        sentence[i].headfov + sentence[j].modfov
                        + self.hid_bias.expr()))  # + self.outBias
        else:
            output = self.out_layer.expr() * self.activation(
                sentence[i].headfov + sentence[j].modfov + self.hid_bias.expr())  # + self.outBias
        return output
Example #29
    def _evaluate_label(self, sentence, i, j):
        # pylint: disable=missing-docstring
        if sentence[i].rheadfov is None:
            sentence[i].rheadfov = self.rhid_layer_foh.expr() * concatenate(
                [sentence[i].lstms[0], sentence[i].lstms[1]])
        if sentence[j].rmodfov is None:
            sentence[j].rmodfov = self.rhid_layer_fom.expr() * concatenate(
                [sentence[j].lstms[0], sentence[j].lstms[1]])

        if self.hidden2_units > 0:
            output = self.rout_layer.expr() * self.activation(
                self.rhid2_bias.expr() + self.rhid2_layer.expr() *
                self.activation(sentence[i].rheadfov + sentence[j].rmodfov
                                + self.rhid_bias.expr())) + self.rout_bias.expr()
        else:
            output = self.rout_layer.expr() * self.activation(
                sentence[i].rheadfov + sentence[j].rmodfov
                + self.rhid_bias.expr()) + self.rout_bias.expr()
        return output.value(), output
Example #30
def decode(dec_lstm, vectors, output):
    output = [EOS] + list(output) + [EOS]
    output = [char2int[c] for c in output]

    w = dy.parameter(decoder_w)
    b = dy.parameter(decoder_b)

    last_output_embeddings = output_lookup[char2int[EOS]]
    s = dec_lstm.initial_state().add_input(dy.concatenate([dy.vecInput(STATE_SIZE*2), last_output_embeddings]))
    loss = []
    for char in output:
        vector = dy.concatenate([attend(vectors, s), last_output_embeddings])

        s = s.add_input(vector)
        out_vector = w * s.output() + b
        probs = dy.softmax(out_vector)
        last_output_embeddings = output_lookup[char]
        loss.append(-dy.log(dy.pick(probs, char)))
    loss = dy.esum(loss)
    return loss
Example #31
    def generate(self, s_sentence, max_len=150):

        dy.renew_cg()

        W_y = dy.parameter(self.params["W_y"])
        b_y = dy.parameter(self.params["b_y"])
        s_lookup = self.params["s_lookup"]
        t_lookup = self.params["t_lookup"]

        s_sentence = [self.s_vocab[EOS]] + s_sentence + [self.s_vocab[EOS]]
        s_sentence_rev = list(reversed(s_sentence))

        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []

        for cw_l2r in s_sentence:
            l2r_state = l2r_state.add_input(s_lookup[cw_l2r])
            l2r_contexts.append(l2r_state.output())

        for cw_r2l in s_sentence_rev:
            r2l_state = r2l_state.add_input(s_lookup[cw_r2l])
            r2l_contexts.append(r2l_state.output())

        r2l_contexts.reverse()

        H_f = []
        H_f = [
            dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)
        ]

        H_f_mat = dy.concatenate_cols(H_f)
        W1_att = dy.parameter(self.params["W1_att"])
        w1dt = W1_att * H_f_mat

        c_t = dy.vecInput(2 * self.HIDDEN_DIM)
        embedding = t_lookup[self.t_vocab["<EOS>"]]

        dec_state = self.dec_builder.initial_state()

        t_sentence = []

        count_eos = 0

        for i in range(len(s_sentence) * 2):
            if count_eos == 2:
                break

            x_t = dy.concatenate([c_t, embedding])
            dec_state = dec_state.add_input(x_t)

            c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_sentence), 1)
            probs = dy.softmax(W_y *
                               dy.concatenate([c_t, dec_state.output()]) +
                               b_y).vec_value()
            word = probs.index(max(probs))

            embedding = t_lookup[word]

            if self.t_id_lookup[word] == "<EOS>":
                count_eos += 1
                continue

            t_sentence.append(self.t_id_lookup[word])

        return " ".join(t_sentence)
Example #32
    def _predict(self, src, dst=None, num_predictions=-1, runtime=True):
        # input
        x_list = self._make_input(src, True)
        # encoder
        for fw, bw, dropout in zip(self.encoder_fw, self.encoder_bw,
                                   self.config.encoder_layer_dropouts):
            if runtime:
                fw.set_dropouts(0, 0)
                bw.set_dropouts(0, 0)
            else:
                fw.set_dropouts(0, dropout)
                bw.set_dropouts(0, dropout)

            fw_list = fw.initial_state().transduce(x_list)
            bw_list = list(
                reversed(bw.initial_state().transduce(reversed(x_list))))
            x_list = [
                dy.concatenate([fw_value, bw_value])
                for fw_value, bw_value in zip(fw_list, bw_list)
            ]

        # decoder

        predictions_left = num_predictions
        decoder = self.decoder.initial_state().add_input(
            dy.inputVector(
                [0] *
                (self.config.encoder_layers[-1] * 2 + self.config.input_size)))
        last_dst_we = self.special_we[0]
        softmax_output = []
        aux_output = []
        pred_index = 0
        while predictions_left != 0:
            predictions_left -= 1
            input = dy.concatenate(
                [self._attend(x_list, decoder), last_dst_we])

            decoder = decoder.add_input(input)
            softmax = dy.softmax(self.output_softmax_w.expr() *
                                 decoder.output() +
                                 self.output_softmax_b.expr())
            softmax_output.append(softmax)

            proj = dy.tanh(self.aux_layer_w.expr() * decoder.output() +
                           self.aux_layer_b.expr())
            aux = self.aux_layer_proj_w.expr(
            ) * proj + self.aux_layer_proj_b.expr()
            aux_output.append(aux)
            if runtime:
                out_we_index = np.argmax(softmax.npvalue())
                if out_we_index == self.EOS:
                    break
                last_dst_we = self.hol_we_dst[out_we_index]
            else:
                if pred_index < len(dst):
                    last_word = dst[pred_index].word.decode('utf-8').lower()
                    last_word_index = self.output_encodings.word2int['<UNK>']
                    if last_word in self.output_encodings.word2int:
                        last_word_index = self.output_encodings.word2int[
                            last_word]
                    last_dst_we = self.hol_we_dst[last_word_index]
                    pred_index += 1
            # failsafe
            if len(softmax_output) >= 2 * len(src):
                break

        return softmax_output, aux_output
Example #33
    def __call__(self, words_sequence, word2int, vocab, dataset="train"):

        lookup = self.params["lookup"]
        char_lstm = self.char_builder.initial_state()
        W_con = dy.parameter(self.params["W_con"])
        b_con = dy.parameter(self.params["b_con"])

        sequence = []
        if dataset == "train":
            for word, label in words_sequence:
                char_embed = []
                word_chars = list(word)
                # get char embeddings of words
                for ch in word_chars:
                    char_embed.append(lookup[word2int.get(ch)])

                # get char LSTM encoding
                char_encoder = char_lstm.transduce(char_embed)[-1]

                if word not in vocab:
                    #curr_word_embed = dy.esum(char_embed)
                    curr_word_embed = lookup[word2int.get("<UNK>")]
                else:
                    curr_word_embed = lookup[word2int.get(word)]

                char_word_concat = dy.concatenate(
                    [curr_word_embed, char_encoder])
                sequence.append(W_con * char_word_concat + b_con)
        else:
            for word in words_sequence:
                char_embed = []
                word_chars = list(word)
                # get char embeddings of words
                for ch in word_chars:
                    char_embed.append(lookup[word2int.get(ch)])

                # get char LSTM encoding
                char_encoder = char_lstm.transduce(char_embed)[-1]

                if word not in vocab:
                    #curr_word_embed = dy.esum(char_embed)
                    curr_word_embed = lookup[word2int.get("<UNK>")]
                else:
                    curr_word_embed = lookup[word2int.get(word)]

                char_word_concat = dy.concatenate(
                    [curr_word_embed, char_encoder])
                sequence.append(W_con * char_word_concat + b_con)

        # convert the parameter into an Expression (add it to the graph)
        W = dy.parameter(self.params["W"])
        b = dy.parameter(self.params["b"])
        fw_lstm1 = self.fw_builder1.initial_state()
        bw_lstm1 = self.bw_builder1.initial_state()
        fw_lstm2 = self.fw_builder2.initial_state()
        bw_lstm2 = self.bw_builder2.initial_state()

        # get output vectors of all time steps for the first bi-lstm
        fw_lstm1_output = fw_lstm1.transduce(sequence)
        bw_lstm1_output = bw_lstm1.transduce(reversed(sequence))

        # concatenate backward vector to forward vector per each word
        bi1_output = [
            dy.concatenate([fw1, bw1])
            for fw1, bw1 in zip(fw_lstm1_output, reversed(bw_lstm1_output))
        ]

        # get output vectors of all time steps for the second bi-lstm
        fw_lstm2_output = fw_lstm2.transduce(bi1_output)
        bw_lstm2_output = bw_lstm2.transduce(reversed(bi1_output))

        # concatenate backward vector to forward vector per each 1st biLSTM vector
        bi2_output = [
            dy.concatenate([fw2, bw2])
            for fw2, bw2 in zip(fw_lstm2_output, reversed(bw_lstm2_output))
        ]

        # calc net output
        net_output = [dy.softmax(W * out + b) for out in bi2_output]

        return net_output
Example #34
    def post_order_parse(self, words, oracle_actions, oracle_tokens, buffer,
                         stack_top, action_top):
        stack = []
        stack_symbol = []

        output_actions = []
        output_tokens = []

        reduced = 0
        nt_allowed = 1
        ter_allowed = 1
        act_allowed = 1

        # recursively generate the tree until a single completed constituent remains on the stack
        while not (len(stack_symbol) == 1 and reduced != 0):
            valid_actions = []
            if len(stack_symbol) == 0:
                valid_actions += [_ACT]
            if len(stack_symbol) >= 1:
                if act_allowed:
                    valid_actions += [_ACT]
                if ter_allowed:
                    valid_actions += [_TER]
                if nt_allowed:
                    valid_actions += [_NT]

            word_weights = None

            action = valid_actions[0]
            #we make predictions when stack is not empty and _ACT is not the only valid action
            if len(stack_symbol) > 0:
                stack_embedding = stack[-1][0].output(
                ) if stack else self.initial_embedding()
                action_summary = action_top.output()
                word_weights = self.attention(stack_embedding, buffer)
                buffer_embedding = dy.esum([
                    vector * attention_weight
                    for vector, attention_weight in zip(buffer, word_weights)
                ])

                parser_state = dy.concatenate(
                    [buffer_embedding, stack_embedding, action_summary])
                h = self.mlp_layer(parser_state)

                if len(valid_actions) > 0:
                    log_probs = dy.log_softmax(self.act_proj_layer(h),
                                               valid_actions)
                    assert action in valid_actions, "action not in scope"
                    action = max(enumerate(log_probs.vec_value()),
                                 key=itemgetter(1))[0]

            if action == _NT:
                #generate non-terminal
                log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                nt = max(enumerate(log_probs_nt.vec_value()),
                         key=itemgetter(1))[0]

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                parent_rep = self.nt_input_layer(self.nt_lookup[nt])

                found_start = 0
                path_input = []
                while found_start != 1:
                    top_symbol = stack_symbol.pop()
                    if top_symbol != '|':
                        top = stack.pop()
                        top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                        path_input.append(top_raw_rep)
                    else:
                        found_start = 1

                composed_rep = self.subtree_input_layer(
                    dy.concatenate([dy.average(path_input), parent_rep]))
                stack_state = stack_state.add_input(composed_rep)
                stack.append((stack_state, 'c', composed_rep))
                stack_symbol.append('c')
                reduced = 1

                output_actions.append(self.act_vocab.token(action))
                output_tokens.append(self.nt_vocab.token(nt))

            elif action == _TER:
                #generate terminal
                log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
                ter = max(enumerate(log_probs_ter.vec_value()),
                          key=itemgetter(1))[0]

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
                stack_state = stack_state.add_input(ter_embedding)
                stack.append((stack_state, 'c', ter_embedding))
                stack_symbol.append('c')

                output_actions.append(self.act_vocab.token(action))
                output_tokens.append(self.ter_vocab.token(ter))

            else:
                #mark handle
                stack_symbol.append('|')
                output_actions.append(self.act_vocab.token(action))

            action_embedding = self.act_input_layer(self.act_lookup[action])
            action_top = action_top.add_input(action_embedding)

            count_c = stack_symbol.count('c')
            count_h = stack_symbol.count('|')

            nt_allowed = 1
            if count_h == 0 or count_c == 0 or stack_symbol[-1] != 'c':
                nt_allowed = 0

            act_allowed = 1
            if count_c >= 10 or count_h > 10:
                act_allowed = 0

            ter_allowed = 1
            if count_c >= 10:
                ter_allowed = 0

        return output_actions, output_tokens
Example #35
    def transduce(
        self,
        input_: str,
        encoded_input: List[int],
        target: Optional[str] = None,
        rollin: Optional[float] = None,
        external_cg: bool = True,
    ):
        """Runs the transducer for dynamic-oracle training and greedy decoding.

        Args:
            input_: Input string.
            encoded_input: List of integer character codes.
            target: Target string during training, `None` during prediction.
            external_cg: Whether an external computation graph is defined.
            rollin: The probability with which an action sampled from the model
                    is executed. Used during training."""
        if not external_cg:
            dy.renew_cg()

        is_training = bool(target)
        input_emb = self.input_embedding(encoded_input, is_training)
        bidirectional_emb = self.bidirectional_encoding(input_emb)[
            1:]  # drop BEGIN_WORD
        input_length = len(bidirectional_emb)
        decoder = self.dec.initial_state()

        alignment = 0
        action_history: List[int] = [BEGIN_WORD]
        output: List[str] = []
        losses: List[dy.Expression] = []
        log_p = 0.0

        while len(action_history) <= MAX_ACTION_SEQ_LEN:

            length_encoder_suffix = input_length - alignment
            valid_actions = self.compute_valid_actions(length_encoder_suffix)

            input_char_embedding = bidirectional_emb[alignment]
            previous_action_embedding = self.act_lookup[action_history[-1]]
            decoder_input = dy.concatenate(
                [input_char_embedding, previous_action_embedding])
            decoder = decoder.add_input(decoder_input)

            decoder_output = decoder.output()
            logits = self.pW * decoder_output + self.pb
            log_probs = dy.log_softmax(logits, valid_actions)

            log_probs_np = log_probs.npvalue()

            if target is None:
                # argmax decoding
                action = np.argmax(log_probs_np)
            else:
                # training with dynamic oracle

                # 1. ACTIONS TO MAXIMIZE
                optim_actions = self.expert_rollout(input_, target, alignment,
                                                    output)

                loss = self.log_sum_softmax_loss(optim_actions, logits,
                                                 valid_actions)

                # 2. ACTION SPACE EXPLORATION: NEXT ACTION
                if np.random.rand() <= rollin:
                    # action is picked by sampling
                    action = self.sample(log_probs_np)
                else:
                    # action is picked from optim_actions
                    # reinforce model beliefs by picking highest probability
                    # action that is consistent with oracle
                    action = optim_actions[int(
                        np.argmax([log_probs_np[a] for a in optim_actions]))]
                losses.append(loss)

            log_p += log_probs_np[action]
            action_history.append(action)
            # execute the action to update the transducer state
            action = self.vocab.decode_action(action)

            if isinstance(action, ConditionalCopy):
                char_ = input_[alignment]
                alignment += 1
                output.append(char_)
            elif isinstance(action, ConditionalDel):
                alignment += 1
            elif isinstance(action, ConditionalIns):
                output.append(action.new)
            elif isinstance(action, ConditionalSub):
                alignment += 1
                output.append(action.new)
            elif isinstance(action, EndOfSequence):
                break
            else:
                raise ValueError(f"Unknown action: {action}.")

        return Output(action_history, "".join(output), log_p, losses)
Example #36
def build_tagging_graph1(words):
    # Create a new computation graph - clears the current one and starts a new one
    dy.renew_cg()
    # parameters -> expressions
    # Parameters are the things that need to be trained.
    # Initialize a parameter vector, and add the parameters to be part of the computation graph.

    # initialize the RNNs
    f_init = fwdRNN.initial_state()  # forward
    b_init = bwdRNN.initial_state()  # backward

    second_forward_initialize = secondfwdRNN.initial_state()
    second_backward_initialize = secondbwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
    wembs = []
    # if the model is option 'a', call the right function to get the matching representation
    if option == 'a':
        for i, w in enumerate(words):
            # convert the word to an embedding vector
            wembs.append(word_rep_1(w))
    if option == 'c':
        for i, w in enumerate(words):
            word, pre, suff = word_rep_3(w)
            wembs.append(word + pre + suff)
    #
    """
    feed word vectors into biLSTM
    transduce takes in a sequence of Expressions, and returns a sequence of Expressions
    """

    # print wembs.__sizeof__()
    fw_exps = f_init.transduce(wembs)  # forward
    bw_exps = b_init.transduce(reversed(wembs))  # backward
    """
         biLSTM states

         Concatenate list of expressions to a single batched expression.
         All input expressions must have the same shape.
    """

    # align the backward outputs with the forward ones (bw_exps is produced in reversed order)
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]

    # print bi_exps.__sizeof__()
    # second BILSTM layer, input: b1,b2..bn, output: b'1,b'2, b'3..
    forward_y_tag = second_forward_initialize.transduce(bi_exps)
    backward_y_tag = second_backward_initialize.transduce(reversed(bi_exps))

    # concat the results (again aligning the reversed backward outputs)
    b_tag = [
        dy.concatenate([y1_tag, y2_tag])
        for y1_tag, y2_tag in zip(forward_y_tag, reversed(backward_y_tag))
    ]

    # feed each biLSTM state to an MLP
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    exps = []
    for x in b_tag:
        r_t = O * (dy.tanh(H * x))
        exps.append(r_t)

    return exps  # results of model
Example #37
    def span_train(self, words, oracle_actions, oracle_tokens, options, buffer,
                   stack_top, action_top):
        stack = []
        losses = []

        reduced = 0
        nt_allowed = 1
        found_root = 0

        _root = self.nt_vocab[oracle_tokens[-1]]
        # recursively generate the tree until the root nonterminal is produced
        while not (found_root):
            valid_actions = []
            if len(stack) == 0:
                valid_actions += [_TER]
            if len(stack) >= 1:
                valid_actions += [_TER]
            if len(stack) >= 2:
                valid_actions += [_ACT]
            if len(stack) >= 1:
                valid_actions += [_NT]

            action = self.act_vocab[oracle_actions.pop(0)]

            #we make predictions when stack is not empty and _ACT is not the only valid action
            stack_embedding = stack[-1][0].output(
            ) if stack else self.initial_embedding()
            action_summary = action_top.output(
            ) if len(stack) > 0 else self.initial_embedding()
            word_weights = self.attention(stack_embedding, buffer)
            buffer_embedding = dy.esum([
                vector * attention_weight
                for vector, attention_weight in zip(buffer, word_weights)
            ])

            parser_state = dy.concatenate(
                [buffer_embedding, stack_embedding, action_summary])
            h = self.mlp_layer(parser_state)

            if options.dropout > 0:
                h = dy.dropout(h, options.dropout)

            if len(valid_actions) > 0:
                log_probs = dy.log_softmax(self.act_proj_layer(h),
                                           valid_actions)
                assert action in valid_actions, "action not in scope"
                losses.append(-dy.pick(log_probs, action))

            if action == _NT:
                #label span
                nt = self.nt_vocab[oracle_tokens.pop(0)]
                log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                losses.append(-dy.pick(log_probs_nt, nt))

                if nt == _root:
                    found_root = 1

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                parent_rep = self.nt_input_layer(self.nt_lookup[nt])

                top = stack.pop()
                top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                composed_rep = self.subtree_input_layer(
                    dy.concatenate([top_raw_rep, parent_rep]))
                stack_state = stack_state.add_input(composed_rep)
                stack.append((stack_state, 'p', composed_rep))
                reduced = 1

            elif action == _TER:
                #generate terminal
                ter = self.ter_vocab[oracle_tokens.pop(0)]
                log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
                losses.append(-dy.pick(log_probs_ter, ter))

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
                stack_state = stack_state.add_input(ter_embedding)
                stack.append((stack_state, 'c', ter_embedding))

            else:
                #extend span
                assert len(stack) >= 2
                top2 = stack.pop()
                top1 = stack.pop()
                top2_raw_rep = top2[2]
                top1_raw_rep = top1[2]
                span_rep = self.span_input_layer(
                    dy.concatenate([top2_raw_rep, top1_raw_rep]))
                stack_state = stack_state.add_input(span_rep)
                stack.append((stack_state, 'c', span_rep))

            action_embedding = self.act_input_layer(self.act_lookup[action])
            action_top = action_top.add_input(action_embedding)

        return dy.esum(losses)
Example #38
    def _predict_arc(self, seq, runtime=True):
        x_list, encoder_states_list = self._make_input(seq, runtime)

        # BDLSTM
        rnn_outputs = [x_list]
        for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw,
                                   self.config.layer_dropouts):
            if runtime:
                fw.set_dropouts(0, 0)
                bw.set_dropouts(0, 0)
            else:
                fw.set_dropouts(dropout, dropout)
                bw.set_dropouts(dropout, dropout)

            fw_list = fw.initial_state().transduce(x_list)
            bw_list = list(
                reversed(bw.initial_state().transduce(reversed(x_list))))
            x_list = [
                dy.concatenate([x_fw, x_bw])
                for x_fw, x_bw in zip(fw_list, bw_list)
            ]

            rnn_outputs.append(x_list)

        # projections
        arc_projections = [[
            dy.tanh(
                self.proj_arc_w_dep.expr(update=True) * x +
                self.proj_arc_b_dep.expr(update=True)),
            dy.tanh(
                self.proj_arc_w_head.expr(update=True) * x +
                self.proj_arc_b_head.expr(update=True))
        ] for x in rnn_outputs[-1]]
        label_projections = [[
            dy.tanh(
                self.proj_label_w_dep.expr(update=True) * x +
                self.proj_label_b_dep.expr(update=True)),
            dy.tanh(
                self.proj_label_w_head.expr(update=True) * x +
                self.proj_label_b_head.expr(update=True))
        ] for x in rnn_outputs[-1]]
        if not runtime:
            arc_projections = [[
                dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                dy.dropout(x2, self.config.presoftmax_mlp_dropout)
            ] for x1, x2 in arc_projections]
            label_projections = [[
                dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                dy.dropout(x2, self.config.presoftmax_mlp_dropout)
            ] for x1, x2 in label_projections]
        if not self.config.predict_morphology:
            aux_arc_projections = [[
                dy.tanh(
                    self.aux_proj_arc_w_dep.expr(update=True) * x +
                    self.aux_proj_arc_b_dep.expr(update=True)),
                dy.tanh(
                    self.aux_proj_arc_w_head.expr(update=True) * x +
                    self.aux_proj_arc_b_head.expr(update=True))
            ] for x in rnn_outputs[self.config.aux_softmax_layer]]
            if not runtime:
                aux_arc_projections = [[
                    dy.dropout(x1, self.config.presoftmax_mlp_dropout),
                    dy.dropout(x2, self.config.presoftmax_mlp_dropout)
                ] for x1, x2 in aux_arc_projections]

        else:
            drp = self.config.presoftmax_mlp_dropout
            if runtime:
                drp = 0
            upos_softmax = [
                dy.softmax(
                    self.upos_softmax_w.expr(update=True) * dy.dropout(
                        dy.tanh(
                            self.upos_proj_w.expr(update=True) * x +
                            self.upos_proj_b.expr(update=True)), drp) +
                    self.upos_softmax_b.expr(update=True))
                for x in rnn_outputs[self.config.aux_softmax_layer]
            ]
            xpos_softmax = [
                dy.softmax(
                    self.xpos_softmax_w.expr(update=True) * dy.dropout(
                        dy.tanh(
                            self.xpos_proj_w.expr(update=True) * x +
                            self.xpos_proj_b.expr(update=True)), drp) +
                    self.xpos_softmax_b.expr(update=True))
                for x in rnn_outputs[self.config.aux_softmax_layer]
            ]
            attrs_softmax = [
                dy.softmax(
                    self.attrs_softmax_w.expr(update=True) * dy.dropout(
                        dy.tanh(
                            self.attrs_proj_w.expr(update=True) * x +
                            self.attrs_proj_b.expr(update=True)), drp) +
                    self.attrs_softmax_b.expr(update=True))
                for x in rnn_outputs[self.config.aux_softmax_layer]
            ]

            morphology_softmax = [
                [upos, xpos, attrs] for upos, xpos, attrs in zip(
                    upos_softmax, xpos_softmax, attrs_softmax)
            ]

        n = len(seq) + 1
        arc_matrix = [[None] * n for _ in range(n)]
        if not self.config.predict_morphology:
            aux_arc_matrix = [[None] * n for _ in range(n)]
        for iDst in range(n):
            term_bias = self.link_b.expr(
                update=True) * arc_projections[iDst][1]
            term_weight = self.link_w.expr(
                update=True) * arc_projections[iDst][1]
            if not self.config.predict_morphology:
                aux_term_bias = self.aux_link_b.expr(
                    update=True) * aux_arc_projections[iDst][1]
                aux_term_weight = self.aux_link_w.expr(
                    update=True) * aux_arc_projections[iDst][1]
            for iSrc in range(n):
                if iSrc != iDst:
                    attention = dy.reshape(
                        term_weight, (1, self.config.arc_proj_size
                                      )) * arc_projections[iSrc][0] + term_bias
                    arc_matrix[iSrc][iDst] = attention
                    if not self.config.predict_morphology:
                        aux_attention = dy.reshape(aux_term_weight, (1, self.config.arc_proj_size)) * \
                                        aux_arc_projections[iSrc][0] + aux_term_bias
                        aux_arc_matrix[iSrc][iDst] = aux_attention

        # compute softmax for arcs
        a_m = [[None] * n for _ in range(n)]
        if not self.config.predict_morphology:
            aux_a_m = [[None] * n for _ in range(n)]

        for iSrc in range(n):
            s_max = []
            if not self.config.predict_morphology:
                aux_s_max = []
            for iDst in range(n):
                if iSrc != iDst:
                    s_max.append(arc_matrix[iSrc][iDst])
                    if not self.config.predict_morphology:
                        aux_s_max.append(aux_arc_matrix[iSrc][iDst])
            s_max = dy.softmax(dy.concatenate(s_max))
            if not self.config.predict_morphology:
                aux_s_max = dy.softmax(dy.concatenate(aux_s_max))
            ofs = 0
            for iDst in range(n):
                if iSrc == iDst:
                    ofs = -1
                else:
                    a_m[iSrc][iDst] = s_max[iDst + ofs]
                    if not self.config.predict_morphology:
                        aux_a_m[iSrc][iDst] = aux_s_max[iDst + ofs]
        if not self.config.predict_morphology:
            return a_m, aux_a_m, label_projections, None
        else:
            return a_m, None, label_projections, morphology_softmax[1:-1]
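
The `ofs` bookkeeping at the end of the function above realigns indices after the skipped diagonal: for each source word, the softmax runs over all destinations except iSrc == iDst, so every normalized value after the diagonal sits one position earlier in the softmax output. A minimal sketch of the same trick in plain Python (the function name and toy values are illustrative):

def scatter_skip_diagonal(soft_values, i_src, n):
    # soft_values has n - 1 entries because the diagonal score was skipped
    row = [None] * n
    ofs = 0
    for i_dst in range(n):
        if i_dst == i_src:
            ofs = -1  # later indices into soft_values shift left by one
        else:
            row[i_dst] = soft_values[i_dst + ofs]
    return row

# scatter_skip_diagonal(['a', 'b', 'c'], 1, 4) == ['a', None, 'b', 'c']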
Example No. 39
    def calculate_batch_loss(self, batch):
        dy.renew_cg()

        W_y = dy.parameter(self.params["W_y"])
        b_y = dy.parameter(self.params["b_y"])
        s_lookup = self.params["s_lookup"]
        t_lookup = self.params["t_lookup"]

        s_batch = [x[0] for x in batch]
        t_batch = [x[1] for x in batch]

        wids = []

        for i in range(len(s_batch[0])):
            wids.append([sent[i] for sent in s_batch])

        wids_rev = list(reversed(wids))

        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []

        for wid in wids:
            l2r_state = l2r_state.add_input(dy.lookup_batch(s_lookup, wid))
            l2r_contexts.append(l2r_state.output())

        for wid in wids_rev:
            r2l_state = r2l_state.add_input(dy.lookup_batch(s_lookup, wid))
            r2l_contexts.append(r2l_state.output())

        r2l_contexts.reverse()

        losses = []

        H_f = [
            dy.concatenate(list(p)) for p in zip(l2r_contexts, r2l_contexts)
        ]

        H_f_mat = dy.concatenate_cols(H_f)
        W1_att = dy.parameter(self.params["W1_att"])
        w1dt = W1_att * H_f_mat

        t_wids = []
        masks = []

        for i in range(len(t_batch[0])):
            t_wids.append([(sent[i] if len(sent) > i else self.t_vocab[EOS])
                           for sent in t_batch])
            mask = [(1 if len(sent) > i else 0) for sent in t_batch]
            masks.append(mask)

        c_t = dy.vecInput(2 * self.HIDDEN_DIM)

        words = [self.t_vocab[EOS]] * len(t_batch)
        embedding = dy.lookup_batch(t_lookup, words)

        dec_state = self.dec_builder.initial_state()

        for t_wid, mask in zip(t_wids, masks):
            x_t = dy.concatenate([c_t, embedding])
            dec_state = dec_state.add_input(x_t)

            c_t = self.attend(H_f_mat, dec_state, w1dt, len(s_batch[0]),
                              len(wids[0]))

            probs = dy.affine_transform(
                [b_y, W_y, dy.concatenate([c_t, dec_state.output()])])
            loss = dy.pickneglogsoftmax_batch(probs, t_wid)

            if mask[-1] != 1:
                mask_expr = dy.inputVector(mask)
                mask_expr = dy.reshape(mask_expr, (1, ), len(t_batch))
                loss = loss * mask_expr

            losses.append(loss)
            embedding = dy.lookup_batch(t_lookup, t_wid)

        loss = dy.sum_batches(dy.esum(losses))  # /len(wids[0])
        return loss
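
The batched loss above pads shorter target sentences with EOS and multiplies each step's loss by a 0/1 mask, so padded positions contribute nothing. A minimal sketch of the padding and mask construction, assuming a toy vocabulary where EOS has id 0:

EOS_ID = 0
t_batch = [[5, 6, 7], [5, 6]]          # two target sentences, ids only
max_len = max(len(s) for s in t_batch)

t_wids, masks = [], []
for i in range(max_len):
    t_wids.append([s[i] if len(s) > i else EOS_ID for s in t_batch])
    masks.append([1 if len(s) > i else 0 for s in t_batch])

# t_wids == [[5, 5], [6, 6], [7, 0]]
# masks  == [[1, 1], [1, 1], [1, 0]]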
Example No. 40
    def _make_input(self, seq, runtime):
        x_list = []
        encoder_states_list = [None]
        # add the root
        if not self.config.use_morphology:
            x_list.append(self.unknown_word_embedding[1])
        elif not self.config.use_lexical:
            x_list.append(self.pad_tag_embedding[1])
        else:  # both lexical and morphology are used
            x_list.append(
                dy.concatenate([
                    self.unknown_word_embedding[1], self.pad_tag_embedding[1]
                ]))

        for entry in seq:
            word = entry.word

            if self.config.use_lexical:
                # prepare lexical embeddings
                char_emb, encoder_states = self.character_network.compute_embeddings(
                    word, runtime=runtime)
                encoder_states_list.append(encoder_states)
                if sys.version_info[0] == 2:
                    word_emb, found = self.embeddings.get_word_embeddings(
                        word.decode('utf-8'))
                else:
                    word_emb, found = self.embeddings.get_word_embeddings(word)
                if not found:
                    word_emb = self.unknown_word_embedding[0]
                else:
                    word_emb = dy.tanh(
                        self.input_proj_w_word.expr(update=True) *
                        dy.inputVector(word_emb) +
                        self.input_proj_b_word.expr(update=True))
                if sys.version_info[0] == 2:
                    word = word.decode('utf-8').lower()
                else:
                    word = word.lower()

                if word in self.encodings.word2int:
                    holistic_emb = self.holistic_embeddings[
                        self.encodings.word2int[word]]
                else:
                    holistic_emb = self.holistic_embeddings[
                        self.encodings.word2int['<UNK>']]

                # dropout lexical embeddings
                if runtime:
                    w_emb = word_emb + char_emb + holistic_emb
                else:
                    p1 = random.random()
                    p2 = random.random()
                    p3 = random.random()
                    m1 = 1
                    m2 = 1
                    m3 = 1
                    if p1 < self.config.input_dropout_prob:
                        m1 = 0
                    if p2 < self.config.input_dropout_prob:
                        m2 = 0
                    if p3 < self.config.input_dropout_prob:
                        m3 = 0

                    scale = 1.0
                    if m1 + m2 + m3 > 0:
                        scale = float(3) / (m1 + m2 + m3)
                    m1 = dy.scalarInput(m1)
                    m2 = dy.scalarInput(m2)
                    m3 = dy.scalarInput(m3)
                    scale = dy.scalarInput(scale)
                    w_emb = (word_emb * m1 + char_emb * m2 +
                             holistic_emb * m3) * scale

            if self.config.use_morphology:
                if entry.upos in self.encodings.upos2int:
                    upos_emb = self.upos_lookup[self.encodings.upos2int[
                        entry.upos]]
                else:
                    upos_emb = dy.inputVector(
                        [0] * self.config.input_embeddings_size)
                if entry.xpos in self.encodings.xpos2int:
                    xpos_emb = self.xpos_lookup[self.encodings.xpos2int[
                        entry.xpos]]
                else:
                    xpos_emb = dy.inputVector(
                        [0] * self.config.input_embeddings_size)
                if entry.attrs in self.encodings.attrs2int:
                    attrs_emb = self.attrs_lookup[self.encodings.attrs2int[
                        entry.attrs]]
                else:
                    attrs_emb = dy.inputVector(
                        [0] * self.config.input_embeddings_size)
                # override the per-embedding dropouts here; dropout is handled later by the shared "same-mask" trick
                t_emb = upos_emb + xpos_emb + attrs_emb
                # w_emb = word_emb + char_emb + holistic_emb

            # compose embeddings, if necessary
            if self.config.use_lexical and self.config.use_morphology:
                if not runtime:
                    p1 = random.random()
                    p2 = random.random()
                    m1 = 1
                    m2 = 1
                    if p1 < self.config.input_dropout_prob:
                        m1 = 0
                    if p2 < self.config.input_dropout_prob:
                        m2 = 0
                    if m1 + m2 > 0:
                        scale = float(2.0) / (m1 + m2)
                    else:
                        scale = 1.0
                    scale = dy.scalarInput(scale)
                    m1 = dy.scalarInput(m1)
                    m2 = dy.scalarInput(m2)
                    x_list.append(
                        dy.concatenate(
                            [w_emb * m1 * scale, t_emb * m2 * scale]))
                else:
                    x_list.append(dy.concatenate([w_emb, t_emb]))
            elif self.config.use_lexical:  # just use_lexical == True
                x_list.append(w_emb)
            else:  # just use_morphology == True
                x_list.append(t_emb)

        # close sequence
        if not self.config.use_morphology:
            x_list.append(self.unknown_word_embedding[2])
        elif not self.config.use_lexical:
            x_list.append(self.pad_tag_embedding[2])
        else:
            x_list.append(
                dy.concatenate([
                    self.unknown_word_embedding[2], self.pad_tag_embedding[2]
                ]))

        encoder_states_list.append(None)
        return x_list, encoder_states_list
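
The input dropout above zeroes each of the three lexical embeddings independently and rescales the surviving sum so its expected magnitude is unchanged. A self-contained sketch of the same idea, generalized to any number of embeddings (the function name is illustrative, not part of the original code):

import random
import dynet as dy

def drop_and_sum(embeddings, drop_prob, runtime):
    # sum a list of expressions, randomly zeroing each one at train time and
    # rescaling so the expected magnitude of the sum stays constant
    if runtime:
        return dy.esum(embeddings)
    keep = [0 if random.random() < drop_prob else 1 for _ in embeddings]
    scale = float(len(embeddings)) / sum(keep) if sum(keep) > 0 else 1.0
    dropped = [e * dy.scalarInput(k) for e, k in zip(embeddings, keep)]
    return dy.esum(dropped) * dy.scalarInput(scale)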
Example No. 41
    def _predict(self, seq, runtime=True):
        softmax_list = []
        aux_softmax_list = []
        x_list = []
        for entry in seq:
            word = entry.word
            char_emb, _ = self.character_network.compute_embeddings(
                word, runtime=runtime)

            word_emb, found = self.embeddings.get_word_embeddings(
                word.decode('utf-8'))
            if not found:
                word_emb = self.unknown_word_embedding[0]
            else:
                word_emb = dy.inputVector(word_emb)

            holistic_word = word.decode('utf-8').lower()
            if holistic_word in self.encodings.word2int:
                hol_emb = self.holistic_word_embedding[
                    self.encodings.word2int[holistic_word]]
            else:
                hol_emb = self.holistic_word_embedding[
                    self.encodings.word2int['<UNK>']]
            proj_emb = self.emb_proj_w.expr() * word_emb
            proj_hol = self.hol_proj_w.expr() * hol_emb
            proj_char = self.char_proj_w.expr() * char_emb
            # x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))

            if runtime:
                x_list.append(dy.tanh(proj_char + proj_emb + proj_hol))
            else:
                p1 = random.random()
                p2 = random.random()
                p3 = random.random()
                m1 = 1
                m2 = 1
                m3 = 1
                if p1 < self.config.input_dropout_prob:
                    m1 = 0
                if p2 < self.config.input_dropout_prob:
                    m2 = 0
                if p3 < self.config.input_dropout_prob:
                    m3 = 0

                scale = 1.0
                if m1 + m2 + m3 > 0:
                    scale = float(3) / (m1 + m2 + m3)
                m1 = dy.scalarInput(m1)
                m2 = dy.scalarInput(m2)
                m3 = dy.scalarInput(m3)
                scale = dy.scalarInput(scale)
                x_list.append(
                    dy.tanh((proj_char * m1 + proj_emb * m2 + proj_hol * m3) *
                            scale))

        # BDLSTM
        rnn_outputs = []
        for fw, bw, dropout in zip(self.bdrnn_fw, self.bdrnn_bw,
                                   self.config.layer_dropouts):
            if not runtime:
                fw.set_dropouts(0, dropout)
                bw.set_dropouts(0, dropout)
            else:
                fw.set_dropouts(0, 0)
                bw.set_dropouts(0, 0)
            fw_list = fw.initial_state().transduce(x_list)
            bw_list = list(
                reversed(bw.initial_state().transduce(reversed(x_list))))
            x_list = [
                dy.concatenate([x_fw, x_bw])
                for x_fw, x_bw in zip(fw_list, bw_list)
            ]
            # if runtime:
            #    x_out = x_list
            # else:
            #    x_out = [dy.dropout(x, dropout) for x in x_list]
            rnn_outputs.append(x_list)

        # SOFTMAX
        mlp_output = []
        for x in rnn_outputs[-1]:
            pre_softmax = []
            for iMLP in xrange(3):
                mlp_w = self.mlps[iMLP][0]
                mlp_b = self.mlps[iMLP][1]
                inp = x
                for w, b, drop, in zip(mlp_w, mlp_b,
                                       self.config.presoftmax_mlp_dropouts):
                    inp = dy.tanh(w.expr() * inp + b.expr())
                    if not runtime:
                        inp = dy.dropout(inp, drop)
                pre_softmax.append(inp)
            mlp_output.append(pre_softmax)

        for softmax_inp, aux_softmax_inp in zip(
                mlp_output, rnn_outputs[self.config.aux_softmax_layer - 1]):
            softmax_list.append([
                dy.softmax(self.softmax_upos_w.expr() * softmax_inp[0] +
                           self.softmax_upos_b.expr()),
                dy.softmax(self.softmax_xpos_w.expr() * softmax_inp[1] +
                           self.softmax_xpos_b.expr()),
                dy.softmax(self.softmax_attrs_w.expr() * softmax_inp[2] +
                           self.softmax_attrs_b.expr())
            ])
            aux_softmax_list.append([
                dy.softmax(self.aux_softmax_upos_w.expr() * aux_softmax_inp +
                           self.aux_softmax_upos_b.expr()),
                dy.softmax(self.aux_softmax_xpos_w.expr() * aux_softmax_inp +
                           self.aux_softmax_xpos_b.expr()),
                dy.softmax(self.aux_softmax_attrs_w.expr() * aux_softmax_inp +
                           self.aux_softmax_attrs_b.expr())
            ])

        return softmax_list, aux_softmax_list
Example No. 42
def _feed_input(dst_embed_i, attn_output_i):
    return dy.concatenate([dst_embed_i, attn_output_i])
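
`_feed_input` appears to implement the input-feeding scheme of Luong et al. (2015): the attentional output of the previous step is concatenated with the current target embedding before entering the decoder. A minimal sketch of one decoder step built on it; `attend_fn`, `t_lookup`, and the surrounding state are assumptions about the surrounding code, not part of this snippet:

import dynet as dy

def decoder_step(dec_state, t_lookup, word_id, prev_attn_output, attend_fn):
    # input feeding: the previous attentional vector rides along with the embedding
    x_t = _feed_input(dy.lookup(t_lookup, word_id), prev_attn_output)
    dec_state = dec_state.add_input(x_t)
    attn_output = attend_fn(dec_state)  # context vector for the next step
    return dec_state, attn_output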
Example No. 43
    def pre_order_train(self, words, oracle_actions, oracle_tokens, options,
                        buffer, stack_top, action_top):
        stack = []
        losses = []

        reducable = 0
        reduced = 0

        #recursively generate the tree until training data is exhausted
        while not (len(stack) == 1 and reduced != 0):
            valid_actions = []
            if len(stack) == 0:
                valid_actions += [_NT]
            if len(stack) >= 1:
                valid_actions += [_TER, _NT]
            if len(stack) >= 2 and reducable != 0:
                valid_actions += [_ACT]

            action = self.act_vocab[oracle_actions.pop(0)]

            word_weights = None

            #we make predictions when stack is not empty and _ACT is not the only valid action
            if len(stack) > 0 and valid_actions[0] != _ACT:
                stack_embedding = stack[-1][0].output()
                action_summary = action_top.output()
                word_weights = self.attention(stack_embedding, buffer)
                buffer_embedding = dy.esum([
                    vector * attention_weight
                    for vector, attention_weight in zip(buffer, word_weights)
                ])

                for i in range(len(stack)):
                    if stack[len(stack) - 1 - i][1] == 'p':
                        parent_embedding = stack[len(stack) - 1 - i][2]
                        break
                parser_state = dy.concatenate([
                    buffer_embedding, stack_embedding, parent_embedding,
                    action_summary
                ])
                h = self.mlp_layer(parser_state)

                if options.dropout > 0:
                    h = dy.dropout(h, options.dropout)

                if len(valid_actions) > 0:
                    log_probs = dy.log_softmax(self.act_proj_layer(h),
                                               valid_actions)
                    assert action in valid_actions, "action not in scope"
                    losses.append(-dy.pick(log_probs, action))

            if action == _NT:
                #generate non-terminal
                nt = self.nt_vocab[oracle_tokens.pop(0)]
                #no need to predict the ROOT (assumed ROOT is fixed)
                if word_weights is not None:
                    log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                    losses.append(-dy.pick(log_probs_nt, nt))

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                nt_embedding = self.nt_input_layer(self.nt_lookup[nt])
                stack_state = stack_state.add_input(nt_embedding)
                stack.append((stack_state, 'p', nt_embedding))

            elif action == _TER:
                #generate terminal
                ter = self.ter_vocab[oracle_tokens.pop(0)]
                log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
                losses.append(-dy.pick(log_probs_ter, ter))

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
                stack_state = stack_state.add_input(ter_embedding)
                stack.append((stack_state, 'c', ter_embedding))

            else:
                #subtree completion
                found_p = 0
                path_input = []
                #keep popping until the parent is found
                while found_p != 1:
                    top = stack.pop()
                    top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                    path_input.append(top_raw_rep)
                    if top_label == 'p':
                        found_p = 1
                parent_rep = path_input.pop()
                composed_rep = self.subtree_input_layer(
                    dy.concatenate([dy.average(path_input), parent_rep]))

                stack_state, _, _ = stack[-1] if stack else (stack_top, 'ROOT',
                                                             stack_top)
                stack_state = stack_state.add_input(composed_rep)
                stack.append((stack_state, 'c', composed_rep))
                reduced = 1

            action_embedding = self.act_input_layer(self.act_lookup[action])
            action_top = action_top.add_input(action_embedding)

            reducable = 1

            #cannot reduce after an NT
            if stack[-1][1] == 'p':
                reducable = 0

        return dy.esum(losses)
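
The loss accumulation above relies on the restricted form of `dy.log_softmax`: passing the list of valid actions normalizes only over those indices, so illegal actions receive probability zero. That step in isolation (the function name is illustrative):

import dynet as dy

def action_loss(logits, valid_actions, oracle_action):
    # normalize only over the currently valid actions
    log_probs = dy.log_softmax(logits, valid_actions)
    return -dy.pick(log_probs, oracle_action)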
Example No. 44
    def static_train(self,
                     train_treebank,
                     validation_treebank,
                     lr=0.001,
                     hidden_dropout=0.01,
                     batch_size=64,
                     max_epochs=200,
                     max_lexicon_size=9998,
                     glove_file=None):
        """
        Locally trains a model with a static oracle and a multi-task standard feedforward NN.  
        @param train_treebank      : a list of dependency trees
        @param validation_treebank : a list of dependency trees
        @param lr                  : learning rate
        @param hidden_dropout      : dropout on hidden layer
        @param batch_size          : size of mini batches
        @param max_epochs          : max number of epochs
        @param max_lexicon_size    : max number of entries in the lexicon
        @param glove_file          : file from which to load pre-trained word embeddings
        """
        print("Encoding dataset from %d trees."%len(train_treebank))

        #(1) build dictionaries
        self.code_symbols(train_treebank,lexicon_size = max_lexicon_size)

        #(2) encode data sets
        lex_train_gen , struct_train_gen  = self.make_data_generators(train_treebank,batch_size)
        lex_dev_gen   , struct_dev_gen    = self.make_data_generators(validation_treebank,batch_size)
        
        print(self,flush=True)
        print("epochs %d\nstructural training examples  [N] = %d\nlexical training examples  [N] = %d\nBatch size = %d\nDropout = %f\nlearning rate = %f"%(max_epochs,struct_train_gen.N,lex_train_gen.N,batch_size,hidden_dropout,lr),flush=True)

        #(3) make network
        self.model = dy.ParameterCollection()
        self.hidden_weights   = self.model.add_parameters((self.hidden_size,self.embedding_size*self.input_length))
        self.action_weights   = self.model.add_parameters((self.actions_size,self.hidden_size))
        if glove_file is None:
            self.input_embeddings  = self.model.add_parameters((self.lexicon_size,self.embedding_size))
        else:
            self.input_embeddings  = self.model.parameters_from_numpy(self.read_glove_embeddings(glove_file))
        if not self.tied:
            self.output_embeddings = self.model.add_parameters((self.lexicon_size,self.hidden_size))

        #(4) fitting
        lex_gen       = lex_train_gen.next_batch()
        struct_gen    = struct_train_gen.next_batch()
        max_batches = max( lex_train_gen.get_num_batches(), struct_train_gen.get_num_batches() )
        print(lex_train_gen.get_num_batches(), struct_train_gen.get_num_batches(),flush=True)
        
        lex_valid_gen       = lex_dev_gen.next_batch()
        struct_valid_gen    = struct_dev_gen.next_batch()
        
        min_nll = float('inf')
        trainer = dy.AdamTrainer(self.model,alpha=lr)
        history_log = []
        for e in range(max_epochs):
            struct_loss,lex_loss = 0,0
            struct_N,lex_N       = 0,0
            start_t = time.time()
            for b in range(max_batches):
                #struct
                X_struct,Y_struct = next(struct_gen)
                #question of proportions: should struct and lex be sampled evenly or not (??)
                #sampling them in parity here oversamples the lexical actions roughly twofold
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                A = dy.parameter(self.action_weights)
                batched_X        = zip(*X_struct)  #transposes the X matrix                           
                lookups          = [dy.pick_batch(E,xcolumn) for xcolumn in batched_X]
                xdense           = dy.concatenate(lookups)
                ybatch_preds     = dy.pickneglogsoftmax_batch(A * dy.dropout(dy.tanh( W * xdense ),hidden_dropout),Y_struct)
                loss             = dy.sum_batches(ybatch_preds)
                struct_N         += len(Y_struct)
                struct_loss      += loss.value()
                loss.backward()
                trainer.update()
                #lex
                X_lex,Y_lex = next(lex_gen)
                if self.tied:
                    dy.renew_cg()
                    W = dy.parameter(self.hidden_weights)
                    E = dy.parameter(self.input_embeddings)
                    batched_X        = zip(*X_lex) #transposes the X matrix
                    lookups          = [dy.pick_batch(E,xcolumn) for xcolumn in batched_X]
                    xdense           = dy.concatenate(lookups)
                    ybatch_preds     = dy.pickneglogsoftmax_batch(E * dy.dropout(dy.tanh( W * xdense ),hidden_dropout),Y_lex)
                    loss             = dy.sum_batches(ybatch_preds)
                else:
                    dy.renew_cg()
                    W = dy.parameter(self.hidden_weights)
                    E = dy.parameter(self.input_embeddings)
                    O = dy.parameter(self.output_embeddings)
                    batched_X        = zip(*X_lex) #transposes the X matrix
                    lookups          = [dy.pick_batch(E,xcolumn) for xcolumn in batched_X]
                    xdense           = dy.concatenate(lookups)
                    ybatch_preds     = dy.pickneglogsoftmax_batch(O * dy.dropout(dy.tanh( W * xdense ),hidden_dropout),Y_lex)
                    loss             = dy.sum_batches(ybatch_preds)
                lex_N            += len(Y_lex)
                lex_loss         += loss.value()
                loss.backward()
                trainer.update()
            end_t = time.time()
            # (5) validation
            X_lex_valid,Y_lex_valid = lex_dev_gen.batch_all()
            lex_valid_nll           = -sum(self.predict_logprobs(X_lex_valid,Y_lex_valid,structural=False))
            
            X_struct_valid,Y_struct_valid = struct_dev_gen.batch_all()
            struct_valid_nll              = -sum(self.predict_logprobs(X_struct_valid,Y_struct_valid,structural=True))
            
            history_log.append((e, end_t-start_t,
                                exp(lex_loss/lex_N),
                                exp(struct_loss/struct_N),
                                exp(lex_valid_nll/lex_dev_gen.N),
                                exp(struct_valid_nll/struct_dev_gen.N),
                                exp((lex_valid_nll+struct_valid_nll)/(struct_dev_gen.N+lex_dev_gen.N))))
            print('Epoch %d (%.2f sec.) TRAIN:: PPL_lex = %f, PPL_struct = %f / VALID:: PPL_lex = %f, PPL_struct = %f, PPL_all = %f'%tuple(history_log[-1]),flush=True)
            if  lex_valid_nll+struct_valid_nll < min_nll:
                df = pd.DataFrame(history_log,columns=['epoch','wall_time','ppl_lex_train','ppl_struct_train','ppl_lex_valid','ppl_struct_valid','ppl_all_valid'])
                self.save_model('best_model_dump',epoch = e, learning_curve=df)
            
        return pd.DataFrame(history_log,columns=['epoch','wall_time','ppl_lex_train','ppl_struct_train','ppl_lex_valid','ppl_struct_valid','ppl_all_valid'])
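
When `self.tied` is set, the training loop above reuses the input embedding matrix E as the output projection, which implicitly requires the hidden size to equal the embedding size. A minimal sketch of the tied scoring path, with illustrative dimensions (none of the names below are from the original code):

import dynet as dy

m = dy.ParameterCollection()
V, d = 1000, 64                     # illustrative lexicon and embedding sizes
E_p = m.add_parameters((V, d))      # shared input/output embedding matrix
W_p = m.add_parameters((d, 3 * d))  # hidden layer over 3 concatenated lookups

def tied_scores(x):
    # x: a (3*d,) expression of concatenated input embeddings
    E = dy.parameter(E_p)
    W = dy.parameter(W_p)
    h = dy.tanh(W * x)              # hidden state of size d (== embedding size)
    return E * h                    # (V,) scores, reusing E as the output matrix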
Example No. 45
    def post_order_train(self, words, oracle_actions, oracle_tokens, options,
                         buffer, stack_top, action_top):
        stack = []
        losses = []
        stack_symbol = []

        reduced = 0
        nt_allowed = 1

        #recursively generate the tree until training data is exhausted
        while not (len(stack_symbol) == 1 and reduced != 0):
            valid_actions = []
            if len(stack_symbol) == 0:
                valid_actions += [_ACT]
            if len(stack_symbol) >= 1:
                valid_actions += [_TER, _ACT]
            if len(stack) >= 1 and nt_allowed:
                valid_actions += [_NT]

            action = self.act_vocab[oracle_actions.pop(0)]

            word_weights = None

            #we make predictions when stack is not empty and _ACT is not the only valid action
            if len(stack_symbol) > 0:
                stack_embedding = (stack[-1][0].output()
                                   if stack else self.initial_embedding())
                action_summary = action_top.output()
                word_weights = self.attention(stack_embedding, buffer)
                buffer_embedding = dy.esum([
                    vector * attention_weight
                    for vector, attention_weight in zip(buffer, word_weights)
                ])

                parser_state = dy.concatenate(
                    [buffer_embedding, stack_embedding, action_summary])
                h = self.mlp_layer(parser_state)

                if options.dropout > 0:
                    h = dy.dropout(h, options.dropout)

                if len(valid_actions) > 0:
                    log_probs = dy.log_softmax(self.act_proj_layer(h),
                                               valid_actions)
                    assert action in valid_actions, "action not in scope"
                    losses.append(-dy.pick(log_probs, action))

            if action == _NT:
                #generate non-terminal
                nt = self.nt_vocab[oracle_tokens.pop(0)]
                log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                losses.append(-dy.pick(log_probs_nt, nt))

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                parent_rep = self.nt_input_layer(self.nt_lookup[nt])

                found_start = 0
                path_input = []
                while found_start != 1:
                    top_symbol = stack_symbol.pop()
                    if top_symbol != '|':
                        top = stack.pop()
                        top_raw_rep, top_label, top_rep = top[2], top[1], top[
                            0]
                        path_input.append(top_raw_rep)
                    else:
                        found_start = 1

                composed_rep = self.subtree_input_layer(
                    dy.concatenate([dy.average(path_input), parent_rep]))
                stack_state = stack_state.add_input(composed_rep)
                stack.append((stack_state, 'c', composed_rep))
                stack_symbol.append('c')
                reduced = 1

            elif action == _TER:
                #generate terminal
                ter = self.ter_vocab[oracle_tokens.pop(0)]
                log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
                losses.append(-dy.pick(log_probs_ter, ter))

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
                stack_state = stack_state.add_input(ter_embedding)
                stack.append((stack_state, 'c', ter_embedding))
                stack_symbol.append('c')

            else:
                #mark handle
                stack_symbol.append('|')

            action_embedding = self.act_input_layer(self.act_lookup[action])
            action_top = action_top.add_input(action_embedding)

            nt_allowed = 1
            if stack_symbol.count('|') == 0:
                nt_allowed = 0

        return dy.esum(losses)
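
Both training procedures summarize the buffer the same way: attention weights are computed against the current stack state, then the buffer vectors are combined in a weighted sum. That step in isolation (the function name is illustrative):

import dynet as dy

def buffer_summary(buffer_vectors, attention_weights):
    # soft attention: weighted sum of buffer vectors (weights sum to one)
    return dy.esum([v * w for v, w in zip(buffer_vectors, attention_weights)])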
Example No. 46
    def generate(self, pre_context, pos_context, entity):
        embedded = self.embed_sentence(pre_context)
        pre_encoded = self.encode_sentence(self.encpre_fwd_lstm,
                                           self.encpre_bwd_lstm, embedded)

        embedded = self.embed_sentence(pos_context)
        pos_encoded = self.encode_sentence(self.encpos_fwd_lstm,
                                           self.encpos_bwd_lstm, embedded)

        w = dy.parameter(self.decoder_w)
        b = dy.parameter(self.decoder_b)

        w1_pre = dy.parameter(self.attention_w1_pre)
        h_pre = dy.concatenate_cols(pre_encoded)
        w1dt_pre = None

        w1_pos = dy.parameter(self.attention_w1_pos)
        h_pos = dy.concatenate_cols(pos_encoded)
        w1dt_pos = None

        last_output_embeddings = self.output_lookup[self.output2int[self.EOS]]
        try:
            entity_embedding = self.input_lookup[self.input2int[entity]]
        except KeyError:
            entity_embedding = self.input_lookup[self.input2int[self.EOS]]
        s = self.dec_lstm.initial_state().add_input(
            dy.concatenate([
                dy.vecInput(self.STATE_SIZE * 2), last_output_embeddings,
                entity_embedding
            ]))

        out = []
        count_EOS = 0
        for i in range(self.config['GENERATION']):
            if count_EOS == 2: break
            # w1dt can be computed and cached once for the entire decoding phase
            w1dt_pre = w1dt_pre or w1_pre * h_pre
            w1dt_pos = w1dt_pos or w1_pos * h_pos

            attention_pre = self.attend(h_pre, s, w1dt_pre,
                                        self.attention_w2_pre,
                                        self.attention_v_pre)
            attention_pos = self.attend(h_pos, s, w1dt_pos,
                                        self.attention_w2_pos,
                                        self.attention_v_pos)

            vector = dy.concatenate([
                self.hier_attend(attention_pre, attention_pos, s),
                last_output_embeddings, entity_embedding
            ])
            s = s.add_input(vector)
            out_vector = w * s.output() + b
            probs = dy.softmax(out_vector).vec_value()
            next_word = probs.index(max(probs))
            last_output_embeddings = self.output_lookup[next_word]
            if self.int2output[next_word] == self.EOS:
                count_EOS += 1
                continue

            out.append(self.int2output[next_word])

        return out
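
Greedy selection above is done with `probs.index(max(probs))` on the Python list value; an equivalent formulation via numpy is sketched below (the function name is illustrative):

import numpy as np
import dynet as dy

def greedy_pick(out_vector):
    # argmax of softmax(out_vector), equivalent to probs.index(max(probs))
    return int(np.argmax(dy.softmax(out_vector).npvalue()))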
Example No. 47
    def pre_order_parse(self, words, oracle_actions, oracle_tokens, buffer,
                        stack_top, action_top):
        stack = []

        #check if a reduce is allowed
        reducable = 0
        #check if a reduce has ever been performed
        reduced = 0
        #check if nt/ter actions are allowed
        nt_allowed = 1
        ter_allowed = 1

        output_actions = []
        output_tokens = []

        #the first action is always NT and the first token ROOT
        action = self.act_vocab[oracle_actions.pop(0)]
        nt = self.nt_vocab[oracle_tokens.pop(0)]
        #recursively generate the tree until the constraints are met
        while not (len(stack) == 1 and reduced != 0):
            valid_actions = []
            if len(stack) == 0:
                valid_actions += [_NT]
            if len(stack) >= 1:
                if ter_allowed == 1:
                    valid_actions += [_TER]
                if nt_allowed == 1:
                    valid_actions += [_NT]
            if len(stack) >= 2 and reducable != 0:
                valid_actions += [_ACT]

            word_weights = None

            action = valid_actions[0]
            if len(valid_actions) > 1 or (len(stack) > 0
                                          and valid_actions[0] != _ACT):
                stack_embedding = stack[-1][0].output()
                action_summary = action_top.output()
                word_weights = self.attention(stack_embedding, buffer)
                buffer_embedding = dy.esum([
                    vector * attention_weight
                    for vector, attention_weight in zip(buffer, word_weights)
                ])
                for i in range(len(stack)):
                    if stack[len(stack) - 1 - i][1] == 'p':
                        parent_embedding = stack[len(stack) - 1 - i][2]
                        break
                parser_state = dy.concatenate([
                    buffer_embedding, stack_embedding, parent_embedding,
                    action_summary
                ])
                h = self.mlp_layer(parser_state)
                log_probs = dy.log_softmax(self.act_proj_layer(h),
                                           valid_actions)
                action = max(enumerate(log_probs.vec_value()),
                             key=itemgetter(1))[0]

            if action == _NT:
                if word_weights is not None:
                    #no prediction is made for ROOT
                    log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                    nt = max(enumerate(log_probs_nt.vec_value()),
                             key=itemgetter(1))[0]

                nt_embedding = self.nt_input_layer(self.nt_lookup[nt])

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                stack_state = stack_state.add_input(nt_embedding)
                stack.append((stack_state, 'p', nt_embedding))

                output_actions.append(self.act_vocab.token(action))
                output_tokens.append(self.nt_vocab.token(nt))

            elif action == _TER:
                log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
                ter = max(enumerate(log_probs_ter.vec_value()),
                          key=itemgetter(1))[0]
                ter_embedding = self.ter_input_layer(self.ter_lookup[ter])

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                stack_state = stack_state.add_input(ter_embedding)
                stack.append((stack_state, 'c', ter_embedding))

                output_actions.append(self.act_vocab.token(action))
                output_tokens.append(self.ter_vocab.token(ter))

            else:
                found_p = 0
                path_input = []
                while found_p != 1:
                    top = stack.pop()
                    top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                    path_input.append(top_raw_rep)
                    if top_label == 'p' or top_label == 'ROOT':
                        found_p = 1
                parent_rep = path_input.pop()
                composed_rep = self.subtree_input_layer(
                    dy.concatenate([dy.average(path_input), parent_rep]))

                stack_state, _, _ = stack[-1] if stack else (stack_top, 'ROOT',
                                                             stack_top)
                stack_state = stack_state.add_input(composed_rep)
                stack.append((stack_state, 'c', composed_rep))
                reduced = 1

                output_actions.append(self.act_vocab.token(action))

            action_embedding = self.act_input_layer(self.act_lookup[action])
            action_top = action_top.add_input(action_embedding)

            reducable = 1
            nt_allowed = 1
            ter_allowed = 1

            #reduce cannot follow nt
            if stack[-1][1] == 'p' or stack[-1][1] == 'ROOT':
                reducable = 0

            #nt is disabled if the maximum number of open non-terminals is reached
            count_p = 0
            for item in stack:
                if item[1] == 'p':
                    count_p += 1
            if count_p >= 10:
                nt_allowed = 0

            #ter is disabled if the maximum number of children under the open nt is reached
            count_c = 0
            for item in stack[::-1]:
                if item[1] == 'c':
                    count_c += 1
                else:
                    break
            if count_c >= 10:
                ter_allowed = 0

        return output_actions, output_tokens
Example No. 48
    def beam_search(self, pre_context, pos_context, entity, beam):
        embedded = self.embed_sentence(pre_context)
        pre_encoded = self.encode_sentence(self.encpre_fwd_lstm,
                                           self.encpre_bwd_lstm, embedded)

        embedded = self.embed_sentence(pos_context)
        pos_encoded = self.encode_sentence(self.encpos_fwd_lstm,
                                           self.encpos_bwd_lstm, embedded)

        w = dy.parameter(self.decoder_w)
        b = dy.parameter(self.decoder_b)

        w1_pre = dy.parameter(self.attention_w1_pre)
        h_pre = dy.concatenate_cols(pre_encoded)
        w1dt_pre = None

        w1_pos = dy.parameter(self.attention_w1_pos)
        h_pos = dy.concatenate_cols(pos_encoded)
        w1dt_pos = None

        try:
            entity_embedding = self.input_lookup[self.input2int[entity]]
        except KeyError:
            entity_embedding = self.input_lookup[self.input2int[self.EOS]]
        last_output_embeddings = self.output_lookup[self.output2int[self.EOS]]
        s = self.dec_lstm.initial_state().add_input(
            dy.concatenate([
                dy.vecInput(self.STATE_SIZE * 2), last_output_embeddings,
                entity_embedding
            ]))
        candidates = [{
            'sentence': [self.EOS],
            'prob': 0.0,
            'count_EOS': 0,
            's': s
        }]
        outputs = []

        i = 0
        while i < self.config['GENERATION'] and len(outputs) < beam:
            new_candidates = []
            for candidate in candidates:
                if candidate['count_EOS'] == 2:
                    outputs.append(candidate)

                    if len(outputs) == beam: break
                else:
                    # w1dt can be computed and cached once for the entire decoding phase
                    w1dt_pre = w1dt_pre or w1_pre * h_pre
                    w1dt_pos = w1dt_pos or w1_pos * h_pos

                    attention_pre = self.attend(h_pre, candidate['s'],
                                                w1dt_pre,
                                                self.attention_w2_pre,
                                                self.attention_v_pre)
                    attention_pos = self.attend(h_pos, candidate['s'],
                                                w1dt_pos,
                                                self.attention_w2_pos,
                                                self.attention_v_pos)

                    last_output_embeddings = self.output_lookup[
                        self.output2int[candidate['sentence'][-1]]]
                    vector = dy.concatenate([
                        self.hier_attend(attention_pre, attention_pos,
                                         candidate['s']),
                        last_output_embeddings, entity_embedding
                    ])
                    s = candidate['s'].add_input(vector)
                    out_vector = w * s.output() + b
                    probs = dy.softmax(out_vector).vec_value()
                    next_words = [{
                        'prob': e,
                        'index': probs.index(e)
                    } for e in sorted(probs, reverse=True)[:beam]]

                    for next_word in next_words:
                        word = self.int2output[next_word['index']]

                        new_candidate = {
                            'sentence': candidate['sentence'] + [word],
                            'prob':
                            candidate['prob'] + np.log(next_word['prob']),
                            'count_EOS': candidate['count_EOS'],
                            's': s
                        }

                        if word == self.EOS:
                            new_candidate['count_EOS'] += 1

                        new_candidates.append(new_candidate)
            candidates = sorted(new_candidates,
                                key=lambda x: x['prob'],
                                reverse=True)[:beam]
            i += 1

        if len(outputs) == 0:
            outputs = candidates

        # Length Normalization
        alpha = 0.6
        for output in outputs:
            length = len(output['sentence'])
            lp_y = ((5.0 + length)**alpha) / ((5.0 + 1.0)**alpha)

            output['prob'] = output['prob'] / lp_y

        outputs = sorted(outputs, key=lambda x: x['prob'], reverse=True)
        return list(map(lambda x: x['sentence'], outputs))
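
The `lp_y` term above is the length penalty of Wu et al. (2016): lp(Y) = (5 + |Y|)^alpha / (5 + 1)^alpha. Dividing each cumulative log-probability by it keeps the beam from unduly favoring short hypotheses. Factored out for clarity (the function name is illustrative):

def length_penalty(length, alpha=0.6):
    # Wu et al. (2016) length normalization for beam search
    return ((5.0 + length) ** alpha) / ((5.0 + 1.0) ** alpha)

# ranking score: candidate['prob'] / length_penalty(len(candidate['sentence']))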
Example No. 49
    def span_parse(self, words, oracle_actions, oracle_tokens, buffer,
                   stack_top, action_top):
        stack = []
        losses = []

        output_actions = []
        output_tokens = []

        nt_allowed = 1
        found_root = 0
        consecutive_nt = 0
        consecutive_ter = 0
        total_ter = 0

        _max_ter = len(words)
        _root = self.nt_vocab[oracle_tokens[-1]]

        #recursively generate the tree until training data is exhausted
        while not (found_root):
            valid_actions = []
            if len(stack) == 0:
                valid_actions += [_TER]
            if len(stack) >= 1 and consecutive_ter <= 5 and total_ter <= _max_ter:
                valid_actions += [_TER]
            if len(stack) >= 2:
                valid_actions += [_ACT]
            if len(stack) >= 1 and consecutive_nt <= 10:
                valid_actions += [_NT]

            if len(valid_actions) == 0: break
            action = valid_actions[0]
            #we make predictions when stack is not empty and _ACT is not the only valid action
            stack_embedding = (stack[-1][0].output()
                               if stack else self.initial_embedding())
            action_summary = (action_top.output()
                              if len(stack) > 0 else self.initial_embedding())
            word_weights = self.attention(stack_embedding, buffer)
            buffer_embedding = dy.esum([
                vector * attention_weight
                for vector, attention_weight in zip(buffer, word_weights)
            ])

            parser_state = dy.concatenate(
                [buffer_embedding, stack_embedding, action_summary])
            h = self.mlp_layer(parser_state)

            if len(valid_actions) > 0:
                log_probs = dy.log_softmax(self.act_proj_layer(h),
                                           valid_actions)
                assert action in valid_actions, "action not in scope"
                action = max(enumerate(log_probs.vec_value()),
                             key=itemgetter(1))[0]

            if action == _NT:
                #label span
                log_probs_nt = dy.log_softmax(self.nt_proj_layer(h))
                nt = max(enumerate(log_probs_nt.vec_value()),
                         key=itemgetter(1))[0]

                if nt == _root:
                    found_root = 1

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                parent_rep = self.nt_input_layer(self.nt_lookup[nt])

                top = stack.pop()
                top_raw_rep, top_label, top_rep = top[2], top[1], top[0]
                composed_rep = self.subtree_input_layer(
                    dy.concatenate([top_raw_rep, parent_rep]))
                stack_state = stack_state.add_input(composed_rep)
                stack.append((stack_state, 'p', composed_rep))

                consecutive_nt += 1
                consecutive_ter = 0
                output_actions.append(self.act_vocab.token(action))
                output_tokens.append(self.nt_vocab.token(nt))

            elif action == _TER:
                #generate terminal
                log_probs_ter = dy.log_softmax(self.ter_proj_layer(h))
                ter = max(enumerate(log_probs_ter.vec_value()),
                          key=itemgetter(1))[0]

                stack_state, label, _ = stack[-1] if stack else (stack_top,
                                                                 'ROOT',
                                                                 stack_top)
                ter_embedding = self.ter_input_layer(self.ter_lookup[ter])
                stack_state = stack_state.add_input(ter_embedding)
                stack.append((stack_state, 'c', ter_embedding))

                consecutive_nt = 0
                consecutive_ter += 1
                total_ter += 1
                output_actions.append(self.act_vocab.token(action))
                output_tokens.append(self.ter_vocab.token(ter))

            else:
                #extend span
                assert len(stack) >= 2
                # read the top state before popping, as in the other branches
                stack_state, _, _ = stack[-1] if stack else (stack_top, 'ROOT',
                                                             stack_top)
                top2 = stack.pop()
                top1 = stack.pop()
                top2_raw_rep = top2[2]
                top1_raw_rep = top1[2]
                span_rep = self.span_input_layer(
                    dy.concatenate([top2_raw_rep, top1_raw_rep]))
                stack_state = stack_state.add_input(span_rep)
                stack.append((stack_state, 'c', span_rep))

                consecutive_nt = 0
                consecutive_ter = 0
                output_actions.append(self.act_vocab.token(action))

            action_embedding = self.act_input_layer(self.act_lookup[action])
            action_top = action_top.add_input(action_embedding)

        return output_actions, output_tokens
Example No. 50
    def run(self,
            word_inputs,
            tag_inputs,
            arc_targets=None,
            rel_targets=None,
            isTrain=True):
        # inputs, targets: seq_len x batch_size
        def dynet_flatten_numpy(ndarray):
            return np.reshape(ndarray, (-1, ), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))

        if isTrain or arc_targets is not None:
            mask_1D = dynet_flatten_numpy(mask)
            # batched here means that the last dim is treated as batch dimension, both in input and output
            mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

        # TODO: be careful with _words_in_train
        # sum of the two embeddings: [Expression of dim=((embedding_dim,), batch_size)] * seq_len
        if self.pre_train_emb:
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) +
                dy.lookup_batch(self.pret_word_embs, w, update=False)
                for w in word_inputs
            ]  # sum of the two embeddings: [Expression] * seq_len
        else:
            word_embs = [
                dy.lookup_batch(
                    self.word_embs,
                    np.where(w < self._vocab.words_in_train, w,
                             self._vocab.UNK)) for w in word_inputs
            ]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
        if isTrain:
            top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

        W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
            self.mlp_dep_b)
        W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
            self.mlp_head_b)
        dep, head = leaky_relu(dy.affine_transform([
            b_dep, W_dep, top_recur
        ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
        if isTrain:
            dep, head = dy.dropout_dim(dep, 1,
                                       self.dropout_mlp), dy.dropout_dim(
                                           head, 1, self.dropout_mlp)
            # dim=1 means a whole row can be zeroed out in some cases; dim=0 drops columns, dim=1 drops rows, and the third dimension is the batch

        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = dy.parameter(self.arc_W)
        arc_logits = bilinear(dep_arc,
                              W_arc,
                              head_arc,
                              self.mlp_arc_size,
                              seq_len,
                              batch_size,
                              num_outputs=1,
                              bias_x=True,
                              bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = dy.reshape(arc_logits, (seq_len, ), seq_len *
                                     batch_size)  # flattened this way so the loss can be computed
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.npvalue().argmax(0)
        # seq_len x batch_size

        if isTrain or arc_targets is not None:
            # the loss is computed with the highest-scoring arcs; that does not
            # mean they are picked as the decoding result, but they must be penalized
            arc_correct = np.equal(arc_preds, arc_targets).astype(
                np.float32) * mask  # the mask is still in effect here
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = dynet_flatten_numpy(arc_targets)
            losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
            arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            arc_probs = np.transpose(
                np.reshape(
                    dy.softmax(flat_arc_logits).npvalue(),
                    (seq_len, seq_len, batch_size), 'F'))
            # #batch_size x #dep x #head

        W_rel = dy.parameter(self.rel_W)
        #dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
        #head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
        rel_logits = bilinear(dep_rel,
                              W_rel,
                              head_rel,
                              self.mlp_rel_size,
                              seq_len,
                              batch_size,
                              num_outputs=self._vocab.rel_size,
                              bias_x=True,
                              bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = dy.reshape(rel_logits,
                                     (seq_len, self._vocab.rel_size),
                                     seq_len * batch_size)
        # (#head x rel_size) x (#dep x batch_size)

        partial_rel_logits = dy.pick_batch(
            flat_rel_logits,
            targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
        # (rel_size) x (#dep x batch_size)

        if isTrain or arc_targets is not None:
            rel_preds = partial_rel_logits.npvalue().argmax(0)
            targets_1D = dynet_flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds, targets_1D).astype(
                np.float32) * mask_1D  # given the shapes here, the 1-D mask is needed
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
            rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

        if not isTrain:
            rel_probs = np.transpose(
                np.reshape(
                    dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                    (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
            # batch_size x #dep x #head x #nclasses

        if isTrain or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * dynet_flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if isTrain:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            # parse sentences one by one
            # agreed: this decoding step of parsing simply cannot be batched
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len],
                            rel_pred[1:sent_len]))  # index 0 really is ROOT, so it is skipped

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs
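
# Note (assumption, not shown in this snippet): dynet_flatten_numpy presumably
# flattens a (seq_len x batch_size) numpy array in column-major ('F') order so
# that its indices line up with the batched (seq_len,) reshape of the logits.
import numpy as np

def dynet_flatten_numpy(ndarray):
    return np.reshape(ndarray, (-1,), 'F')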
Ejemplo n.º 51
    def beam_search_decode(
        self,
        input_: str,
        encoded_input: List[int],
        beam_width: int,
        external_cg: bool = True,
    ):

        if not external_cg:
            dy.renew_cg()

        input_emb = self.input_embedding(encoded_input, is_training=False)
        bidirectional_emb = self.bidirectional_encoding(input_emb)[
            1:]  # drop BEGIN_WORD
        input_length = len(bidirectional_emb)
        decoder = self.dec.initial_state()

        beam: List[Hypothesis] = [
            Hypothesis(
                action_history=[BEGIN_WORD],
                alignment=0,
                decoder=decoder,
                negative_log_p=0.0,
                output=[],
            )
        ]

        hypothesis_length = 0
        complete_hypotheses = []

        while (beam and beam_width > 0
               and hypothesis_length <= MAX_ACTION_SEQ_LEN):

            expansions: List[Hypothesis] = []

            for hypothesis in beam:

                length_encoder_suffix = input_length - hypothesis.alignment
                valid_actions = self.compute_valid_actions(
                    length_encoder_suffix)
                # decoder
                decoder_input = dy.concatenate([
                    bidirectional_emb[hypothesis.alignment],
                    self.act_lookup[hypothesis.action_history[-1]],
                ])
                decoder = hypothesis.decoder.add_input(decoder_input)
                # classifier
                logits = self.pW * decoder.output() + self.pb
                log_probs_expr = dy.log_softmax(logits, valid_actions)
                log_probs = log_probs_expr.npvalue()

                for action in valid_actions:

                    log_p = (hypothesis.negative_log_p - log_probs[action]
                             )  # min heap, so minus

                    heapq.heappush(
                        expansions,
                        Expansion(action, decoder, hypothesis, log_p),
                    )

            beam: List[Hypothesis] = []

            for _ in range(beam_width):

                expansion: Expansion = heapq.heappop(expansions)
                from_hypothesis = expansion.from_hypothesis
                action = expansion.action
                action_history = list(from_hypothesis.action_history)
                action_history.append(action)
                output = list(from_hypothesis.output)

                # execute the action to update the transducer state
                action = self.vocab.decode_action(action)

                if isinstance(action, EndOfSequence):
                    # 1. COMPLETE HYPOTHESIS, REDUCE BEAM
                    complete_hypothesis = Output(
                        action_history=action_history,
                        output="".join(output),
                        log_p=-expansion.negative_log_p,
                    )  # undo min heap minus

                    complete_hypotheses.append(complete_hypothesis)
                    beam_width -= 1
                else:
                    # 2. EXECUTE ACTION AND ADD FULL HYPOTHESIS TO NEW BEAM
                    alignment = from_hypothesis.alignment

                    if isinstance(action, ConditionalCopy):
                        char_ = input_[alignment]
                        alignment += 1
                        output.append(char_)
                    elif isinstance(action, ConditionalDel):
                        alignment += 1
                    elif isinstance(action, ConditionalIns):
                        output.append(action.new)
                    elif isinstance(action, ConditionalSub):
                        alignment += 1
                        output.append(action.new)
                    else:
                        raise ValueError(f"Unknown action: {action}.")

                    hypothesis = Hypothesis(
                        action_history=action_history,
                        alignment=alignment,
                        decoder=expansion.decoder,
                        negative_log_p=expansion.negative_log_p,
                        output=output,
                    )

                    beam.append(hypothesis)

            hypothesis_length += 1

        if not complete_hypotheses:
            # nothing found because the model is very bad
            for hypothesis in beam:

                complete_hypothesis = Output(
                    action_history=hypothesis.action_history,
                    output="".join(hypothesis.output),
                    log_p=-hypothesis.negative_log_p,
                )  # undo min heap minus

                complete_hypotheses.append(complete_hypothesis)

        complete_hypotheses.sort(reverse=True)
        return complete_hypotheses
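
# Note (assumption, not part of the original snippet): Expansion must be
# comparable for heapq, a min-heap, to pop the lowest-cost expansion first.
# A minimal sketch consistent with the positional constructor calls above:
from dataclasses import dataclass, field
from typing import Any

@dataclass(order=True)
class Expansion:
    # only negative_log_p participates in comparisons
    action: int = field(compare=False)
    decoder: Any = field(compare=False)
    from_hypothesis: Any = field(compare=False)
    negative_log_p: float = 0.0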
Ejemplo n.º 52
    def build_tagging_graph(self, words):
        # parameters -> expressions
        self.w1 = dy.parameter(self.W1)
        self.b1 = dy.parameter(self.B1)
        ###############################
        self.xw1 = dy.parameter(self.xW1)
        self.xb1 = dy.parameter(self.xB1)
        self.xw2 = dy.parameter(self.xW2)
        self.xb2 = dy.parameter(self.xB2)

        # apply dropout
        if self.eval:
            self.disable_dropout()
        else:
            self.enable_dropout()

        # initialize the RNNs
        f_init = self.fwdRNN.initial_state()
        b_init = self.bwdRNN.initial_state()
        f2_init = self.fwdRNN2.initial_state()
        b2_init = self.bwdRNN2.initial_state()

        self.cf_init = self.cfwdRNN.initial_state()
        self.cb_init = self.cbwdRNN.initial_state()

        xf_init = self.xfwdRNN.initial_state()
        xb_init = self.xbwdRNN.initial_state()
        xf2_init = self.xfwdRNN2.initial_state()
        xb2_init = self.xbwdRNN2.initial_state()

        self.xcf_init = self.xcfwdRNN.initial_state()
        self.xcb_init = self.xcbwdRNN.initial_state()

        # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
        wembs = [self.word_rep(w) for w in words]
        cembs = [self.char_rep(w, self.cf_init, self.cb_init) for w in words]
        xembs = [dy.concatenate([w, c]) for w, c in zip(wembs, cembs)]

        # feed word vectors into biLSTM
        fw_exps = f_init.transduce(xembs)
        bw_exps = b_init.transduce(reversed(xembs))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
        ]

        # feed word vectors into biLSTM
        fw_exps = f2_init.transduce(bi_exps)
        bw_exps = b2_init.transduce(reversed(bi_exps))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
        ]

        # feed each biLSTM state to an MLP
        exps = []
        pos_hidden = []
        for xi in bi_exps:
            xh = self.w1 * xi
            #xh = dy.tanh(xh) + self.b1
            pos_hidden.append(xh)

        cembs = [self.char_rep(w, self.xcf_init, self.xcb_init) for w in words]
        xembs = [
            dy.concatenate(list(wcp)) for wcp in zip(wembs, cembs, pos_hidden)
        ]
        xfw_exps = xf_init.transduce(xembs)
        xbw_exps = xb_init.transduce(reversed(xembs))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b])
            for f, b in zip(xfw_exps, reversed(xbw_exps))
        ]

        # feed word vectors into biLSTM
        fw_exps = xf2_init.transduce(bi_exps)
        bw_exps = xb2_init.transduce(reversed(bi_exps))

        # biLSTM states
        bi_exps = [
            dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))
        ]

        exps = []
        for xi in bi_exps:
            xh = self.xw1 * xi
            xh = self.meta.activation(xh) + self.xb1
            xo = self.xw2 * xh + self.xb2
            exps.append(xo)

        return exps
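
# A hypothetical usage sketch for the tagger above: greedily decode one tag per
# word from the returned expressions. int2tag is an assumed id-to-tag mapping,
# not part of the original snippet.
import numpy as np
import dynet as dy

def tag_sentence(tagger, words, int2tag):
    dy.renew_cg()
    exps = tagger.build_tagging_graph(words)
    return [int2tag[int(np.argmax(e.npvalue()))] for e in exps]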
Ejemplo n.º 53
    def __call__(self, words_sequence, word2int, vocab, dataset="train"):

        # get prefix and suffix and sum them up with the word
        def add_sub_words_embd(word, word_embed):
            if len(word) <= 3:
                return dy.esum([word_embed])
            else:
                pref = False
                suff = False
                # check if the prefix exists in word2int; relevant for test/dev sets
                if word[:3] in word2int:
                    prefix_embd = lookup[word2int.get(word[:3])]
                    pref = True
                # check if the suffix exists in word2int; relevant for test/dev sets
                if word[-3:] in word2int:
                    suffix_embd = lookup[word2int.get(word[-3:])]
                    suff = True

                # sum the word vector with any existing prefix/suffix vectors
                if pref and suff:
                    sum_embd = dy.esum([prefix_embd, suffix_embd, word_embed])
                elif pref and not suff:
                    sum_embd = dy.esum([prefix_embd, word_embed])
                elif suff and not pref:
                    sum_embd = dy.esum([suffix_embd, word_embed])
                else:
                    sum_embd = dy.esum([word_embed])

                return sum_embd

        lookup = self.params["lookup"]

        sequence = []
        if dataset == "train":
            for word, label in words_sequence:
                char_embed = []
                if word not in vocab:  # for words not in vocab get char embeddings
                    word_chars = list(word)
                    for ch in word_chars:
                        char_embed.append(lookup[word2int.get(ch)])
                    s = dy.esum(char_embed)
                    sequence.append(add_sub_words_embd(word, s))
                else:
                    word_embed = lookup[word2int.get(word)]
                    sequence.append(add_sub_words_embd(word, word_embed))
        else:
            for word in words_sequence:
                char_embed = []
                if word not in vocab:  # for words not in vocab get char embeddings
                    word_chars = list(word)
                    for ch in word_chars:
                        char_embed.append(lookup[word2int.get(ch)])
                    s = dy.esum(char_embed)
                    sequence.append(add_sub_words_embd(word, s))
                else:
                    word_embed = lookup[word2int.get(word)]
                    sequence.append(add_sub_words_embd(word, word_embed))

        # convert the parameters into Expressions (add them to the graph)
        W = dy.parameter(self.params["W"])
        b = dy.parameter(self.params["b"])
        fw_lstm1 = self.fw_builder1.initial_state()
        bw_lstm1 = self.bw_builder1.initial_state()
        fw_lstm2 = self.fw_builder2.initial_state()
        bw_lstm2 = self.bw_builder2.initial_state()

        # get output vectors of all time steps for the first bi-lstm
        fw_lstm1_output = fw_lstm1.transduce(sequence)
        bw_lstm1_output = bw_lstm1.transduce(reversed(sequence))

        # concatenate backward vector to forward vector per each word
        bi1_output = [
            dy.concatenate([fw1, bw1])
            for fw1, bw1 in zip(fw_lstm1_output, reversed(bw_lstm1_output))
        ]

        # get output vectors of all time steps for the second bi-lstm
        fw_lstm2_output = fw_lstm2.transduce(bi1_output)
        bw_lstm2_output = bw_lstm2.transduce(reversed(bi1_output))

        # concatenate backward vector to forward vector per each 1st biLSTM vector
        bi2_output = [
            dy.concatenate([fw2, bw2])
            for fw2, bw2 in zip(fw_lstm2_output, reversed(bw_lstm2_output))
        ]

        # calc net output
        net_output = [dy.softmax(W * out + b) for out in bi2_output]

        return net_output
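
# A hypothetical training sketch for the model above: __call__ returns softmax
# distributions, so the per-word loss is the negative log-probability of the
# gold label. label2int is an assumed label-to-id mapping, not part of the
# original snippet.
import dynet as dy

def sentence_loss(model, tagged_sentence, word2int, vocab, label2int):
    dy.renew_cg()
    probs = model(tagged_sentence, word2int, vocab, dataset="train")
    golds = [label for _, label in tagged_sentence]
    losses = [-dy.log(dy.pick(p, label2int[y])) for p, y in zip(probs, golds)]
    return dy.esum(losses)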
Ejemplo n.º 54
    def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
        """
    returns the list of output Expressions obtained by adding the given inputs
    to the current state, one by one, to both the forward and backward RNNs,
    and concatenating.

    Args:
      es: an ExpressionSequence
    """
        es_list = [es]

        for layer_i, (fb, bb) in enumerate(self.builder_layers):
            reduce_factor = self._reduce_factor_for_layer(layer_i)

            if es_list[0].mask is None: mask_out = None
            else: mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

            if self.downsampling_method == "concat" and len(
                    es_list[0]) % reduce_factor != 0:
                raise ValueError(
                    f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                    f"but got sequence length={len(es_list[0])} for reduce_factor={reduce_factor}. "
                    f"Set Batcher's pad_src_to_multiple argument accordingly.")
            fs = fb.transduce(es_list)
            bs = bb.transduce(
                [ReversedExpressionSequence(es_item) for es_item in es_list])
            if layer_i < len(self.builder_layers) - 1:
                if self.downsampling_method == "skip":
                    es_list = [
                        ExpressionSequence(expr_list=fs[::reduce_factor],
                                           mask=mask_out),
                        ExpressionSequence(expr_list=bs[::reduce_factor][::-1],
                                           mask=mask_out)
                    ]
                elif self.downsampling_method == "concat":
                    es_len = len(es_list[0])
                    es_list_fwd = []
                    es_list_bwd = []
                    for i in range(0, es_len, reduce_factor):
                        for j in range(reduce_factor):
                            if i == 0:
                                es_list_fwd.append([])
                                es_list_bwd.append([])
                            es_list_fwd[j].append(fs[i + j])
                            es_list_bwd[j].append(bs[len(es_list[0]) -
                                                     reduce_factor + j - i])
                    es_list = [ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)] + \
                              [ExpressionSequence(expr_list=es_list_bwd[j], mask=mask_out) for j in range(reduce_factor)]
                else:
                    raise RuntimeError(
                        f"unknown downsampling_method {self.downsampling_method}"
                    )
            else:
                # concat final outputs
                ret_es = ExpressionSequence(expr_list=[
                    dy.concatenate([f, b])
                    for f, b in zip(fs, ReversedExpressionSequence(bs))
                ],
                                            mask=mask_out)

        self._final_states = [FinalTransducerState(dy.concatenate([fb.get_final_states()[0].main_expr(),
                                                                   bb.get_final_states()[0].main_expr()]),
                                                   dy.concatenate([fb.get_final_states()[0].cell_expr(),
                                                                   bb.get_final_states()[0].cell_expr()])) \
                              for (fb, bb) in self.builder_layers]

        return ret_es
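
# A quick illustration (not from the original project) of the subsampling
# arithmetic above: "skip" keeps every reduce_factor-th state between layers,
# while "concat" stacks groups of reduce_factor neighbouring states, which is
# why sequence lengths must be multiples of the total reduce factor.

def skip_lengths(seq_len, reduce_factor, num_layers):
    # sequence length seen by each layer when downsampling_method == "skip"
    lengths = [seq_len]
    for _ in range(num_layers - 1):
        lengths.append(-(-lengths[-1] // reduce_factor))  # ceiling division
    return lengths

# skip_lengths(8, 2, 3) -> [8, 4, 2]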
Ejemplo n.º 55
    def __call__(self, a, b, c):
        enc = [dy.rectify(self.a_mlp(a)),  # HOTFIX rectify here?
               dy.rectify(self.b_mlp(b)),
               dy.rectify(self.c_mlp(c))]
        enc = [dy.concatenate([dy.scalarInput(1), x]) for x in enc]
        return self.multilinear(*enc)
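
# Why a constant 1 is prepended to each encoding above: in the bilinear case,
# [1;a]^T W [1;b] = w_11 + w_12^T b + a^T w_21 + a^T W_22 b, so the augmented
# product contains all lower-order bias terms for free; the same expansion
# holds per argument of the trilinear form computed by self.multilinear.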
Ejemplo n.º 56
    def getWordEmbeddings(self,
                          sentence,
                          train,
                          options,
                          test_embeddings=defaultdict(lambda: {})):

        if self.elmo:
            sentence_text = " ".join([entry.form for entry in sentence[:-1]])

            elmo_sentence_representation = \
                self.elmo.get_sentence_representation(sentence_text)

        for i, root in enumerate(sentence):
            root.vecs = defaultdict(
                lambda: None
            )  # all vecs are None by default (possibly a little risky?)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 +
                                                               word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[
                            root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if not treebank_id in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]
            if self.elmo:
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["elmo"] = elmo_sentence_representation[i]
                else:
                    # TODO
                    root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["elmo"], root.vecs["pos"],
                    root.vecs["char"], root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)
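
# The word-dropout rule above, isolated for clarity: a training word seen c
# times is kept with probability c / (alpha + c) (alpha = 0.25 here), so rare
# words are replaced by the UNK index more often and the model learns to fall
# back on the other features for them.
import random

def use_unk(word_count, alpha=0.25):
    return random.random() > word_count / (alpha + word_count)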
Ejemplo n.º 57
0
def encode_sents(look, fwd, bwd, sents):
    embs = [[look[x] for x in sent] for sent in sents]
    return [
        dy.concatenate([fwd.transduce(x)[-1],
                        bwd.transduce(x)[-1]]) for x in embs
    ]
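
# A hypothetical setup for encode_sents above; sizes are illustrative. fwd and
# bwd are assumed to be objects exposing transduce(), e.g. initial states of
# single-layer LSTM builders. Note that encode_sents feeds the same token order
# to both LSTMs, so a true backward pass would require reversing upstream.
import dynet as dy

m = dy.ParameterCollection()
look = m.add_lookup_parameters((1000, 64))  # vocab size x embedding dim
fwd_builder = dy.LSTMBuilder(1, 64, 128, m)
bwd_builder = dy.LSTMBuilder(1, 64, 128, m)

dy.renew_cg()
sents = [[3, 14, 15], [9, 2, 6]]  # token ids
vecs = encode_sents(look, fwd_builder.initial_state(),
                    bwd_builder.initial_state(), sents)  # one 256-dim vector per sentence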
Ejemplo n.º 58
    def step(self, instances, enable_dropout=True):
        dy.renew_cg()

        if enable_dropout:
            self.l2r_builder.set_dropout(0.5)
            self.r2l_builder.set_dropout(0.5)
            self.dec_builder.set_dropout(0.5)
        else:
            self.l2r_builder.disable_dropout()
            self.r2l_builder.disable_dropout()
            self.dec_builder.disable_dropout()

        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        W1_att_f = dy.parameter(self.W1_att_f)
        W1_att_e = dy.parameter(self.W1_att_e)
        w2_att = dy.parameter(self.w2_att)

        #instances : a list [(src0,tgt0),(src1,tgt1),(src2,tgt2)]
        maxLen = max(map(lambda x: len(x[1]), instances))
        src_sents = []
        src_sents_rev = []
        tgt_sents = []
        srcSenLen = len(
            instances[0][0]) + 2  # length of each src sentence (all the same), plus start/end symbols
        tgtSenLen = maxLen + 1
        masks = [
            [] for i in range(tgtSenLen)
        ]  #mask for each position. each item in this list is a list with length=batchsize
        num_words = 0

        for item in instances:
            #item[0]:src ; item[1]:tgt
            num_words += (len(item[1]) + 1)
            padNum = maxLen - len(item[1])
            for i in range(len(item[1]) + 1):
                masks[i].append(1)
            for i in range(len(item[1]) + 1, tgtSenLen):
                masks[i].append(0)
            thisSrc = [startSymbol] + item[0] + [endSymbol]
            src_sents.append(thisSrc)
            src_sents_rev.append(list(reversed(thisSrc)))
            thisTgt = [startSymbol] + item[1] + \
                      [endSymbol for i in range(padNum + 1)]
            tgt_sents.append(thisTgt)

        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []
        for i in range(srcSenLen):
            batchSrc = dy.lookup_batch(
                self.src_lookup,
                [self.src_token_to_id[x[i]] for x in src_sents])
            batchSrc_rev = dy.lookup_batch(
                self.src_lookup,
                [self.src_token_to_id[x[i]] for x in src_sents_rev])
            l2r_state = l2r_state.add_input(batchSrc)
            r2l_state = r2l_state.add_input(batchSrc_rev)
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())

        r2l_contexts.reverse()

        # Combine the left and right representations for every word
        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))
        h_fs_matrix = dy.concatenate_cols(h_fs)

        losses = []

        # Decoder
        c_t = dy.vecInput(self.hidden_size * 2)
        start = dy.concatenate([
            dy.lookup_batch(self.tgt_lookup,
                            [self.tgt_token_to_id['</S>'] for i in tgt_sents]),
            c_t
        ])
        dec_state = self.dec_builder.initial_state().add_input(start)
        #loss = dy.pickneglogsoftmax_batch(W_y * dec_state.output() + b_y,[self.tgt_token_to_id[tgt_sent[0]] for tgt_sent in tgt_sents])
        #losses.append(loss)

        for i in range(tgtSenLen):
            #cw : item[i] nw:item[i+1]
            h_e = dec_state.output()
            c_t = self.__attention_mlp(h_fs_matrix, h_e, 1)[0]
            # Get the embedding for the current target word
            embed_t = dy.lookup_batch(
                self.tgt_lookup,
                [self.tgt_token_to_id[tgt_sent[i]] for tgt_sent in tgt_sents])
            # Create input vector to the decoder
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            o_en = dec_state.output()
            if enable_dropout:
                o_en = dy.dropout(o_en, 0.5)
            loss = dy.pickneglogsoftmax_batch(W_y * o_en + b_y, [
                self.tgt_token_to_id[tgt_sent[i + 1]] for tgt_sent in tgt_sents
            ])
            thisMask = dy.inputVector(masks[i])
            thisMask = dy.reshape(thisMask, (1, ), len(instances))
            losses.append(loss * thisMask)

        return dy.sum_batches(dy.esum(losses)), num_words
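
# Illustration of the masking above (not from the original snippet): for a
# batch of two targets with lengths [3, 1], maxLen = 3 and tgtSenLen = 4, so
#   masks == [[1, 1], [1, 1], [1, 0], [1, 0]]  # masks[i][j]: position i, batch element j
# Reshaping masks[i] to a (1,) expression with batch size 2 lets the
# element-wise product zero out the losses at padded positions.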
Ejemplo n.º 59
    def translate_sentence(self, sent):
        dy.renew_cg()

        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)
        W1_att_f = dy.parameter(self.W1_att_f)
        W1_att_e = dy.parameter(self.W1_att_e)
        w2_att = dy.parameter(self.w2_att)

        sent = [startSymbol] + sent + [endSymbol]
        sent_rev = list(reversed(sent))

        # Bidirectional representations
        l2r_state = self.l2r_builder.initial_state()
        r2l_state = self.r2l_builder.initial_state()
        l2r_contexts = []
        r2l_contexts = []

        for (cw_l2r, cw_r2l) in zip(sent, sent_rev):
            l2r_state = l2r_state.add_input(
                dy.lookup(self.src_lookup, self.src_token_to_id[cw_l2r]))
            r2l_state = r2l_state.add_input(
                dy.lookup(self.src_lookup, self.src_token_to_id[cw_r2l]))
            l2r_contexts.append(l2r_state.output())
            r2l_contexts.append(r2l_state.output())
        r2l_contexts.reverse()

        h_fs = []
        for (l2r_i, r2l_i) in zip(l2r_contexts, r2l_contexts):
            h_fs.append(dy.concatenate([l2r_i, r2l_i]))

        h_fs_matrix = dy.concatenate_cols(h_fs)

        # Decoder
        trans_sentence = [startSymbol]
        cw = trans_sentence[-1]
        #initial context
        c_t = dy.vecInput(self.hidden_size * 2)
        start = dy.concatenate(
            [dy.lookup(self.tgt_lookup, self.tgt_token_to_id[endSymbol]), c_t])
        dec_state = self.dec_builder.initial_state().add_input(start)
        while len(trans_sentence) < self.max_len:
            h_e = dec_state.output()
            getAttention = self.__attention_mlp(h_fs_matrix, h_e, 0)
            c_t = getAttention[0]
            embed_t = dy.lookup(self.tgt_lookup, self.tgt_token_to_id[cw])
            x_t = dy.concatenate([embed_t, c_t])
            dec_state = dec_state.add_input(x_t)
            y_star = dy.softmax(W_y * dec_state.output() + b_y).vec_value()
            next_wordID = np.argmax(y_star)
            cw = self.tgt_id_to_token[next_wordID]
            cpcw = cw  #store the original word for computing next word
            if cw == unkSymbol:
                #find the source word with highest attention score
                keyWord = sent[getAttention[1]]
                if self.src_token_to_id[keyWord] == self.src_token_to_id[
                        unkSymbol]:
                    cw = keyWord  # the source word is itself OOV: simply copy it through to the output
                else:
                    #find the target word with the second-highest prob
                    #prob: y_star
                    next_wordID = np.argpartition(y_star, -2)[-2]
                    cw = self.tgt_id_to_token[next_wordID]
            if cw == endSymbol:
                break
            if cw != startSymbol:
                trans_sentence.append(cw)
            cw = cpcw  #get the original cw

        return ' '.join(trans_sentence[1:])
Ejemplo n.º 60
    def batch_predict_next_best_action(self, config_batched, prev_action_batched, sentence_batch):
        """
        Greedily predicts the next transition for a batch of configs,
        the actions leading to those configs, and the related sentences.
        @param config_batched: a list of configurations
        @param prev_action_batched: a list of actions (or None if there are no prev actions)
        @param sentence_batch: a list of sentences
        @return a list of new configurations and a list of actions generating these new configs
        """
    
        B = len(config_batched)
        idxes = list(range(B))
        new_configs = [None] * B
        new_actions = [None] * B

        if prev_action_batched is None:
            prev_action_batched = [None]*B
                
        #(1) sort out the lexical and structural batches
        def is_lexical(config):
            S,F,B,A,prefix_score = config
            return F is None and len(B) > 0

        lexical_idxes    = [idx for idx in idxes if     is_lexical(config_batched[idx])]
        structural_idxes = [idx for idx in idxes if not is_lexical(config_batched[idx])]

        #(2) lexical predictions
        if len(lexical_idxes) > 0:

            def make_ref_lex_action(config,sentence):
                S,F,B,A,prefix_score = config
                return (ArcEagerGenerativeParser.GENERATE,sentence[B[0]])

            X = []
            Y = []
            for idx in lexical_idxes:
                x,y = self.make_representation(config_batched[idx],make_ref_lex_action(config_batched[idx],sentence_batch[idx]),sentence_batch[idx],structural=False)
                X.append(x)
                Y.append(y)

            Xt = zip(*X)    #transpose
        
            if self.tied:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
                xdense     = dy.concatenate(embeddings)
                preds      = dy.pickneglogsoftmax_batch(E * dy.tanh( W * xdense ),Y).npvalue()[0]
            else:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                O = dy.parameter(self.output_embeddings)
                embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
                xdense     = dy.concatenate(embeddings)
                preds      = dy.pickneglogsoftmax_batch(O * dy.tanh( W * xdense ),Y).npvalue()[0]

            preds = np.atleast_1d(preds)
                
            for pred_score, idx in zip(preds, lexical_idxes):
                new_configs[idx] = self.generate(config_batched[idx], local_score=-pred_score)  # executes the action
                new_actions[idx] = (ArcEagerGenerativeParser.GENERATE, sentence_batch[idx][config_batched[idx][2][0]])

        #(3) structural predictions
        if len(structural_idxes) > 0 :
            action_masks = np.array([self.mask_actions(config_batched[idx],prev_action_batched[idx],len(sentence_batch[idx])) for idx in structural_idxes])
            X = [self.make_representation(config_batched[idx],None,sentence_batch[idx],structural=True) for idx in structural_idxes]
            Xt = zip(*X)    #transpose
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            A = dy.parameter(self.action_weights)
            embeddings = [dy.pick_batch(E, xcol) for xcol in Xt]
            xdense     = dy.concatenate(embeddings)
            preds      = dy.softmax(A * dy.tanh( W * xdense )).npvalue().transpose()

            max_idxes  = np.argmax(preds * action_masks, axis=1)
            max_scores = np.log(preds[np.arange(preds.shape[0]), max_idxes])
            for argmax_idx, max_score, idx in zip(max_idxes, max_scores, structural_idxes):
                new_configs[idx] = self.actions[argmax_idx](config_batched[idx], local_score=max_score)  # executes the action
                new_actions[idx] = self.rev_action_codes[argmax_idx]
        return (new_configs, new_actions)
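
# A note on the structural argmax above: preds holds softmax probabilities,
# which are non-negative, so multiplying by the 0/1 action masks before
# np.argmax safely rules out invalid actions; with raw (possibly negative)
# logits this zero-masking trick would not be correct.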