Example #1
    def beam_search(self, char_seq, truth=None, mu=0.):
        start_agenda = Agenda(self.options['beam_size'])
        init_state = self.params['lstm'].initial_state().add_input(self.param_exprs['<bos>'])
        init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() + self.param_exprs['pb'])
        init_score = dy.scalarInput(0.)
        start_agenda.push(Sentence(score=init_score.scalar_value(),score_expr=init_score,LSTMState =init_state, y= init_y , prevState = None, wlen=None))
        agenda = [start_agenda]

        for idx, _ in enumerate(char_seq,1): # from left to right, character by character
            now = Agenda(self.options['beam_size'])
            for wlen in xrange(1,min(idx,self.options['max_word_len'])+1): # generate candidate word vectors
                word = self.word_repr(char_seq[idx-wlen:idx])
                word_score = dy.dot_product(word,self.param_exprs['U'])
                for sent in agenda[idx-wlen]: # join segmentation
                    if truth is not None:
                        margin = dy.scalarInput(mu*wlen if truth[idx-1]!=wlen else 0.)
                        score = margin + sent.score_expr + dy.dot_product(sent.y, word) + word_score 
                    else:
                        score = sent.score_expr + dy.dot_product(sent.y, word) + word_score 
                    
                    if now.happy_with(score.scalar_value()):
                        new_state = sent.LSTMState.add_input(word)
                        new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() + self.param_exprs['pb'])
                        now.push(Sentence(score=score.scalar_value(),score_expr=score,LSTMState=new_state,y=new_y, prevState=sent, wlen=wlen))
            agenda.append(now)

        if truth is not None:
            return agenda[-1].max().score_expr
        return agenda
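
The beam_search above depends on an Agenda beam container and a Sentence record that are not shown in this example. Below is a minimal sketch of what they might look like, assuming a fixed-size beam kept in a heap; the real classes in the original project may differ.

import heapq
import itertools
from collections import namedtuple

# Hypothetical beam entry matching the fields beam_search constructs.
Sentence = namedtuple('Sentence',
                      ['score', 'score_expr', 'LSTMState', 'y', 'prevState', 'wlen'])

class Agenda(object):
    """Keeps the beam_size best-scoring hypotheses (min-heap on score)."""
    def __init__(self, beam_size):
        self.beam_size = beam_size
        self.beam = []
        self._tie = itertools.count()  # tie-breaker so heapq never compares Sentences

    def happy_with(self, score):
        # Accept if the beam is not full, or the score beats the current worst.
        return len(self.beam) < self.beam_size or score > self.beam[0][0]

    def push(self, sent):
        heapq.heappush(self.beam, (sent.score, next(self._tie), sent))
        if len(self.beam) > self.beam_size:
            heapq.heappop(self.beam)  # evict the worst hypothesis

    def max(self):
        return max(self.beam)[-1]

    def __iter__(self):
        return (item[-1] for item in self.beam)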
Example #2
 def expr_for_tree(self, tree):
     if tree.isleaf():
         return self.E[self.w2i.get(tree.label,0)]
     if len(tree.children) == 1:
         assert(tree.children[0].isleaf())
         emb = self.expr_for_tree(tree.children[0])
         Wi,Wo,Wu   = [dy.parameter(w) for w in self.WS]
         bi,bo,bu,_ = [dy.parameter(b) for b in self.BS]
         i = dy.logistic(Wi*emb + bi)
         o = dy.logistic(Wo*emb + bo)
         u = dy.tanh(    Wu*emb + bu)
         c = dy.cmult(i,u)
         expr = dy.cmult(o,dy.tanh(c))
         return expr
     assert(len(tree.children) == 2),tree.children[0]
     e1 = self.expr_for_tree(tree.children[0])
     e2 = self.expr_for_tree(tree.children[1])
     Ui,Uo,Uu = [dy.parameter(u) for u in self.US]
     Uf1,Uf2 = [dy.parameter(u) for u in self.UFS]
     bi,bo,bu,bf = [dy.parameter(b) for b in self.BS]
     e = dy.concatenate([e1,e2])
     i = dy.logistic(Ui*e + bi)
     o = dy.logistic(Uo*e + bo)
     f1 = dy.logistic(Uf1*e1 + bf)
     f2 = dy.logistic(Uf2*e2 + bf)
     u = dy.tanh(    Uu*e + bu)
     c = dy.cmult(i,u) + dy.cmult(f1,e1) + dy.cmult(f2,e2)
     h = dy.cmult(o,dy.tanh(c))
     expr = h
     return expr
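
A hedged usage sketch for the Tree-LSTM above. The Tree class, vocabulary, and parameter shapes are illustrative assumptions; only the attribute names (E, w2i, WS, US, UFS, BS) are taken from the code itself, and expr_for_tree is assumed to be in scope as a plain function.

import dynet as dy

# Hypothetical minimal tree type matching the interface expr_for_tree expects.
class Tree(object):
    def __init__(self, label, children=None):
        self.label = label
        self.children = children or []
    def isleaf(self):
        return len(self.children) == 0

class TreeLSTMDemo(object):
    # Parameter shapes are illustrative guesses, not the original model's.
    def __init__(self, m, nwords=100, hdim=50):
        self.w2i = {'good': 1, 'movie': 2}   # index 0 is reserved for unknown words
        self.E = m.add_lookup_parameters((nwords, hdim))
        self.WS = [m.add_parameters((hdim, hdim)) for _ in 'iou']      # unary gates
        self.US = [m.add_parameters((hdim, 2 * hdim)) for _ in 'iou']  # binary gates
        self.UFS = [m.add_parameters((hdim, hdim)) for _ in 'ff']      # two forget gates
        self.BS = [m.add_parameters(hdim) for _ in 'ioub']             # shared biases

TreeLSTMDemo.expr_for_tree = expr_for_tree   # reuse the method defined above

dy.renew_cg()
model = TreeLSTMDemo(dy.ParameterCollection())
tree = Tree('S', [Tree('NP', [Tree('good')]), Tree('VP', [Tree('movie')])])
vec = model.expr_for_tree(tree)              # Expression of dimension (50,)
print(vec.npvalue().shape)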
Example #3
def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    # compute an additive attention score for each input state
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]
    # normalize the scores to alphas using softmax
    alphas = pc.softmax(pc.concatenate(scores))
    # compute the context vector c as the alpha-weighted sum of the input states
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # compute the output state h~ from c and the decoder's h_t
    # (global attention variant from Luong et al., 2015)
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))

    return h_output, alphas, W__a.value()
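
A sketch of how this attend might be wired up, assuming `pc` is the DyNet module alias used by the snippet; all dimensions and inputs below are made up for illustration.

import dynet as pc  # this snippet refers to DyNet as `pc`

HID = 100
m = pc.ParameterCollection()
W_c_p = m.add_parameters((HID, 2 * HID))   # combines [h_t; c]
v_a_p = m.add_parameters((1, HID))
W__a_p = m.add_parameters((HID, HID))
U__a_p = m.add_parameters((HID, HID))

pc.renew_cg()
W_c, v_a, W__a, U__a = [pc.parameter(p) for p in (W_c_p, v_a_p, W__a_p, U__a_p)]
blstm_outputs = [pc.inputVector([0.1] * HID) for _ in range(7)]  # fake encoder states
h_t = pc.inputVector([0.2] * HID)                                # fake decoder state

h_out, alphas, _ = attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a)
print(alphas.npvalue().sum())  # ~1.0: the attention weights form a distribution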
Example #4
 def __call__(self, x):
     W = dy.parameter(self.mw)
     b = dy.parameter(self.mb)
     W2 = dy.parameter(self.mw2)
     b2 = dy.parameter(self.mb2)
     mlp_output = W2 * (dy.tanh(W * x + b)) + b2
     if fDo_3_Layers:
         W3 = dy.parameter(self.mw3)
         b3 = dy.parameter(self.mb3)
         mlp_output = W3 * (dy.tanh(mlp_output)) + b3
     return dy.softmax(mlp_output)
Example #5
def generate(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent

    #get the output of the first LSTM
    src_outputs =  [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]

    src_output = src_outputs[-1]

    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix



    #generate until an eos tag is produced or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)



    for i in range(MAX_SENT_SIZE):
        #feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(probs)  # lowest negative log-probability = most likely word

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
Example #6
 def gate_and_next_vecs(self, ht1, ct1, xt):
     v = self.gate_vecs(ht1, xt)
     c = dy.cmult(ct1, v["f"]) + dy.cmult(v["ctilde"], v["i"])
     h = dy.cmult(dy.tanh(c), v["o"])
     res = v
     res.update({"c": c, "h": h})
     return res
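
gate_and_next_vecs calls a gate_vecs helper that this example omits; below is a minimal sketch of the missing method, with hypothetical parameter names (self.W_*, self.b_*).

import dynet as dy

def gate_vecs(self, ht1, xt):
    # Standard LSTM gates from the previous hidden state ht1 and the input xt;
    # each self.W_* maps the concatenated [xt; ht1] to the hidden dimension.
    x = dy.concatenate([xt, ht1])
    return {
        "i": dy.logistic(self.W_i * x + self.b_i),   # input gate
        "f": dy.logistic(self.W_f * x + self.b_f),   # forget gate
        "o": dy.logistic(self.W_o * x + self.b_o),   # output gate
        "ctilde": dy.tanh(self.W_c * x + self.b_c),  # candidate cell state
    }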
Example #7
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    #now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        #feed the previous target word into the decoder LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word
    return dy.esum(all_losses)
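
A typical training step around this calc_loss might look as follows; the trainer, model, and train_data names are assumptions about the surrounding script.

import dynet as dy

trainer = dy.AdamTrainer(model)   # `model` holds the builders and softmax parameters
for src, trg in train_data:       # each item is a (source ids, target ids) pair
    loss = calc_loss((src, trg))
    loss.value()                  # run the forward pass
    loss.backward()               # run the backward pass
    trainer.update()              # apply the gradient update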
Example #8
def generate(sent):
    dy.renew_cg()

    src = sent


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    #generate until an eos tag is produced or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for i in range(MAX_SENT_SIZE):
        #feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(probs)  # lowest negative log-probability = most likely word

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
Example #9
    def word_repr(self, char_seq):
        # obtain the word representation when given its character sequence
        wlen = len(char_seq)
        if 'rgW%d'%wlen not in self.param_exprs:
            self.param_exprs['rgW%d'%wlen] = dy.parameter(self.params['reset_gate_W'][wlen-1])
            self.param_exprs['rgb%d'%wlen] = dy.parameter(self.params['reset_gate_b'][wlen-1])
            self.param_exprs['cW%d'%wlen] = dy.parameter(self.params['com_W'][wlen-1])
            self.param_exprs['cb%d'%wlen] = dy.parameter(self.params['com_b'][wlen-1])
            self.param_exprs['ugW%d'%wlen] = dy.parameter(self.params['update_gate_W'][wlen-1])
            self.param_exprs['ugb%d'%wlen] = dy.parameter(self.params['update_gate_b'][wlen-1])
          
        chars = dy.concatenate(char_seq)
        reset_gate = dy.logistic(self.param_exprs['rgW%d'%wlen] * chars + self.param_exprs['rgb%d'%wlen])
        comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d'%wlen] * dy.cmult(reset_gate,chars) + self.param_exprs['cb%d'%wlen]),chars])
        update_logits = self.param_exprs['ugW%d'%wlen] * comb + self.param_exprs['ugb%d'%wlen]
        
        update_gate = dy.transpose(dy.concatenate_cols([dy.softmax(dy.pickrange(update_logits,i*(wlen+1),(i+1)*(wlen+1))) for i in xrange(self.options['ndims'])]))
        
        # The following implementation of the softmax function is not safe, but faster...
        #exp_update_logits = dy.exp(dy.reshape(update_logits,(self.options['ndims'],wlen+1)))
        #update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] *(wlen+1)))
        #assert (not np.isnan(update_gate.npvalue()).any())

        word = dy.sum_cols(dy.cmult(update_gate,dy.reshape(comb,(self.options['ndims'],wlen+1))))
        return word
Example #10
    def word_repr(self, char_seq, cembs):
        # obtain the word representation when given its character sequence

        wlen = len(char_seq)
        if 'rgW%d' % wlen not in self.param_exprs:
            self.param_exprs['rgW%d' % wlen] = dy.parameter(
                self.params['reset_gate_W'][wlen - 1])
            self.param_exprs['rgb%d' % wlen] = dy.parameter(
                self.params['reset_gate_b'][wlen - 1])
            self.param_exprs['cW%d' % wlen] = dy.parameter(
                self.params['com_W'][wlen - 1])
            self.param_exprs['cb%d' % wlen] = dy.parameter(
                self.params['com_b'][wlen - 1])

        chars = dy.concatenate(cembs)
        reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                                 self.param_exprs['rgb%d' % wlen])
        word = dy.tanh(self.param_exprs['cW%d' % wlen] *
                       dy.cmult(reset_gate, chars) +
                       self.param_exprs['cb%d' % wlen])
        if self.known_words is not None and tuple(
                char_seq) in self.known_words:
            return (word + dy.lookup(self.params['word_embed'],
                                     self.known_words[tuple(char_seq)])) / 2.
        return word
Example #11
def calc_score_of_history(words):
  # Lookup the embeddings and concatenate them
  emb = dy.concatenate([W_emb[x] for x in words])
  # Create the hidden layer
  h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
  # Calculate the score and return
  return dy.affine_transform([b_sm, W_sm, h])
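
The returned scores are unnormalized, so a training step pairs them with pickneglogsoftmax; a sketch, where hist, next_word, and trainer are assumed from the surrounding script.

import dynet as dy

dy.renew_cg()
s = calc_score_of_history(hist)             # hist: the N-1 previous word ids
loss = dy.pickneglogsoftmax(s, next_word)   # -log P(next_word | hist)
loss.value()
loss.backward()
trainer.update()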
Example #12
    def get_decode_loss(self, src_encodings, tgt_sents):
        W_s = dy.parameter(self.W_s)
        b_s = dy.parameter(self.b_s)
        W_h = dy.parameter(self.W_h)
        b_h = dy.parameter(self.b_h)
        W_y = dy.parameter(self.W_y)
        b_y = dy.parameter(self.b_y)

        tgt_words, tgt_masks = input_transpose(tgt_sents)
        batch_size = len(tgt_sents)

        s = self.dec_builder.initial_state(
            [dy.tanh(W_s * src_encodings[-1] + b_s)])
        ctx_tm1 = dy.vecInput(self.args.hidden_size * 2)
        losses = []

        # start from <S>, until y_{T-1}
        for t, (y_ref_t, mask_t) in enumerate(zip(tgt_words[1:],
                                                  tgt_masks[1:]),
                                              start=1):
            y_tm1_embed = dy.lookup_batch(self.tgt_lookup, tgt_words[t - 1])
            x = dy.concatenate([y_tm1_embed, ctx_tm1])
            s = s.add_input(x)
            h_t = s.output()
            ctx_t, alpha_t = self.attention(src_encodings, h_t, batch_size)

            # read_out = dy.tanh(W_h * dy.concatenate([h_t, ctx_t]) + b_h)
            read_out = dy.tanh(
                dy.affine_transform([b_h, W_h,
                                     dy.concatenate([h_t, ctx_t])]))
            if args.dropout > 0.:
                read_out = dy.dropout(read_out, args.dropout)
            y_t = W_y * read_out + b_y
            loss_t = dy.pickneglogsoftmax_batch(y_t, y_ref_t)

            if 0 in mask_t:
                mask_expr = dy.inputVector(mask_t)
                mask_expr = dy.reshape(mask_expr, (1, ), batch_size)
                loss_t = loss_t * mask_expr

            losses.append(loss_t)
            ctx_tm1 = ctx_t

        loss = dy.esum(losses)
        loss = dy.sum_batches(loss) / batch_size

        return loss
Example #13
  def __call__(self, translator, dec_state, src, trg):
    # TODO: apply trg.mask ?
    samples = []
    logsofts = []
    self.bs = []
    done = [False for _ in range(len(trg))]
    for _ in range(self.sample_length):
      dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
      if self.use_baseline:
        h_t = dy.tanh(translator.decoder.context_projector(dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
        self.bs.append(self.baseline(dy.nobackprop(h_t)))
      logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
      sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
      # Keep track of previously sampled EOS
      sample = [sample_i if not done_i else Vocab.ES for sample_i, done_i in zip(sample, done)]
      # Appending and feeding in the decoder
      logsoft = dy.pick_batch(logsoft, sample)
      logsofts.append(logsoft)
      samples.append(sample)
      dec_state = translator.decoder.add_input(dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
      # Check if we are done.
      done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
      if all(done):
        break

    samples = np.stack(samples, axis=1).tolist()
    self.eval_score = []
    for trg_i, sample_i in zip(trg, samples):
      # Removing EOS
      try:
        idx = sample_i.index(Vocab.ES)
        sample_i = sample_i[:idx]
      except ValueError:
        pass
      try:
        idx = trg_i.words.index(Vocab.ES)
        trg_i.words = trg_i.words[:idx]
      except ValueError:
        pass
      # Calculate the evaluation score
      score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
      self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    loss = LossBuilder()

    if self.use_baseline:
      for i, (score, _) in enumerate(zip(self.bs, logsofts)):
        logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
      loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))

    else:
        loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))

    if self.use_baseline:
      baseline_loss = []
      for bs in self.bs:
        baseline_loss.append(dy.squared_distance(self.true_score, bs))
      loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
    return loss
Example #14
    def compute_embeddings(self, word, runtime=True):
        x_list = []
        if not isinstance(word, unicode):
            uniword = unicode(word, 'utf-8')
        else:
            import copy
            uniword = copy.deepcopy(word)

        uniword = re.sub(r'\d', '0', uniword)
        for i in range(len(uniword)):
            char = uniword[i]
            if char.lower() == char and char.upper() == char:
                style_emb = dy.inputVector([1.0, 0.0, 0.0])  # does not support uppercase
            elif char.lower() == char:
                style_emb = dy.inputVector([0.0, 1.0, 0.0])  # is lowercased
            else:
                style_emb = dy.inputVector([0.0, 0.0, 1.0])  # is uppercased

            char = char.lower()
            if char in self.encodings.char2int:
                x_list.append(dy.concatenate([self.character_lookup[self.encodings.char2int[char]], style_emb]))
            else:
                x_list.append(dy.concatenate([self.character_lookup[self.encodings.char2int['<UNK>']], style_emb]))

        rnn_outputs = x_list
        rnn_states_fw = None
        rnn_states_bw = None
        for rnn_fw, rnn_bw in zip(self.rnn_fw, self.rnn_bw):
            fw = []
            bw = []
            if runtime:
                rnn_fw.set_dropouts(0, 0)
                rnn_bw.set_dropouts(0, 0)
            else:
                rnn_fw.set_dropouts(0, 0.33)
                rnn_bw.set_dropouts(0, 0.33)

            rnn_fw = rnn_fw.initial_state()
            rnn_bw = rnn_bw.initial_state()
            rnn_states_fw = []
            rnn_states_bw = []
            for x in rnn_outputs:
                rnn_fw = rnn_fw.add_input(x)
                rnn_states_fw.append(rnn_fw)
                fw.append(rnn_states_fw[-1].output())
            for x in reversed(rnn_outputs):
                rnn_bw = rnn_bw.add_input(x)
                rnn_states_bw.append(rnn_bw)
                bw.append(rnn_states_bw[-1].output())
            rnn_outputs = []
            for x1, x2 in zip(fw, reversed(bw)):
                rnn_outputs.append(dy.concatenate([x1, x2]))

        attention = self._attend(rnn_outputs, rnn_states_fw[-1], rnn_states_bw[-1])

        pre_linear = dy.concatenate([fw[-1], bw[-1], attention])
        embedding = dy.tanh(self.linearW.expr() * pre_linear + self.linearB.expr())

        return embedding, rnn_outputs
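
compute_embeddings calls a self._attend helper that is not shown; below is a plausible additive-attention sketch, with hypothetical parameters self.att_w and self.att_v (the original implementation may differ).

import dynet as dy

def _attend(self, rnn_outputs, fw_state, bw_state):
    # Query the character states with the final forward/backward states and
    # return their softmax-weighted sum. Parameter names are assumptions.
    query = dy.concatenate([fw_state.output(), bw_state.output()])
    scores = [self.att_v.expr() * dy.tanh(self.att_w.expr() * dy.concatenate([h, query]))
              for h in rnn_outputs]
    alphas = dy.softmax(dy.concatenate(scores))
    return dy.esum([h * dy.pick(alphas, j) for j, h in enumerate(rnn_outputs)])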
Example #15
    def __call__(self, inputs, is_train=True):
        ners, constituent_path, dep_path = inputs

        dy.renew_cg()

        #make ner a dynet expression
        ners_vec = dy.vecInput(LENGTH_OF_NER)
        ners_vec.set(ners)

        #get vector from lstm on constituent path
        if len(constituent_path) > 0:
            constituent_path = [
                self.word_embeds[x] if i % 2 == 0 else self.arrow_embeds[x]
                for i, x in enumerate(constituent_path)
            ]
            if is_train:
                constituent_path = [
                    dy.dropout(x, self.dropout) for x in constituent_path
                ]
            lstm_init1 = self.constituent_lstm.initial_state()
            cons_vec = lstm_init1.transduce(constituent_path)[-1]
        else:
            cons_vec = dy.vecInput(self.lstm_dim)

        #get vector from lstm on dependency path
        if len(dep_path) > 0:
            dep_vec = []
            for i, x in enumerate(dep_path):
                if i % 3 == 0:
                    dep_vec.append(self.word_embeds[x])
                elif i % 3 == 1:
                    dep_vec.append(self.arrow_embeds[x])
                else:
                    dep_vec.append(self.dep_embeds[x])
            if is_train:
                dep_vec = [dy.dropout(x, self.dropout) for x in dep_vec]
            lstm_init2 = self.dependency_lstm.initial_state()
            dep_vec = lstm_init2.transduce(dep_vec)[-1]
        else:
            dep_vec = dy.vecInput(self.lstm_dim)

        final_input = dy.concatenate([ners_vec, cons_vec, dep_vec])

        return dy.softmax(self.W3 * dy.tanh(
            self.W2 * dy.tanh(self.W1 * final_input + self.b1) + self.b2) +
                          self.b3)
Example #16
 def attend_with_prev(self, state, w1dt, prev_att):
     w2dt = self.attention_w2 * state
     w3dt = self.attention_w3 * prev_att
     unnormalized = dy.transpose(
         self.attention_v *
         dy.tanh(dy.colwise_add(dy.colwise_add(w1dt, w2dt), w3dt)))
     att_weights = dy.softmax(unnormalized)
     return att_weights
Example #17
    def predict_output(self, x):
        x_vector = dy.inputVector(x)

        f = dy.tanh(self.W * x_vector + self.b_bias)
        probs = dy.softmax(self.U * f + self.d_bias).npvalue()
        selection = np.random.choice(self.inp_dim, p=probs / probs.sum())

        return selection, probs[selection]
Example #18
 def predict(self, x):
     x = dy.inputVector(x)
     pred = ((self.U * dy.tanh(self.W * x + self.b))) + self.d
     softmax = dy.softmax(pred).npvalue()
     max_pos = heapq.nlargest(20,
                              range(len(softmax)),
                              key=softmax.__getitem__)
     return max_pos, softmax
Example #19
        def add_input(self, input_vec):

            x = dynet.concatenate([input_vec, self.h])

            i = dynet.logistic(self.W_i * x + self.b_i)
            f = dynet.logistic(self.W_f * x + self.b_f)
            g = dynet.tanh(self.W_c * x + self.b_c)
            o = dynet.logistic(self.W_o * x + self.b_o)

            c = dynet.cwise_multiply(f, self.c) + dynet.cwise_multiply(i, g)
            h = dynet.cwise_multiply(o, dynet.tanh(c))

            self.c = c
            self.h = h
            self.outputs.append(h)

            return self
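
This hand-rolled cell reads weights and state that must be created elsewhere; one plausible initializer is sketched below. Shapes are assumptions, and note that cwise_multiply is the older DyNet name for what current versions call cmult.

import dynet

class ManualLSTMState(object):
    # add_input above concatenates [input_vec, h], so every gate weight
    # maps (input_dim + hidden_dim) -> hidden_dim.
    def __init__(self, model, input_dim, hidden_dim):
        both = input_dim + hidden_dim
        # With DyNet >= 2.1, Parameters can be used directly in expressions.
        self.W_i = model.add_parameters((hidden_dim, both))
        self.W_f = model.add_parameters((hidden_dim, both))
        self.W_c = model.add_parameters((hidden_dim, both))
        self.W_o = model.add_parameters((hidden_dim, both))
        self.b_i = model.add_parameters(hidden_dim)
        self.b_f = model.add_parameters(hidden_dim)
        self.b_c = model.add_parameters(hidden_dim)
        self.b_o = model.add_parameters(hidden_dim)
        self.h = dynet.vecInput(hidden_dim)  # zero initial hidden state
        self.c = dynet.vecInput(hidden_dim)  # zero initial cell state
        self.outputs = []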
Example #20
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_src = dy.parameter(w1_att_src_p)
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
Example #21
 def __call__(self, inputs):
     lookup = self.E
     emb_vectors = [lookup[i] for i in inputs]
     net_input = dy.concatenate(emb_vectors)
     net_output = dy.softmax(self.pV *
                             (dy.tanh((self.pW * net_input) + self.pB_1)) +
                             self.pB_2)
     return net_output
Example #22
  def __call__(self, input_expr):
    W1 = dy.parameter(self.W1)
    W2 = dy.parameter(self.W2)
    b1 = dy.parameter(self.b1)
    b2 = dy.parameter(self.b2)

    h = dy.tanh(W1 * input_expr + b1)
    return W2 * h + b2
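
A sketch of the parameter setup this two-layer MLP expects; dimensions are illustrative, and attaching the method assumes __call__ above is in scope as a plain function.

import dynet as dy

class MLP(object):
    def __init__(self, model, in_dim, hid_dim, out_dim):
        self.W1 = model.add_parameters((hid_dim, in_dim))
        self.b1 = model.add_parameters(hid_dim)
        self.W2 = model.add_parameters((out_dim, hid_dim))
        self.b2 = model.add_parameters(out_dim)

MLP.__call__ = __call__   # reuse the method defined above

dy.renew_cg()
net = MLP(dy.ParameterCollection(), in_dim=32, hid_dim=64, out_dim=10)
out = net(dy.inputVector([0.0] * 32))   # Expression of dimension (10,)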
Example #23
	def get_gen_vocab_embedding(self,current_state_output, context_vector, w, b):
		voc_lookup = dy.parameter(self.gentokenLookup)
		# state = dy.concatenate([current_state.output(), context_vector])
		state = dy.concatenate([current_state_output,context_vector])
		s = dy.affine_transform([b, w, state])
		g = dy.tanh(s)
		s = dy.transpose(voc_lookup) * g
		return s
Example #24
    def predict_next_(self, state, *args, **kwargs):
        (R, bias, W_c, W__a, U__a, v__a) = self.cg_params

        # soft attention vector
        att_scores = [
            v__a * dy.tanh(W__a * state.output() + U__a * h_input)
            for h_input in self.biencoder
        ]
        alphas = dy.softmax(dy.concatenate(att_scores))
        c = dy.esum([
            h_input * dy.pick(alphas, j)
            for j, h_input in enumerate(self.biencoder)
        ])

        # softmax over vocabulary
        h_output = dy.tanh(W_c * dy.concatenate([state.output(), c]))
        return dy.softmax(R * h_output + bias)
Example #25
    def predict_next(self, scores=False, hidden=False):
        (R, bias, W_c, W__a, U__a, v__a) = self.cg_params

        # soft attention vector
        att_scores = [v__a * dy.tanh(W__a * self.s.output() + U__a * h_input) for h_input in self.biencoder]
        alphas = dy.softmax(dy.concatenate(att_scores))
        c = dy.esum([h_input * dy.pick(alphas, j) for j, h_input in enumerate(self.biencoder)])
            
        # softmax over vocabulary
        h_output = dy.tanh(W_c * dy.concatenate([self.s.output(), c]))
        if not hidden:
            if not scores:
                return dy.softmax(R * h_output + bias)
            else:
                return R * h_output + bias
        else:
            return h_output
Example #26
  def get_scores(self, mlp_dec_state):
    """Get scores given a current state.

    :param mlp_dec_state: An MlpSoftmaxDecoderState object.
    :returns: Scores over the vocabulary given this state.
    """
    h_t = dy.tanh(self.context_projector(dy.concatenate([mlp_dec_state.rnn_state.output(), mlp_dec_state.context])))
    return self.vocab_projector(h_t)
Example #27
    def calc_attention(self, state):
        V = dy.parameter(self.pV)
        U = dy.parameter(self.pU)

        h = dy.tanh(dy.colwise_add(self.WI, V * state))
        scores = dy.transpose(U * h)

        return dy.softmax(scores)
Example #28
 def attend(self, input_mat, state, w1dt):
     w2 = dy.parameter(self.attention_w2)
     v = dy.parameter(self.attention_v)
     w2dt = w2 * dy.concatenate(list(state.s()))
     att_weights = dy.softmax(
         dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt))))
     context = input_mat * att_weights
     return context
Example #29
    def __calc_attn_score(self, W1_att_f, W1_att_e, w2_att, h_fs_matrix, h_e):
        #print type(h_fs_matrix)
        h_e_matrix = dy.concatenate_cols(
            [h_e for i in range(h_fs_matrix.npvalue().shape[1])])
        layer_1 = dy.tanh(W1_att_f * h_fs_matrix + W1_att_e * h_e_matrix)

        #print 'continues'
        return dy.transpose(layer_1) * w2_att
Example #30
 def calc_scores(words):
     dy.renew_cg()
     word = words.index(1)
     h1 = dy.lookup(W_emb, word)
     h2 = dy.tanh(dy.parameter(W_h) * h1 + dy.parameter(b_h))
     W_softmax = dy.parameter(W_sm)
     b_softmax = dy.parameter(b_sm)
     return W_softmax * h2 + b_softmax
Example #31
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    w1_att_src = dy.parameter(w1_att_src_p)
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
Example #32
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs(
        [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append(
            [sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s(
        [src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the previous target words into the decoder LSTM
        current_state = current_state.add_input(
            dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1, ), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
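
The dy.reshape of the mask to shape (1,) with a batch dimension turns it into one scalar per batch element, so the multiplication zeroes out losses at padded positions. A tiny self-contained check of that trick:

import dynet as dy

dy.renew_cg()
loss = dy.inputTensor([[2.0, 3.0, 5.0]], batched=True)  # one scalar loss per batch element
mask = dy.reshape(dy.inputVector([1, 1, 0]), (1,), 3)   # batched scalar mask
masked = loss * mask
print(dy.sum_batches(masked).value())  # 5.0: the padded third element contributes nothing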
Example #33
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x]
                                            for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal of the log-covariance matrix from the encoder
    # (treating it as the log variance is easier for future implementation)
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(u(x), sigma(x)) || N(0, I)]
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var -
                                  dy.pow(mu, dy.inputVector([2])) -
                                  dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous target word into the decoder LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
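
This VAE loss calls a reparameterize helper that is not shown; the standard reparameterization trick, as a sketch (dy.random_normal draws fresh noise on each call):

import dynet as dy

def reparameterize(mu, log_var):
    # z = mu + sigma * eps with eps ~ N(0, I), where sigma = exp(0.5 * log_var)
    d = mu.dim()[0]                # ((hidden,), batch_size) -> (hidden,)
    eps = dy.random_normal(d)      # fresh standard-normal noise
    return mu + dy.cmult(dy.exp(log_var * 0.5), eps)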
Example #34
    def synthesize(self, mgc, batch_size, sample=True, temperature=1.0):
        synth = []
        total_audio_len = mgc.shape[0] * len(self.upsample_w_s)
        num_batches = total_audio_len // batch_size
        if total_audio_len % batch_size != 0:
            num_batches += 1
        last_rnn_state = None
        last_sample = 127
        w_index = 0
        last_proc = 0
        for iBatch in range(num_batches):
            dy.renew_cg()
            # bias=dy.inputVector([0]*self.RNN_SIZE)
            # gain=dy.inputVector([1.0]*self.RNN_SIZE)
            start = batch_size * iBatch
            stop = batch_size * (iBatch + 1)
            if stop >= total_audio_len:
                stop = total_audio_len - 1
            upsampled = self._upsample(mgc, start, stop)
            rnn = self.rnn.initial_state()
            if last_rnn_state is not None:
                rnn_state = [dy.inputVector(s) for s in last_rnn_state]
                rnn = rnn.set_s(rnn_state)

            out_list = []
            for index in range(stop - start):
                w_index += 1
                curr_proc = w_index * 100 // total_audio_len
                if curr_proc % 5 == 0 and curr_proc != last_proc:
                    last_proc = curr_proc
                    sys.stdout.write(' ' + str(curr_proc))
                    sys.stdout.flush()

                if self.OUTPUT_EMB_SIZE != 1:
                    rnn_input = dy.concatenate([self.output_lookup[last_sample], upsampled[index]])
                else:
                    rnn_input = dy.concatenate([dy.scalarInput(float(last_sample) / 127.0 - 1.0), upsampled[index]])
                rnn = rnn.add_input(rnn_input)
                rnn_output = rnn.output()  # dy.layer_norm(rnn.output(), gain, bias)
                hidden = rnn_output
                for w, b in zip(self.mlp_w, self.mlp_b):
                    hidden = dy.tanh(w.expr(update=True) * hidden + b.expr(update=True))
                softmax_output = dy.softmax(
                    self.softmax_w.expr(update=True) * hidden + self.softmax_b.expr(update=True))
                out_list.append(softmax_output)

                if sample:
                    last_sample = self._pick_sample(softmax_output.npvalue(),
                                                    temperature=temperature)  # np.argmax(softmax_output.npvalue())
                else:
                    last_sample = np.argmax(softmax_output.npvalue())
                # last_sample = np.argmax(softmax_output.npvalue())
                synth.append(last_sample)

            rnn_state = rnn.s()
            last_rnn_state = [s.value() for s in rnn_state]

        return synth
Example #35
def generate(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent

    # get the output of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x, y in
                   LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]

    src_output = src_outputs[-1]

    # gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    # generate until an eos tag is produced or the maximum length is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])

    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        probs = (-dy.log_softmax(s)).value()
        next_word = np.argmin(probs)  # lowest negative log-probability = most likely word

        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
Example #36
    def predict_logprobs(self,X,Y,structural=True,hidden_out=False):
        """
        Returns the log probabilities of the predictions for this model (batched version).

        @param X: the input indexes from which to predict (each xdatum is expected to be an iterable of integers) 
        @param Y: a list of references indexes for which to extract the prob
        @param structural: switches between structural and lexical logprob evaluation
        @param hidden_out: outputs an additional list of hidden dimension vectors
        @return: the list of predicted log-probabilities for each provided reference y in Y
        """
        assert(len(X) == len(Y))
        assert(all(len(x) == self.input_length for x in X))

        if structural:
            dy.renew_cg()
            W = dy.parameter(self.hidden_weights)
            E = dy.parameter(self.input_embeddings)
            A = dy.parameter(self.action_weights)
            
            batched_X  = zip(*X) #transposes the X matrix
            embeddings = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
            xdense     = dy.concatenate(embeddings)
            preds      = dy.pickneglogsoftmax_batch(A * dy.tanh( W * xdense ),Y).value()
            return [-ypred  for ypred in preds]

        else:#lexical
            if self.tied:
                dy.renew_cg()
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                batched_X  = zip(*X) #transposes the X matrix
                embeddings = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
                xdense     = dy.concatenate(embeddings)
                preds      = dy.pickneglogsoftmax_batch(E * dy.tanh( W * xdense ),Y).value()
                return [-ypred  for ypred in preds]
            else:
                dy.renew_cg()
                O = dy.parameter(self.output_embeddings)
                W = dy.parameter(self.hidden_weights)
                E = dy.parameter(self.input_embeddings)
                batched_X  = zip(*X) #transposes the X matrix
                embeddings = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
                xdense     = dy.concatenate(embeddings)
                preds      = dy.pickneglogsoftmax_batch(O * dy.tanh( W * xdense ),Y).value()
                return [-ypred  for ypred in preds]
Example #37
    def __attention_mlp(self, H_f, h_e, W1_att_e, W1_att_f, w2_att):

        # Calculate the alignment score vector
        a_t = dy.tanh(dy.colwise_add(W1_att_f * H_f, W1_att_e * h_e))
        a_t = w2_att * a_t
        a_t = a_t[0]
        alignment = dy.softmax(a_t)
        c_t = H_f * alignment
        return c_t
Example #38
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()
    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the previous target words into the decoder LSTM
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
Example #39
  def calc_attention(self, state):
    V = dy.parameter(self.pV)
    U = dy.parameter(self.pU)

    h = dy.tanh(dy.colwise_add(self.WI, V * state))
    scores = dy.transpose(U * h)
    normalized = dy.softmax(scores)
    self.attention_vecs.append(normalized)
    return normalized
Example #40
def mlp(rnn_output, params):
    w1 = params["w1"]
    w2 = params["w2"]
    b1 = params["b1"]
    b2 = params["b2"]

    l1 = dy.tanh((w1 * rnn_output) + b1)
    out = dy.softmax((w2 * l1) + b2)
    return out
Example #41
 def _attend(self, query, mask=None):
     # query ((H), B)
     # mask  ((T, 1), B)
     projected_state = self.decoder * query  # ((H,), B)
     non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
     attn_scores = dy.transpose(self.v * non_lin)  # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
     if mask is not None:
         attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
     return dy.softmax(attn_scores)  # ((T, 1), B)
Example #42
 def __attention_mlp_batch(self, H_f_batch, h_e_batch, W1_att_e, W1_att_f, w2_att):
     # H_f_batch: (2 * hidden_size, num_step, batch_size)
     # h_e_batch: (hidden_size, batch_size)
     a_t_batch = dy.tanh(dy.colwise_add(W1_att_f * H_f_batch, W1_att_e * h_e_batch)) # (attention_size, num_step, batch_size)
     a_t_batch = w2_att * a_t_batch  # (1, num_step, batch_size)
     a_t_batch = a_t_batch[0]  # (num_step, batch_size)
     alignment_batch = dy.softmax(a_t_batch)  # (num_step, batch_size)
     c_t_batch = H_f_batch * alignment_batch  # (2 * hidden_size, batch_size)
     return c_t_batch
Example #43
    def truth_score(self, word_seq):

        wembs = [self.param_exprs['<bos>']]+[self.word_repr(word) for word in word_seq]
        init_state = self.params['lstm'].initial_state()
        hidden_states = init_state.transduce(wembs)
        score = dy.scalarInput(0.)
        for h, w in zip(hidden_states[:-1],wembs[1:]):
            y = dy.tanh(self.param_exprs['pW'] * h + self.param_exprs['pb'])
            score = score + dy.dot_product(y, w) + dy.dot_product(w, self.param_exprs['U'])
        return score
Example #44
def calc_score_of_history(words, dropout=0.0):
  # Lookup the embeddings and concatenate them
  emb = dy.concatenate([W_emb[x] for x in words])
  # Create the hidden layer
  h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
  # CHANGE 2: perform dropout
  if dropout != 0.0:
    h = dy.dropout(h, dropout)
  # Calculate the score and return
  return dy.affine_transform([b_sm, W_sm, h])
Example #45
def calc_loss(sent):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src = sent[0]
    trg = sent[1]

    # initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mean = dy.parameter(W_mean_p)
    V_mean = dy.parameter(V_mean_p)
    b_mean = dy.parameter(b_mean_p)

    W_var = dy.parameter(W_var_p)
    V_var = dy.parameter(V_var_p)
    b_var = dy.parameter(b_var_p)

    # The mean vector from the encoder.
    mu = mlp(src_output, W_mean, V_mean, b_mean)
    # This is the diagonal of the log-covariance matrix from the encoder
    # (treating it as the log variance is easier for future implementation)
    log_var = mlp(src_output, W_var, V_var, b_var)

    # Compute KL[N(u(x), sigma(x)) || N(0, I)]
    # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)])
    prev_word = trg[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_word in trg[1:]:
        # feed the previous target word into the decoder LSTM
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()

        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    return kl_loss, softmax_loss
Example #46
def calc_score_of_histories(words, dropout=0.0):
  # This will change from a list of histories, to a list of words in each history position
  words = np.transpose(words)
  # Lookup the embeddings and concatenate them
  emb = dy.concatenate([dy.lookup_batch(W_emb, x) for x in words])
  # Create the hidden layer
  h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
  # Perform dropout
  if dropout != 0.0:
    h = dy.dropout(h, dropout)
  # Calculate the score and return
  return dy.affine_transform([b_sm, W_sm, h])
Example #47
 def expr_for_tree(self, tree):
     if tree.isleaf():
         return self.E[self.w2i.get(tree.label,0)]
     if len(tree.children) == 1:
         assert(tree.children[0].isleaf())
         expr = self.expr_for_tree(tree.children[0])
         return expr
     assert(len(tree.children) == 2),tree.children[0]
     e1 = self.expr_for_tree(tree.children[0])
     e2 = self.expr_for_tree(tree.children[1])
     W = dy.parameter(self.W)
     expr = dy.tanh(W*dy.concatenate([e1,e2]))
     return expr
Example #48
def attend(input_mat, state, w1dt):
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)

    # input_mat: (encoder_state x seqlen) => input vecs concatenated as cols
    # w1dt: (attdim x seqlen)
    # w2dt: (attdim x attdim)
    w2dt = w2*dy.concatenate(list(state.s()))
    # att_weights: (seqlen,) row vector
    unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    att_weights = dy.softmax(unnormalized)
    # context: (encoder_state)
    context = input_mat * att_weights
    return context
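
Because w1dt depends only on the encoder outputs, it is computed once per sentence and passed into attend at every decoder step. A hedged sketch of that loop; encoded, decoder, and target_embeddings are assumptions about the surrounding code.

import dynet as dy

input_mat = dy.concatenate_cols(encoded)   # encoder vectors as columns
w1 = dy.parameter(attention_w1)
w1dt = w1 * input_mat                      # (attdim x seqlen), cached once

s = decoder.initial_state()
for emb in target_embeddings:
    context = attend(input_mat, s, w1dt)   # reuses the cached w1dt each step
    s = s.add_input(dy.concatenate([emb, context]))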
Example #49
def attend(input_vectors, state):
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    attention_weights = []

    w2dt = w2*dy.concatenate(list(state.s()))
    for input_vector in input_vectors:
        attention_weight = v*dy.tanh(w1*input_vector + w2dt)
        attention_weights.append(attention_weight)
    attention_weights = dy.softmax(dy.concatenate(attention_weights))
    output_vectors = dy.esum([vector*attention_weight for vector, attention_weight in zip(input_vectors, attention_weights)])
    return output_vectors
Example #50
def attend2(blstm_outputs, s_prev, y_feedback, v_a, W_a, U_a, U_o, V_o, C_o):

    # attention mechanism - Bahdanau style
    # iterate through input states to compute alphas
    # print 'computing scores...'

    # W_a: hidden x hidden, U_a: hidden x 2 hidden, v_a: hidden, each score: scalar
    scores = [v_a * pc.tanh(W_a * s_prev + U_a * h_j) for h_j in blstm_outputs]
    alphas = pc.softmax(pc.concatenate(scores))

    # c_i: 2 hidden
    c_i = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])

    # U_o = 2l x hidden, V_o = 2l x input, C_o = 2l x 2 hidden
    attention_output_vector = U_o * s_prev + V_o * y_feedback + C_o * c_i

    return attention_output_vector, alphas
Example #51
def build_tagging_graph(words):
    dy.renew_cg()
    # parameters -> expressions
    H = dy.parameter(pH)
    O = dy.parameter(pO)

    # initialize the RNNs
    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()

    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()

    # get the word vectors. word_rep(...) returns a 128-dim vector expression for each word.
    wembs = [word_rep(w, cf_init, cb_init) for w in words]
    wembs = [dy.noise(we,0.2) for we in wembs] # optional

    # feed word vectors into biLSTM
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))
# OR
#    fw_exps = []
#    s = f_init
#    for we in wembs:
#        s = s.add_input(we)
#        fw_exps.append(s.output())
#    bw_exps = []
#    s = b_init
#    for we in reversed(wembs):
#        s = s.add_input(we)
#        bw_exps.append(s.output())

    # biLSTM states
    bi_exps = [dy.concatenate([f,b]) for f,b in zip(fw_exps, reversed(bw_exps))]

    # feed each biLSTM state to an MLP
    exps = []
    for x in bi_exps:
        r_t = O*(dy.tanh(H * x))
        exps.append(r_t)

    return exps
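
The per-word score expressions returned by build_tagging_graph feed a cross-entropy tagging loss; a sketch, assuming tags is the list of gold tag ids for the sentence.

import dynet as dy

def sent_loss(words, tags):
    exps = build_tagging_graph(words)
    # one softmax cross-entropy term per word
    errs = [dy.pickneglogsoftmax(e, t) for e, t in zip(exps, tags)]
    return dy.esum(errs)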
Example #52
pW1 = m.add_parameters((HIDDEN_SIZE, 2), device="GPU:1")
pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")

if len(sys.argv) == 2:
  m.populate_from_textfile(sys.argv[1])

dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)

x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1*x) + b1)
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2*h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
if xsent:
    y_pred = dy.logistic((V*h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1 
    F = 0 
else:
    y_pred = (V*h2_cpu) + a 
    loss = dy.squared_distance(y_pred, y)
    T = 1 
    F = -1

Example #53
def calc_scores(words):
  dy.renew_cg()
  h = dy.esum([dy.lookup(W_emb, x) for x in words])
  for W_h_i, b_h_i in zip(W_h, b_h):
    h = dy.tanh( W_h_i * h + b_h_i )
  return W_sm * h + b_sm
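
calc_scores returns unnormalized class scores, so training pairs it with pickneglogsoftmax; a sketch, where model and train_data are assumptions about the surrounding script.

import dynet as dy

trainer = dy.SimpleSGDTrainer(model)
for words, tag in train_data:     # words: list of word ids, tag: gold class id
    loss = dy.pickneglogsoftmax(calc_scores(words), tag)
    loss.value()
    loss.backward()
    trainer.update()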
Example #54
    def __init__(self, vocab, w2i, pos, rels, options):
        if isinstance(options, dict):
            options = _dict_to_obj(options, 'Values')

        self.model = ParameterCollection()
        random.seed(1)
        self.trainer = AdamTrainer(self.model)

        self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify,
                            'tanh3': (lambda x: tanh(cmult(cmult(x, x), x)))}
        self.activation = self.activations[options.activation]

        self.blstm_flag = options.blstmFlag
        self.labels_flag = options.labelsFlag
        self.costaug_flag = options.costaugFlag
        self.bibi_flag = options.bibiFlag

        self.ldims = options.lstm_dims
        self.wdims = options.wembedding_dims
        self.pdims = options.pembedding_dims
        self.rdims = options.rembedding_dims
        self.layers = options.lstm_layers
        self.words_count = vocab
        self.vocab = {word: ind + 3 for word, ind in list(w2i.items())}
        self.pos = {word: ind + 3 for ind, word in enumerate(pos)}
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.irels = rels

        if self.bibi_flag:
            self.builders = [LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                             LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]
            self.bbuilders = [LSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
                              LSTMBuilder(1, self.ldims * 2, self.ldims, self.model)]
        elif self.layers > 0:
            self.builders = \
                [LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model),
                 LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model)]
        else:
            self.builders = [SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                             SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]

        self.hidden_units = options.hidden_units
        self.hidden2_units = options.hidden2_units

        self.vocab['*PAD*'] = 1
        self.pos['*PAD*'] = 1

        self.vocab['*INITIAL*'] = 2
        self.pos['*INITIAL*'] = 2

        self.wlookup = self.model.add_lookup_parameters((len(vocab) + 3, self.wdims))
        self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
        self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))

        self.hid_layer_foh = self.model.add_parameters((self.hidden_units, self.ldims * 2))
        self.hid_layer_fom = self.model.add_parameters((self.hidden_units, self.ldims * 2))
        self.hid_bias = self.model.add_parameters((self.hidden_units))

        self.hid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
        self.hid2_bias = self.model.add_parameters((self.hidden2_units))

        self.out_layer = self.model.add_parameters(
            (1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))

        if self.labels_flag:
            self.rhid_layer_foh = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
            self.rhid_layer_fom = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
            self.rhid_bias = self.model.add_parameters((self.hidden_units))
            self.rhid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
            self.rhid2_bias = self.model.add_parameters((self.hidden2_units))
            self.rout_layer = self.model.add_parameters(
                (len(self.irels),
                 self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
            self.rout_bias = self.model.add_parameters((len(self.irels)))
Example #55
def calc_loss(sents):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]        
    max_src_len = np.max(src_len)
    num_words = 0

    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])


    #get the outputs of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]

    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix

    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    tgt_len = [len(sent) for sent in sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)



    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the previous target words into the decoder LSTM
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,),len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
Example #56
 def _combine(self, attn, query):
     comb = super(LuongAttention, self)._combine(attn, query)
     return dy.tanh(comb)
Example #57
 def _combine(self, attn, query):
     comb = super(ScaledDotProductAttention, self)._combine(attn, query)
     return dy.tanh(comb)
Example #58
 def _combine(self, attn, query):
     comb = super(DotProductAttention, self)._combine(attn, query)  # ((H,), B)
     return dy.tanh(comb)
Example #59
def mlp(x, W, V, b):
    # A mlp with only one hidden layer.
    return V * dy.tanh(W * x + b)
Example #60
ITERATIONS = 2000

m = dy.Model()
trainer = dy.SimpleSGDTrainer(m)

W = m.add_parameters((HIDDEN_SIZE, 2))
b = m.add_parameters(HIDDEN_SIZE)
V = m.add_parameters((1, HIDDEN_SIZE))
a = m.add_parameters(1)

if len(sys.argv) == 2:
  m.populate_from_textfile(sys.argv[1])

x = dy.vecInput(2)
y = dy.scalarInput(0)
h = dy.tanh((W*x) + b)
if xsent:
    y_pred = dy.logistic((V*h) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V*h) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1


for iter in range(ITERATIONS):
    mloss = 0.0
    for mi in range(4):