def __call__(self, input_exp, hidden_exp, mask=None):
    """One GRU step with input dropout (idrop) and fixed-mask dropout (gdrop).

    :param input_exp: input expression for this time step
    :param hidden_exp: dict holding the previous hidden state under key "H"
    :param mask: optional per-batch-element 0/1 sequence; where 0 (padding),
        the previous hidden state is passed through unchanged
    :return: dict {"H": new hidden state expression}
    """
    # two kinds of dropouts
    if self.idrop > 0.:
        input_exp = dy.dropout(input_exp, self.idrop)
    input_exp_g = input_exp_t = input_exp
    hidden_exp_g = hidden_exp_t = hidden_exp["H"]
    if self.gdrop > 0.:
        # pre-sampled dropout masks: separate masks for the gate path (g)
        # and the candidate/transform path (t)
        input_exp_g = dy.cmult(input_exp, self.masks[0])
        hidden_exp_g = dy.cmult(hidden_exp_g, self.masks[1])
        input_exp_t = dy.cmult(input_exp, self.masks[2])
        hidden_exp_t = dy.cmult(hidden_exp_t, self.masks[3])
    # reset and update gates computed jointly, then split in half
    rzt = dy.affine_transform([
        self.iparams["brz"], self.iparams["x2rz"], input_exp_g,
        self.iparams["h2rz"], hidden_exp_g
    ])
    rzt = dy.logistic(rzt)
    # FIX: use dy.pick_range for both halves (was BK.pick_range for the
    # second half — inconsistent with the first call in the same expression)
    rt, zt = dy.pick_range(rzt, 0, self.n_hidden), dy.pick_range(
        rzt, self.n_hidden, 2 * self.n_hidden)
    h_reset = dy.cmult(rt, hidden_exp_t)
    ht = dy.affine_transform([
        self.iparams["bh"], self.iparams["x2h"], input_exp_t,
        self.iparams["h2h"], h_reset
    ])
    ht = dy.tanh(ht)
    hidden = dy.cmult(zt, hidden_exp["H"]) + dy.cmult(
        (1. - zt), ht)  # first one use original hh
    # mask: if 0 then pass through
    if mask is not None:
        mask_array = np.asarray(mask).reshape((1, -1))
        m1 = dy.inputTensor(mask_array, True)  # 1.0 for real words
        m0 = dy.inputTensor(1.0 - mask_array, True)  # 1.0 for padding words (mask=0)
        hidden = hidden * m1 + hidden_exp["H"] * m0
    return {"H": hidden}
def attend(self, input_mat, state, w1dt, w2, v, coverage):
    """Score each column of input_mat against the decoder state and return
    (attention weights, context vector). When coverage is given, it is folded
    into the pre-computed key projection w1dt."""
    state_vec = dy.concatenate(list(state.s()))
    w2dt = w2 * state_vec
    keys = w1dt
    if coverage:
        keys = keys + self.w_cov * dy.transpose(coverage)
    raw_scores = v * dy.tanh(dy.colwise_add(keys, w2dt))
    a_t = dy.softmax(dy.transpose(raw_scores))
    context = input_mat * a_t
    return a_t, context
def __call__(self, x, h_matrix, noprob=False):
    """Multi-hop attention scorer: refine the query over `self.layers` hops,
    then emit a final score vector, softmax-normalized unless noprob."""
    query = x
    for hop in range(self.layers - 1):
        hop_scores = self.V[hop] * dy.tanh(
            self.W1[hop] * h_matrix + self.W2[hop] * query)
        hop_weights = dy.softmax(dy.transpose(hop_scores))
        context = h_matrix * hop_weights
        # next hop conditions on the original input plus the new context
        query = dy.concatenate([x, context])
    final = (self.V[-1] * dy.tanh(self.W1[-1] * h_matrix + self.W2[-1] * query)
             + self.B1 * h_matrix + self.B2 * query)
    if len(h_matrix.dim()[0]) > 1:
        # flatten a matrix-shaped score into a single vector
        final = dy.reshape(final,
                           (self.V[-1].dim()[0][0] * h_matrix.dim()[0][1], ))
    if noprob:
        return final
    return dy.softmax(final)
def get_top_k_paths(self, all_paths, relation_index, threshold):
    """Score every path and return, per relation, the paths sorted by score.

    :param all_paths: iterable of paths to score
    :param relation_index: mapping of relations (its length gives the number
        of score columns)
    :param threshold: minimum score to keep a path, or None to keep all
    :return: list (one entry per relation) of [(path, score), ...] sorted
        by descending score
    """
    builder = self.builder
    model_parameters = self.model_parameters
    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']
    path_scores = []
    for i, path in enumerate(all_paths):
        if i % 1000 == 0:
            # periodically renew the computation graph to bound memory use;
            # parameters must be re-loaded into the fresh graph
            # (fix: dropped unused `cg =` binding and unused `model` local)
            dy.renew_cg()
            W1 = dy.parameter(model_parameters['W1'])
            b1 = dy.parameter(model_parameters['b1'])
            W2 = None
            b2 = None
            if self.num_hidden_layers == 1:
                W2 = dy.parameter(model_parameters['W2'])
                b2 = dy.parameter(model_parameters['b2'])
        path_embedding = get_path_embedding(builder, lemma_lookup, pos_lookup,
                                            dep_lookup, dir_lookup, path)
        if self.use_xy_embeddings:
            # pad with zero x/y word vectors to match the training input shape
            zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
            path_embedding = dy.concatenate(
                [zero_word, path_embedding, zero_word])
        h = W1 * path_embedding + b1
        if self.num_hidden_layers == 1:
            h = W2 * dy.tanh(h) + b2
        path_score = dy.softmax(h).npvalue().T
        path_scores.append(path_score)
    path_scores = np.vstack(path_scores)
    top_paths = []
    for i in range(len(relation_index)):
        indices = np.argsort(-path_scores[:, i])
        top_paths.append([
            (all_paths[index], path_scores[index, i]) for index in indices
            if threshold is None or path_scores[index, i] >= threshold
        ])
    return top_paths
def attend(self, encoded_inputs, h_t, input_masks=None):
    """Compute an attention-weighted output vector for the decoder state h_t.

    :param encoded_inputs: list of encoder state expressions
    :param h_t: current decoder hidden state
    :param input_masks: optional per-position 0/1 masks when batching
    :return: (h_output, alphas) — the attentional output vector and the
        attention weights
    """
    # encoded_inputs dimension is: seq len x 2*h x batch size, h_t dimension is h x batch size (for bilstm encoder)
    if len(encoded_inputs) == 1:
        # no need to attend if only one input state, compute output directly
        h_output = dn.tanh(self.w_c * dn.concatenate([h_t, encoded_inputs[0]]))
        # return trivial alphas (all 1's since one input gets all attention)
        if input_masks:  # if batching
            alphas = dn.inputTensor([1] * len(input_masks[0]), batched=True)
        else:
            alphas = dn.inputTensor([1], batched=True)
        return h_output, alphas
    # iterate through input states to compute attention scores
    # scores = [v_a * dn.tanh(w_a * h_t + u_a * h_input) for h_input in blstm_outputs]
    # hoist the decoder-state projection out of the loop — it is the same
    # for every input position
    w_a_h_t = self.w_a * h_t
    scores = [
        self.v_a * dn.tanh(dn.affine_transform([w_a_h_t, self.u_a, h_input]))
        for h_input in encoded_inputs
    ]
    concatenated = dn.concatenate(scores)
    if input_masks:
        # if batching, multiply attention scores with input masks to zero-out scores for padded inputs
        # NOTE(review): zeroing happens before softmax, so padded positions
        # still get exp(0) mass rather than exactly zero — confirm intended
        dn_masks = dn.inputTensor(input_masks, batched=True)
        concatenated = dn.cmult(concatenated, dn_masks)
    # normalize scores
    alphas = dn.softmax(concatenated)
    # compute context vector with weighted sum for each seq in batch
    bo = dn.concatenate_cols(encoded_inputs)
    c = bo * alphas
    # c = dn.esum([h_input * dn.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # compute output vector using current decoder state and context vector
    h_output = dn.tanh(self.w_c * dn.concatenate([h_t, c]))
    return h_output, alphas
def __call__(self, sent, n, caches):
    """One attention step: score the cached encoder projections against the
    query `n`, and store the context vector and weights back into `caches`.

    Shapes below use the file's {(dims,), batch_size} notation.
    """
    # s: list(len==steps) of {(n_s,), batch_size}, n: {(n_h,), batch_size}
    # (re)build cached projections for this sentence if needed — presumably
    # caches["V"] holds W*S precomputed once per sentence; verify in helper
    caches = self._restart_caches(sent, caches)
    val_h = self.iparams["h2e"] * n  # {(n_hidden,), batch_size}
    att_hidden_bef = dy.colwise_add(
        caches["V"], val_h)  # {(n_didden, steps), batch_size}
    att_hidden = dy.tanh(att_hidden_bef)
    # if self.hdrop > 0:    # save some space
    #     att_hidden = dy.dropout(att_hidden, self.hdrop)
    # project to one score per step and flatten to a vector of length steps
    att_e = dy.reshape(self.iparams["v"] * att_hidden,
                       (BK.dims(caches["V"])[1], ),
                       batch_size=bs(att_hidden))
    att_alpha = dy.softmax(att_e)
    ctx = caches["S"] * att_alpha  # {(n_s, sent_len), batch_size}
    # append and return
    caches["ctx"] = ctx
    caches["att"] = att_alpha
    return caches
def build_network(params, x_data):
    """Build the MLP output expression for one example.

    :param params: (unused, E, b, U, W, bp) parameter tuple
    :param x_data: either a list of word ordinals, or a dict with keys
        'fullwords', 'prefix', 'suffix' (each a list of ordinals)
    :return: softmax output expression
    """
    _, E, b, U, W, bp = params
    # fix: isinstance instead of `type(x_data) == dict`
    if isinstance(x_data, dict):
        prefix_ordinals = x_data['prefix']
        suffix_ordinals = x_data['suffix']
        x_ordinals = x_data['fullwords']
    else:
        prefix_ordinals = None
        suffix_ordinals = None
        x_ordinals = x_data
    # fix: loop variable renamed — `ord` shadowed the builtin
    x = dy.concatenate([E[idx] for idx in x_ordinals])
    if prefix_ordinals:
        # add prefix-embedding signal on top of the full-word embeddings
        x_pre = dy.concatenate([E[idx] for idx in prefix_ordinals])
        x = x + x_pre
    if suffix_ordinals:
        x_suf = dy.concatenate([E[idx] for idx in suffix_ordinals])
        x = x + x_suf
    output = dy.softmax(U * (dy.tanh(W * x + b)) + bp)
    return output
def encoder_forward(self, src1, src2):
    """Encode both source sequences, precompute attention key projections,
    and build the initial decoder state.

    Returns (src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state).
    """
    emb1 = self.embed_idx(src1, self.src1_lookup)
    if self.single_source:
        # zero placeholder embeddings stand in for the absent second source
        emb2 = [dy.vecInput(EMBEDDING_DIM) for _ in src2]
    else:
        emb2 = self.embed_idx(src2, self.src2_lookup)
    enc1 = self.encode(emb1, self.enc1_fwd_lstm, self.enc1_bwd_lstm)
    enc2 = self.encode(emb2, self.enc2_fwd_lstm, self.enc2_bwd_lstm)
    src1_mat = dy.concatenate_cols(enc1)
    src1_w1dt = self.att1_w1 * src1_mat
    src2_mat = dy.concatenate_cols(enc2)
    src2_w1dt = self.att2_w1 * src2_mat
    # initial decoder state from the final encoder states
    if self.single_source:
        tail = dy.vecInput(2 * HIDDEN_DIM)
        start = self.W_s * dy.concatenate([enc1[-1], tail]) + self.b_s
    else:
        start = self.W_s * dy.concatenate([enc1[-1], enc2[-1]]) + self.b_s
    last_emb = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]
    ctx1 = dy.vecInput(2 * HIDDEN_DIM)
    ctx2 = dy.vecInput(2 * HIDDEN_DIM)
    decoder_state = self.dec_lstm.initial_state(
        [start, dy.tanh(start)]).add_input(
            dy.concatenate([ctx1, ctx2, last_emb]))
    return src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state
def __call__(self, x, tm1s=None, test=False):
    """GRU decoder step(s) with attention and maxout output.

    Test mode: x is the previous word embedding, tm1s = (s_tm1, c_tm1);
    returns (s_t, c_t, y_t) with y_t softmax-normalized.
    Train mode: x is the full list of word embeddings; returns the list of
    unnormalized output vectors (softmax applied by the loss elsewhere).
    """
    if test:
        # Initial states
        s_tm1 = tm1s[0]
        c_tm1 = tm1s[1]
        w_tm1 = x
        # GRU
        s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
            dy.concatenate([w_tm1, c_tm1])).output()
        # Attention — note scores use the *previous* state s_tm1
        e_t = dy.pick(
            self.va *
            dy.tanh(dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
        a_t = dy.softmax(e_t)
        c_t = dy.esum([
            dy.cmult(a_t_i, h_i)
            for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
        ])
        #c_t = self.hp*a_t # memory error?
        # Output
        r_t = dy.concatenate_cols([
            Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
            for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
        ])
        # Maxout
        m_t = dy.max_dim(r_t, d=1)
        y_t = dy.softmax(self.Wo * m_t)
        return s_t, c_t, y_t
    else:
        w_embs = x
        # Initial states
        s_tm1 = self.s_0
        c_tm1 = self.c_0
        GRU = self.GRUBuilder.initial_state().set_s([s_tm1])
        y = []
        for w_tm1 in w_embs:
            # GRU
            GRU = GRU.add_input(dy.concatenate([w_tm1, c_tm1]))
            s_t = GRU.output()
            # Attention
            e_t = dy.pick(
                self.va * dy.tanh(
                    dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)), 0)
            a_t = dy.softmax(e_t)
            c_t = dy.esum([
                dy.cmult(a_t_i, h_i)
                for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
            ])
            #c_t = self.hp*a_t # memory error?
            # Output
            r_t = dy.concatenate_cols([
                Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
                for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
            ])
            # Maxout
            m_t = dy.max_dim(r_t, d=1)
            # unnormalized here (no softmax), unlike the test branch
            y_t = self.Wo * m_t
            y.append(y_t)
            # t -> tm1
            s_tm1 = s_t
            c_tm1 = c_t
        return y
def set_initial_states(self, hp, hb_1):
    """Store encoder outputs hp and derive the decoder's initial states
    from the encoder state hb_1."""
    self.hp = hp
    # s_0 = tanh(Wd * hb_1 + bd); affine_transform computes the same thing
    self.s_0 = dy.tanh(dy.affine_transform([self.bd, self.Wd, hb_1]))
    self.c_0 = dy.zeroes((2 * self.hid_dim, ))
def get_features_for_tagging(self, sentence, training):
    """Build emission/transition feature expressions (and autoencoder
    reconstructions) for one sentence.

    :param sentence: iterable of (chars, word, feats, tag) tuples; `feats`
        is a numpy array of length self.featsize (soft gazetteer features)
    :param training: flag forwarded to the char CNN encoder
    :return: (features, t_features, feat_reconstruct) lists, one entry per
        token
    """
    # project raw gazetteer features to FEAT_OUT_SIZE per token
    word_feats = [
        dy.affine_transform(
            [
                self.feat_b,
                self.feat_w,
                dy.inputTensor(feats.reshape(self.featsize, 1)),
            ]
        )
        for chars, word, feats, tag in sentence
    ]
    # zero vectors used wherever gazetteer features are disabled
    zero_feats = [
        dy.inputTensor(np.zeros(shape=(FEAT_OUT_SIZE, 1)))
        for chars, word, feats, tag in sentence
    ]
    # Non-linear transform for soft gazetteer features
    if self.feat_func == "tanh":
        word_feats = [dy.tanh(feat) for feat in word_feats]
    elif self.feat_func == "relu":
        word_feats = [dy.rectify(feat) for feat in word_feats]
    # Soft gazetteer features at the LSTM level
    if self.lstm_feats:
        cur_feats = word_feats
    else:
        cur_feats = zero_feats
    # token representation: char-CNN + word embedding + (possibly zero) feats
    word_reps = [
        dy.concatenate(
            [self.cnn.encode(chars, training), self.word_embeds[word], enc_feat]
        )
        for enc_feat, (chars, word, feats, tag) in zip(cur_feats, sentence)
    ]
    contexts = self.word_lstm.transduce(word_reps)
    # Soft gazetteer features at the CRF level
    if self.crf_feats:
        cur_feats = word_feats
    else:
        cur_feats = zero_feats
    # per-token emission scores
    features = [
        dy.affine_transform(
            [
                self.context_to_emit_b,
                self.context_to_emit_w,
                dy.concatenate([context, feats]),
            ]
        )
        for context, feats in zip(contexts, cur_feats)
    ]
    # per-token transition score matrices (num_tags x num_tags)
    t_features = [
        dy.reshape(
            dy.affine_transform(
                [
                    self.context_to_trans_b,
                    self.context_to_trans_w,
                    dy.concatenate([context, feats]),
                ]
            ),
            (self.num_tags, self.num_tags),
        )
        for context, feats in zip(contexts, cur_feats)
    ]
    # Autoencoder feature reconstruction
    if self.lstm_feats:
        feat_reconstruct = [
            dy.logistic(
                dy.affine_transform(
                    [self.feat_reconstruct_b, self.feat_reconstruct_w, context]
                )
            )
            for context in contexts
        ]
    else:
        # reconstruction unused in this configuration — emit zeros
        feat_reconstruct = [
            dy.inputTensor(np.zeros(shape=(self.featsize,))) for context in contexts
        ]
    return features, t_features, feat_reconstruct
def do_one_sentence(encoder, decoder, params_encoder, params_decoder, sentence,
                    output, env, first, previous):
    """Teacher-forced decoding of one sentence against gold `output`.

    Encodes the environment worlds char-by-char and the input sentence
    word-by-word, then decodes with dot-product attention over sentence
    states and additive attention over world states, accumulating
    negative-log-likelihood loss against the gold tokens.

    :return: (loss expression, generated token list, final encoder output)

    NOTE(review): parameter `first` is unused; kept for interface
    compatibility with callers.
    """
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    sc_vector = []
    for i, world in enumerate(_state(env)):
        # (fix: removed no-op `world = world`; collapsed sc0 alias)
        sc = char_encoder.initial_state()
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        # world encoding = final char-RNN state + position embedding
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s = encoder.initial_state()
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split(' ')
    ]
    loss = []
    generate = []
    s_vector = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    index = 1
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    while True:
        # NOTE(review): dy_env is never consumed below — confirm whether the
        # environment embedding was meant to be fed to the decoder
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gt_y = vocab_out.index(output[index])
        # dot-product attention over sentence states
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        # additive attention over world (environment) states
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))
        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        prediction = np.argsort(probs.npvalue())[-1]
        if vocab_out[prediction] == '<start>':
            # never emit the start symbol; fall back to the runner-up
            prediction = np.argsort(probs.npvalue())[-2]
        generate.append(vocab_out[prediction])
        loss.append(-dy.log(dy.pick(probs, gt_y)))
        if output[index] == '<end>':
            break
        index += 1
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            # predicted end but gold continues: don't execute, keep decoding
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    loss = dy.esum(loss)
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return loss, generate, previous
def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                               output_word_ids, output_masks, batch_size):
    """Run the attentional decoder over a batch with teacher forcing and
    return the total (summed over time steps and batch) loss expression.

    :param encoded_inputs: encoder states (see attend() for shapes)
    :param input_masks: per-position input masks for attention
    :param output_word_ids: list over time steps; each entry is the batch's
        gold word ids for that step
    :param output_masks: per-step 0/1 masks for variable-length outputs
    :param batch_size: number of sequences in the batch
    """
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])
    # initialize the decoder rnn
    s_0 = self.decoder_rnn.initial_state()
    # initial "input feeding" vectors to feed decoder - 3*h
    init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)
    # initial feedback embeddings for the decoder, use begin seq symbol embedding
    init_feedback = dn.lookup_batch(
        self.output_lookup, [self.y2int[common.BEGIN_SEQ]] * batch_size)
    # init decoder rnn
    decoder_init = dn.concatenate([init_feedback, init_input_feeding])
    s = s_0.add_input(decoder_init)
    # loss per timestep
    losses = []
    # run the decoder through the output sequences and aggregate loss
    for i, step_word_ids in enumerate(output_word_ids):
        # returns h x batch size matrix
        decoder_rnn_output = s.output()
        # compute attention context vector for each sequence in the batch (returns 2h x batch size matrix)
        attention_output_vector, alphas = self.attend(
            encoded_inputs, decoder_rnn_output, input_masks)
        # compute output scores (returns vocab_size x batch size matrix)
        # h = readout * attention_output_vector + bias
        h = dn.affine_transform(
            [self.bias, self.readout, attention_output_vector])
        # encourage diversity by punishing highly confident predictions
        # TODO: support batching - esp. w.r.t. scalar inputs
        if self.diverse:
            soft = dn.softmax(dn.tanh(h))
            batch_loss = dn.pick_batch(-dn.log(soft), step_word_ids) \
                - dn.log(dn.scalarInput(1) - dn.pick_batch(soft, step_word_ids)) - dn.log(dn.scalarInput(4))
        else:
            # get batch loss for this timestep
            batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)
        # mask the loss if at least one sentence is shorter
        if output_masks and output_masks[i][-1] != 1:
            mask_expr = dn.inputVector(output_masks[i])
            # noinspection PyArgumentList
            mask_expr = dn.reshape(mask_expr, (1, ), batch_size)
            batch_loss = batch_loss * mask_expr
        # input feeding approach - input h (attention_output_vector) to the decoder
        # prepare for the next iteration - "feedback"
        feedback_embeddings = dn.lookup_batch(self.output_lookup,
                                              step_word_ids)
        decoder_input = dn.concatenate(
            [feedback_embeddings, attention_output_vector])
        s = s.add_input(decoder_input)
        losses.append(batch_loss)
    # sum the loss over the time steps and batch
    total_batch_loss = dn.sum_batches(dn.esum(losses))
    return total_batch_loss
def predict_beamsearch(self, encoder, input_seq):
    """Beam-search decoding for one input sequence.

    :param encoder: encoder providing encode_batch()
    :param input_seq: input token sequence; empty input returns []
    :return: (nbest list of (sequence, probability), attention matrix rows)
    """
    if len(input_seq) == 0:
        return []
    dn.renew_cg()
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])
    alphas_mtx = []
    # encode input sequence
    blstm_outputs, input_masks = encoder.encode_batch([input_seq])
    # complete sequences and their probabilities
    final_states = []
    # initialize the decoder rnn
    s_0 = self.decoder_rnn.initial_state()
    # holds beam step index mapped to (sequence, probability, decoder state, attn_vector) tuples
    beam = {-1: [([common.BEGIN_SEQ], 1.0, s_0, self.init_lookup[0])]}
    i = 0
    # expand another step if didn't reach max length and there's still beams to expand
    #while i < self.max_prediction_len and len(beam[i - 1]) > 0:
    while ((self.max_prediction_len is None) or
           (i < self.max_prediction_len)) and len(beam[i - 1]) > 0:
        # create all expansions from the previous beam:
        new_hypos = []
        for hypothesis in beam[i - 1]:
            prefix_seq, prefix_prob, prefix_decoder, prefix_attn = hypothesis
            last_hypo_symbol = prefix_seq[-1]
            # cant expand finished sequences
            if last_hypo_symbol == common.END_SEQ:
                continue
            # expand from the last symbol of the hypothesis
            try:
                prev_output_vec = self.output_lookup[
                    self.y2int[last_hypo_symbol]]
            except KeyError:
                # not a known symbol
                # (fix: parenthesized print — valid in both py2 and py3
                # for a single-string argument)
                print('impossible to expand, key error: ' +
                      str(last_hypo_symbol))
                continue
            decoder_input = dn.concatenate([prev_output_vec, prefix_attn])
            s = prefix_decoder.add_input(decoder_input)
            decoder_rnn_output = s.output()
            # perform attention step
            attention_output_vector, alphas = self.attend(
                blstm_outputs, decoder_rnn_output)
            # save attention weights for plotting
            # TODO: add attention weights properly to allow building the attention matrix for the best path
            if self.plot:
                val = alphas.vec_value()
                alphas_mtx.append(val)
            # compute output probabilities
            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])
            # TODO: understand why diverse needs tanh before softmax
            if self.diverse:
                h = dn.tanh(h)
            probs = dn.softmax(h)
            probs_val = probs.npvalue()
            # TODO: maybe should choose nbest from all expansions and not only from nbest of each hypothesis?
            # find best candidate outputs
            n_best_indices = common.argmax(probs_val, self.beam_size)
            for index in n_best_indices:
                p = probs_val[index]
                new_seq = prefix_seq + [self.int2y[index]]
                new_prob = prefix_prob * p
                #if new_seq[-1] == common.END_SEQ or i == self.max_prediction_len - 1:
                if new_seq[-1] == common.END_SEQ or (
                        (self.max_prediction_len is not None) and
                        (i == self.max_prediction_len - 1)):
                    # TODO: add to final states only if fits in k best?
                    # if found a complete sequence or max length - add to final states
                    final_states.append((new_seq[1:-1], new_prob))
                else:
                    new_hypos.append(
                        (new_seq, new_prob, s, attention_output_vector))
        # add the most probable expansions from all hypotheses to the beam
        new_probs = np.array([p for (s, p, r, a) in new_hypos])
        argmax_indices = common.argmax(new_probs, self.beam_size)
        beam[i] = [new_hypos[l] for l in argmax_indices]
        i += 1
    # get nbest results from final states found in search
    # NOTE(review): if final_states is empty this indexes an empty array —
    # presumably unreachable given END_SEQ/max-length handling; confirm
    final_probs = np.array([p for (s, p) in final_states])
    argmax_indices = common.argmax(final_probs, self.beam_size)
    nbest_seqs = [final_states[l] for l in argmax_indices]
    return nbest_seqs, alphas_mtx
def predict_greedy(self, encoder, input_seq):
    """Greedy decoding for one input sequence.

    :param encoder: encoder providing encode_batch()
    :param input_seq: input token sequence; empty input returns []
    :return: (predicted symbol sequence without END_SEQ, attention rows)
    """
    dn.renew_cg()
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])
    alphas_mtx = []
    if len(input_seq) == 0:
        return []
    # encode input sequence
    blstm_outputs, input_masks = encoder.encode_batch([input_seq])
    # initialize the decoder rnn
    s = self.decoder_rnn.initial_state()
    # set prev_output_vec for first lstm step as BEGIN_WORD concatenated with special padding vector
    prev_output_vec = dn.concatenate([
        self.output_lookup[self.y2int[common.BEGIN_SEQ]], self.init_lookup[0]
    ])
    predicted_sequence = []
    i = 0
    # run the decoder through the sequence and predict output symbols
    while (self.max_prediction_len is None) or (i <
                                                self.max_prediction_len):
        # get current h of the decoder
        s = s.add_input(prev_output_vec)
        decoder_rnn_output = s.output()
        # perform attention step
        attention_output_vector, alphas = self.attend(
            blstm_outputs, decoder_rnn_output)
        if self.plot:
            # save attention weights for plotting
            val = alphas.vec_value()
            alphas_mtx.append(val)
        # compute output probabilities
        # h = readout * attention_output_vector + bias
        h = dn.affine_transform(
            [self.bias, self.readout, attention_output_vector])
        # TODO: understand why diverse needs tanh before softmax
        if self.diverse:
            h = dn.tanh(h)
        probs = dn.softmax(h)
        # find best candidate output - greedy
        next_element_index = np.argmax(probs.npvalue())
        predicted_sequence.append(self.int2y[next_element_index])
        # check if reached end of word
        if predicted_sequence[-1] == common.END_SEQ:
            break
        # prepare for the next iteration - "feedback"
        prev_output_vec = dn.concatenate([
            self.output_lookup[next_element_index], attention_output_vector
        ])
        i += 1
    # remove the end seq symbol
    return predicted_sequence[0:-1], alphas_mtx
def __call__(self, s_t, h_matrix):
    """Single-hop attention: weight the columns of h_matrix by their match
    with the state s_t and return the resulting context vector."""
    scores = self.v * dy.tanh(self.W1 * h_matrix + self.W2 * s_t)
    alphas = dy.softmax(dy.transpose(scores))
    return h_matrix * alphas
def process_one_instance(builder, model, model_parameters, instance,
                         path_cache, update=True, dropout=0.0,
                         x_y_vectors=None, num_hidden_layers=0):
    """
    Return the LSTM output vector of a single term-pair - the average path embedding
    :param builder: the LSTM builder
    :param model: the LSTM model
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers The number of hidden layers for the term-pair classification network
    :return: the LSTM output vector of a single term-pair
    """
    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    W2 = None
    b2 = None
    if num_hidden_layers == 1:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])
    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']
    # Use the LSTM output vector and feed it to the MLP
    # Add the empty path
    # NOTE(review): `paths` aliases `instance`, so adding EMPTY_PATH mutates
    # the caller's Counter — confirm callers tolerate this side effect
    paths = instance
    if len(paths) == 0:
        paths[EMPTY_PATH] = 1
    # Compute the averaged path
    # (fix: builtin sum instead of reduce(lambda x, y: x + y, ...))
    num_paths = sum(instance.itervalues())
    path_embbedings = [
        get_path_embedding_from_cache(path_cache, builder, lemma_lookup,
                                      pos_lookup, dep_lookup, dir_lookup,
                                      path, update, dropout) * count
        for path, count in instance.iteritems()
    ]
    input_vec = dy.esum(path_embbedings) * (1.0 / num_paths)
    # Concatenate x and y embeddings
    if x_y_vectors is not None:
        x_vector, y_vector = dy.lookup(lemma_lookup,
                                       x_y_vectors[0]), dy.lookup(
                                           lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])
    h = W1 * input_vec + b1
    if num_hidden_layers == 1:
        h = W2 * dy.tanh(h) + b2
    output = dy.softmax(h)
    return output
def generator(encoder, decoder, params_encoder, params_decoder, sentence, env,
              first, previous):
    """Free-running decoding (no gold output): generate tokens until <end>
    or 10 steps, rejecting candidates whose execution against `env` fails.

    :return: (generated token list, final encoder output)

    NOTE(review): parameter `first` is unused; kept for interface
    compatibility with callers.
    """
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]
    sc_vector = []
    for i, world in enumerate(_state(env)):
        # (fix: removed no-op `world = world`; collapsed sc0 alias)
        sc = char_encoder.initial_state()
        for char in world:
            sc = sc.add_input(char_lookup[char2int[char]])
        sc_vector.append(dy.concatenate([sc.output(), pos_lookup[i]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)
    s = encoder.initial_state()
    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]
    sentence = sentence + ' <end>'
    sentence = [
        vocab.index(c) if c in vocab else vocab.index('<unknown>')
        for c in sentence.split()
    ]
    s_vector = []
    generate = []
    for word in sentence:
        s = s.add_input(lookup[word])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    encode_output = s.output()
    dy_s_vector = dy.concatenate(s_vector, d=1)
    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    input_word = "<start>"
    _lookup = params_decoder["lookup"]
    repeat = 0
    while True:
        # NOTE(review): dy_env is never consumed below — confirm whether the
        # environment embedding was meant to be fed to the decoder
        dy_env = dy.inputTensor(get_state_embed3(env))
        repeat += 1
        word = vocab_out.index(input_word)
        # dot-product attention over sentence states
        weight = dy.softmax(
            dy.concatenate([dy.dot_product(x, _s.output()) for x in s_vector]))
        # additive attention over world (environment) states
        weight_char = dy.softmax(
            dy.concatenate([
                char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
                for x in sc_vector
            ]))
        encode_state = dy_sc_vector * weight_char
        encode_output = dy_s_vector * weight
        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)
        # scan candidates from most to least probable, skipping <start> and
        # candidates whose execution fails; give up after 49 tries
        top = 0
        while True:
            top += 1
            if top == 50:
                top = 1
                break
            prediction = np.argsort(probs.vec_value())[-top]
            if vocab_out[prediction] == '<end>':
                break
            if vocab_out[prediction] == '<start>':
                continue
            new_env = str(execute(env, [vocab_out[prediction]]))
            if new_env == 'None':
                continue
            break
        prediction = np.argsort(probs.vec_value())[-top]
        input_word = vocab_out[prediction]
        if input_word == '<end>':
            break
        if repeat >= 10:
            break
        generate.append(input_word)
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'
    while '<start>' in generate:
        generate.remove('<start>')
    previous = s.output()
    return generate, previous
def __call__(self, x=None, t=None, test=False):
    """ABS-style summarization step: score next-word candidates from a
    window of `self.c` previous output embeddings plus an encoding of the
    input x (bag-of-words or attention encoder).

    Test mode returns one softmax-normalized distribution; train mode
    returns a list of unnormalized score vectors, one per window position.
    """
    if test:
        tt_embs = [dy.lookup(self.E, t_t) for t_t in t]
        if self.encoder_type == 'bow':
            # Neural language model
            tt_c = dy.concatenate(tt_embs)
            h = dy.tanh(self.U * tt_c)
            # Output with softmax
            # NOTE(review): uses precomputed self.W_enc — presumably set
            # elsewhere from the encoded input; confirm against caller
            y_t = dy.softmax(self.V * h + self.W_enc)
        elif self.encoder_type == 'attention':
            ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]
            # Neural language model
            tt_c = dy.concatenate(tt_embs)
            h = dy.tanh(self.U * tt_c)
            # Attention
            ttp_c = dy.concatenate(ttp_embs)
            p = dy.softmax(self.xt * self.P * ttp_c)  # Attention weight
            enc = self.xb * p  # Context vector
            # Output with softmax
            y_t = dy.softmax(self.V * h + self.W * enc)
        return y_t
    else:
        xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
        tt_embs = [dy.lookup(self.E, t_t) for t_t in t]
        y = []
        if self.encoder_type == 'bow':
            # BoW
            enc = dy.average(xt_embs)
            # input encoding is constant over windows — hoisted out of loop
            W_enc = self.W * enc
            for i in range(len(t) - self.c + 1):
                # Neural language model
                tt_c = dy.concatenate(tt_embs[i:i + self.c])
                h = dy.tanh(self.U * tt_c)
                # Output without softmax
                y_t = self.V * h + W_enc
                y.append(y_t)
        elif self.encoder_type == 'attention':
            # smoothed input embeddings: mean over a window of width 2q+1
            # around each position (clipped at the sequence edges)
            xb = dy.concatenate([
                dy.esum(xt_embs[max(i - self.q, 0
                                    ):min(len(x) - 1 + 1, i + self.q + 1)]) /
                self.q for i in range(len(x))
            ],
                                d=1)
            xt = dy.transpose(dy.concatenate(xt_embs, d=1))
            ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]
            for i in range(len(t) - self.c + 1):
                # Neural language model
                tt_c = dy.concatenate(tt_embs[i:i + self.c])
                h = dy.tanh(self.U * tt_c)
                # Attention
                ttp_c = dy.concatenate(
                    ttp_embs[i:i + self.c])  # Window-sized embedding
                p = dy.softmax(xt * self.P * ttp_c)  # Attention weight
                enc = xb * p  # Context vector
                # Output without softmax
                y_t = self.V * h + self.W * enc
                y.append(y_t)
        return y
def get_graph(self, embedding):
    """Build the expression U * tanh(W * embedding) on a fresh
    computation graph."""
    dy.renew_cg()
    proj = dy.parameter(self.pW)
    out = dy.parameter(self.pU)
    hidden = dy.tanh(proj * dy.inputTensor(embedding))
    return out * hidden