def selection_by_tree(self, tree, mode, idx=0):
    """Score the candidate pairs selected for ``tree`` and softmax them.

    Candidates come from ``self._select_by_tree(tree, mode, True)``. When
    that yields no pairs, the behaviour depends on ``opt['allow_partial']``:
    if set, a message is printed and ``(None, None)`` is returned; otherwise
    the selection is retried with the last argument set to ``False``.

    :param tree: object exposing ``V`` and ``terms`` (only their lengths are
        read here) and accepted by ``self._select_by_tree``
    :param mode: forwarded unchanged to ``self._select_by_tree``
    :param idx: index into ``self.history``; only read when
        ``opt['use_history']`` is set
    :return: ``(dy.softmax(scores), pairs)`` or ``(None, None)`` on early stop
    """
    input_layers, pairs = self._select_by_tree(tree, mode, True)
    if len(pairs) == 0:
        if self.opt['allow_partial']:
            # Single-argument call form: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print('early stop! discard {} / {}.'.format(
                len(tree.V), len(tree.terms)))
            return None, None
        input_layers, pairs = self._select_by_tree(tree, mode, False)

    one_layer = self.opt['one_layer']
    W1_rl = dy.parameter(self.model_parameters['W1_rl'])
    b1_rl = dy.parameter(self.model_parameters['b1_rl'])
    if not one_layer:
        W2_rl = dy.parameter(self.model_parameters['W2_rl'])
        b2_rl = dy.parameter(self.model_parameters['b2_rl'])

    # Original author's shape note: (V x N)x160 160x50 50x60 60x1
    # One row per candidate after the transpose.
    input_layers = dy.transpose(dy.concatenate_cols(input_layers))

    if self.opt['use_history']:
        # The history output is pushed through the layer(s) and the result
        # is used as the vector every candidate row is multiplied with.
        hidden = dy.rectify(W1_rl * self.history[idx].output() + b1_rl)
        if not one_layer:
            hidden = dy.rectify(W2_rl * hidden + b2_rl)
        pr = input_layers * hidden
    elif one_layer:
        pr = input_layers * W1_rl + b1_rl
    else:
        pr = dy.rectify(input_layers * W2_rl + b2_rl) * W1_rl + b1_rl

    # (#actions, )
    pr = dy.reshape(pr, (len(pairs), ))
    return dy.softmax(pr), pairs
def attend(self, input_mat, state, w1dt, w2, v, coverage):
    """Compute attention weights over ``input_mat`` and the weighted context.

    :param input_mat: expression whose product with the attention weights
        gives the context
    :param state: recurrent state; all of ``state.s()`` is concatenated and
        projected by ``w2``
    :param w1dt: precomputed projection of the inputs (presumably
        ``w1 * input_mat`` -- confirm against the caller)
    :param w2: projection applied to the concatenated state
    :param v: scoring vector applied after the ``tanh``
    :param coverage: optional coverage expression; when truthy,
        ``self.w_cov * transpose(coverage)`` is added to ``w1dt``
    :return: ``(a_t, input_mat * a_t)``
    """
    state_vec = dy.concatenate(list(state.s()))
    w2dt = w2 * state_vec

    keys = w1dt
    if coverage:
        keys = keys + self.w_cov * dy.transpose(coverage)

    scores = v * dy.tanh(dy.colwise_add(keys, w2dt))
    a_t = dy.softmax(dy.transpose(scores))
    context = input_mat * a_t
    return a_t, context
def generate(self, num, limit=40, beam=3):
    """Generate ``num`` strings by sampled beam search over the LSTM.

    Each step draws ``beam`` samples (with replacement) from the softmax of
    every live hypothesis. Scores are accumulated negative log
    probabilities, so lower is better.

    :param num: number of strings to generate
    :param limit: a hypothesis reaching this many characters is finalised
    :param beam: samples drawn per hypothesis and number of hypotheses kept
    :return: list with the best finished string for each of the ``num`` runs
    """
    dy.renew_cg()
    W = dy.parameter(self.W)
    b = dy.parameter(self.b)

    def by_score(entry):
        # The score is always the last tuple field.
        return entry[-1]

    generated = []
    for _ in range(num):
        # Every string starts from the LSTM state after reading EOW.
        first_state = self.lstm.initial_state().add_input(
            self.lookup[self.c2i[EOW]])
        frontier = [('', first_state, 0)]
        finished = []

        while len(finished) < beam and len(frontier) > 0:
            candidates = []
            for prefix, state, cost in frontier:
                if len(prefix) >= limit:
                    # Character limit reached: finalise as-is.
                    finished.append((prefix, cost))
                    continue

                scores = dy.affine_transform([b, W, state.output()])
                dist = dy.softmax(scores).npvalue()

                for _draw in range(beam):
                    ci = sample_softmax(dist)
                    ch = self.i2c[ci]
                    new_cost = cost - np.log(dist[ci])
                    if ch == EOW:
                        finished.append((prefix, new_cost))
                    else:
                        candidates.append(
                            (prefix + ch,
                             state.add_input(self.lookup[ci]),
                             new_cost))

            # Keep only the `beam` cheapest continuations.
            candidates.sort(key=by_score)
            frontier = candidates[:beam]

        finished.sort(key=by_score)
        generated.append(finished[0][0])
    return generated
def compute_output_layer(self, input):
    """Run ``input`` through every layer in ``self.parameters``.

    Each entry of ``self.parameters`` supplies a weight at index 0 and a bias
    at index 1. The last layer is followed by ``dy.softmax``; every other
    layer by ``self.activation``.

    :param input: expression fed to the first layer
    :return: list holding ``input`` followed by the output of each layer
    """
    layers = [input]
    last = len(self.parameters) - 1
    for depth, layer in enumerate(self.parameters):
        weight = dy.parameter(layer[0])
        bias = dy.parameter(layer[1])
        pre_activation = weight * layers[-1] + bias
        squash = dy.softmax if depth == last else self.activation
        layers.append(squash(pre_activation))
    return layers
def __call__(self, x, h_matrix, noprob=False):
    """Multi-hop attention over ``h_matrix`` driven by the query ``x``.

    The first ``self.layers - 1`` hops each compute a context vector and
    rebuild the query as ``concatenate([x, context])``. The final hop
    produces scores that also include the ``B1`` / ``B2`` terms.

    :param x: query expression
    :param h_matrix: expression attended over
    :param noprob: when true, return the raw scores instead of their softmax
    :return: ``dy.softmax(e_t)`` or, with ``noprob``, ``e_t``
    """
    query = x
    for hop in range(self.layers - 1):
        hop_scores = self.V[hop] * dy.tanh(
            self.W1[hop] * h_matrix + self.W2[hop] * query)
        hop_weights = dy.softmax(dy.transpose(hop_scores))
        hop_context = h_matrix * hop_weights
        query = dy.concatenate([x, hop_context])

    e_t = (self.V[-1] * dy.tanh(self.W1[-1] * h_matrix + self.W2[-1] * query)
           + self.B1 * h_matrix + self.B2 * query)

    if len(h_matrix.dim()[0]) > 1:
        # Flatten the scores to a vector when h_matrix has more than one
        # dimension.
        flat_size = self.V[-1].dim()[0][0] * h_matrix.dim()[0][1]
        e_t = dy.reshape(e_t, (flat_size, ))

    if noprob:
        return e_t
    return dy.softmax(e_t)
def get_top_k_paths(self, all_paths, relation_index, threshold):
    """Rank paths by their softmax score for each relation.

    :param all_paths: sequence of paths accepted by ``get_path_embedding``
    :param relation_index: collection of relations; only its length is used,
        column ``i`` of the score matrix is treated as relation ``i``
    :param threshold: minimum score for a path to be listed, or ``None`` to
        list every path
    :return: one list per relation of ``(path, score)`` tuples, sorted by
        descending score
    """
    if len(all_paths) == 0:
        # np.vstack([]) raises on an empty list; with nothing to score,
        # every relation simply has no paths.
        return [[] for _ in range(len(relation_index))]

    builder = self.builder
    model_parameters = self.model_parameters
    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']
    one_hidden_layer = self.num_hidden_layers == 1

    path_scores = []
    for i, path in enumerate(all_paths):
        if i % 1000 == 0:
            # Start a fresh computation graph every 1000 paths; parameter
            # expressions belong to a graph, so they are re-created too.
            dy.renew_cg()
            W1 = dy.parameter(model_parameters['W1'])
            b1 = dy.parameter(model_parameters['b1'])
            W2 = None
            b2 = None
            if one_hidden_layer:
                W2 = dy.parameter(model_parameters['W2'])
                b2 = dy.parameter(model_parameters['b2'])

        path_embedding = get_path_embedding(
            builder, lemma_lookup, pos_lookup, dep_lookup, dir_lookup, path)

        if self.use_xy_embeddings:
            # Pad both sides with zero vectors in place of x / y embeddings.
            zero_word = dy.inputVector([0.0] * self.lemma_embeddings_dim)
            path_embedding = dy.concatenate(
                [zero_word, path_embedding, zero_word])

        h = W1 * path_embedding + b1
        if one_hidden_layer:
            h = W2 * dy.tanh(h) + b2

        path_scores.append(dy.softmax(h).npvalue().T)

    path_scores = np.vstack(path_scores)

    top_paths = []
    for i in range(len(relation_index)):
        indices = np.argsort(-path_scores[:, i])
        top_paths.append([
            (all_paths[index], path_scores[index, i])
            for index in indices
            if threshold is None or path_scores[index, i] >= threshold
        ])
    return top_paths
def __call__(self, sent, n, caches):
    """Attention step scoring ``caches["V"]`` directly against ``n``.

    Original author's shape note: s is a list (len == steps) of
    {(n_s,), batch_size}; n is {(n_h,), batch_size}.

    :param sent: forwarded to ``self._restart_caches``
    :param n: query expression, reshaped to a ``(1, n_h)`` row
    :param caches: dict; ``"V"`` and ``"S"`` are read, ``"ctx"`` and
        ``"att"`` are written
    :return: the updated ``caches``
    """
    caches = self._restart_caches(sent, caches)

    batch = bs(n)
    keys = caches["V"]
    query_row = dy.reshape(n, (1, self.n_h), batch_size=batch)
    steps = BK.dims(keys)[1]

    att_e = dy.reshape(query_row * keys, (steps, ), batch_size=batch)
    att_alpha = dy.softmax(att_e)

    # Store the context and the weights for the caller.
    caches["ctx"] = caches["S"] * att_alpha
    caches["att"] = att_alpha
    return caches
def decode_loss(self, src1, src2, tgt):
    """Build the teacher-forced training loss for one example.

    The result is the summed negative log-likelihood of ``tgt`` plus
    ``COV_LOSS_WEIGHT`` times the summed coverage loss plus
    ``DIAG_LOSS_WEIGHT`` times the summed diagonal loss.

    :param src1: first source sequence; its length sizes the coverage vector
    :param src2: second source sequence (attended only when
        ``self.single_source`` is false)
    :param tgt: target symbol ids
    :return: scalar loss expression
    """
    (src1_mat, src2_mat, src1_w1dt, src2_w1dt,
     decoder_state) = self.encoder_forward(src1, src2)

    src_len = len(src1)
    _, prev_coverage = self.get_coverage(
        a_t=dy.vecInput(src_len), prev_coverage=dy.vecInput(src_len))

    nll_terms, cov_terms, diag_terms = [], [], []
    embedded_tgt = self.embed_idx(tgt, self.tgt_lookup)
    # The first decoder input is the EOS embedding.
    prev_embedding = self.tgt_lookup[self.tgt_vocab.str2int(EOS)]

    for t, (char, embedded_char) in enumerate(zip(tgt, embedded_tgt)):
        a_t, c1_t = self.attend(
            src1_mat, decoder_state, src1_w1dt,
            self.att1_w2, self.att1_v, prev_coverage)

        if self.single_source:
            c2_t = dy.vecInput(2 * HIDDEN_DIM)
        else:
            _, c2_t = self.attend(
                src2_mat, decoder_state, src2_w1dt,
                self.att2_w2, self.att2_v, None)

        x_t = dy.concatenate([c1_t, c2_t, prev_embedding])
        decoder_state = decoder_state.add_input(x_t)

        vocab_probs = dy.softmax(
            self.dec_w * decoder_state.output() + self.dec_b)
        probs, _ = self.get_pointergen_probs(
            c1_t, decoder_state, x_t, a_t, vocab_probs, src1)

        nll_terms.append(-dy.log(dy.pick(probs, char)))

        cov_step, prev_coverage = self.get_coverage(a_t, prev_coverage)
        cov_terms.append(cov_step)
        diag_terms.append(self.get_diag_loss(a_t, t))

        # Teacher forcing: feed the gold symbol at the next step.
        prev_embedding = embedded_char

    return (dy.esum(nll_terms)
            + COV_LOSS_WEIGHT * dy.esum(cov_terms)
            + DIAG_LOSS_WEIGHT * dy.esum(diag_terms))
def calc_attend(self, a_vecs, b_vecs, dropout):
    """Soft-align the elements of ``a_vecs`` and ``b_vecs`` with each other.

    Elements are taken from the batch dimension: ``dim()[1]`` gives the
    element count and ``dy.pick_batch_elem`` selects one element.

    With ``e[i][j] = fa_i . fb_j``:

    * ``betas[i]``  = sum_j softmax_j(e[i][:])[j] * b_j
    * ``alphas[j]`` = sum_i softmax_i(e[:][j])[i] * a_i

    The alpha weights were previously built with the same row-wise
    normalisation as the beta weights, so for a fixed ``j`` they did not
    form a distribution over ``i``. They are now normalised over ``i``.

    :param a_vecs: batched expression for the first sequence
    :param b_vecs: batched expression for the second sequence
    :param dropout: forwarded to ``self.attend.evaluate_network``
    :return: ``(alphas, betas)`` -- ``alphas`` has one entry per element of
        ``b_vecs``, ``betas`` one per element of ``a_vecs``
    """
    l_a = a_vecs.dim()[1]
    l_b = b_vecs.dim()[1]
    fa = self.attend.evaluate_network(a_vecs, True, dropout)
    fb = self.attend.evaluate_network(b_vecs, True, dropout)

    # Pick every transformed element once instead of once per (i, j) pair.
    fa_items = [dy.pick_batch_elem(fa, i) for i in range(l_a)]
    fb_items = [dy.pick_batch_elem(fb, j) for j in range(l_b)]
    e_ij = [[dy.dot_product(fa_i, fb_j) for fb_j in fb_items]
            for fa_i in fa_items]

    # Row i: distribution over the elements of b for a fixed a_i.
    beta_softmaxes = [dy.softmax(dy.concatenate(row)) for row in e_ij]
    # Column j: distribution over the elements of a for a fixed b_j.
    alpha_softmaxes = [
        dy.softmax(dy.concatenate([e_ij[i][j] for i in range(l_a)]))
        for j in range(l_b)
    ]

    betas = [
        dy.esum([
            dy.pick_batch_elem(b_vecs, j) * beta_softmaxes[i][j]
            for j in range(l_b)
        ]) for i in range(l_a)
    ]
    alphas = [
        dy.esum([
            dy.pick_batch_elem(a_vecs, i) * alpha_softmaxes[j][i]
            for i in range(l_a)
        ]) for j in range(l_b)
    ]
    return alphas, betas
def __call__(self, sent, n, caches):
    """Attention step with a ``tanh`` hidden layer between query and keys.

    Original author's shape notes: s is a list (len == steps) of
    {(n_s,), batch_size}; n is {(n_h,), batch_size}; the hidden layer is
    {(n_hidden, steps), batch_size}.

    :param sent: forwarded to ``self._restart_caches``
    :param n: query expression, projected by ``self.iparams["h2e"]``
    :param caches: dict; ``"V"`` and ``"S"`` are read, ``"ctx"`` and
        ``"att"`` are written
    :return: the updated ``caches``
    """
    caches = self._restart_caches(sent, caches)

    projected_query = self.iparams["h2e"] * n
    # The projected query is added to every column of the cached keys.
    att_hidden = dy.tanh(dy.colwise_add(caches["V"], projected_query))
    # Dropout on att_hidden is left disabled, as in the original
    # ("save some space").

    steps = BK.dims(caches["V"])[1]
    raw_scores = self.iparams["v"] * att_hidden
    att_e = dy.reshape(raw_scores, (steps, ), batch_size=bs(att_hidden))
    att_alpha = dy.softmax(att_e)

    # Store the context and the weights for the caller.
    caches["ctx"] = caches["S"] * att_alpha
    caches["att"] = att_alpha
    return caches
def attend(self, encoded_inputs, h_t, input_masks=None):
    """Attend over ``encoded_inputs`` given the decoder state ``h_t``.

    Original author's shape note: encoded_inputs is seq len x 2*h x batch
    size and h_t is h x batch size (for a BiLSTM encoder).

    :param encoded_inputs: sequence of encoder state expressions
    :param h_t: current decoder output expression
    :param input_masks: optional masks, multiplied element-wise into the
        scores before the softmax when given
    :return: ``(h_output, alphas)``
    """
    if len(encoded_inputs) == 1:
        # A single input state receives all the attention: build the output
        # directly and return trivial all-ones weights.
        h_output = dn.tanh(
            self.w_c * dn.concatenate([h_t, encoded_inputs[0]]))
        ones = [1] * len(input_masks[0]) if input_masks else [1]
        return h_output, dn.inputTensor(ones, batched=True)

    # score_j = v_a * tanh(w_a * h_t + u_a * h_j); w_a * h_t is shared.
    w_a_h_t = self.w_a * h_t
    scores = []
    for h_input in encoded_inputs:
        scores.append(
            self.v_a * dn.tanh(
                dn.affine_transform([w_a_h_t, self.u_a, h_input])))
    concatenated = dn.concatenate(scores)

    if input_masks:
        # NOTE(review): the mask multiplies the pre-softmax scores, so a
        # masked position gets score 0 rather than zero weight -- confirm
        # this is intended.
        concatenated = dn.cmult(
            concatenated, dn.inputTensor(input_masks, batched=True))

    alphas = dn.softmax(concatenated)

    # Context = weighted sum of the encoder states.
    c = dn.concatenate_cols(encoded_inputs) * alphas

    h_output = dn.tanh(self.w_c * dn.concatenate([h_t, c]))
    return h_output, alphas
def build_network(params, x_data):
    """Build the one-hidden-layer network ``softmax(U * tanh(W * x + b) + bp)``.

    :param params: six-element sequence ``(_, E, b, U, W, bp)``; ``E`` is
        indexed by ordinal to obtain embeddings
    :param x_data: either an iterable of ordinals, or a dict (any ``dict``
        subclass) with keys ``'prefix'``, ``'suffix'`` and ``'fullwords'``.
        Non-empty prefix / suffix ordinals are embedded and added to the
        full-word input.
    :return: the softmax output expression
    """
    _, E, b, U, W, bp = params

    # isinstance rather than an exact type comparison, so dict subclasses
    # (OrderedDict, defaultdict, ...) take the dict path too.
    if isinstance(x_data, dict):
        prefix_ordinals = x_data['prefix']
        suffix_ordinals = x_data['suffix']
        x_ordinals = x_data['fullwords']
    else:
        prefix_ordinals = None
        suffix_ordinals = None
        x_ordinals = x_data

    x = dy.concatenate([E[index] for index in x_ordinals])
    if prefix_ordinals:
        x = x + dy.concatenate([E[index] for index in prefix_ordinals])
    if suffix_ordinals:
        x = x + dy.concatenate([E[index] for index in suffix_ordinals])

    return dy.softmax(U * dy.tanh(W * x + b) + bp)
def generator(encoder, decoder, params_encoder, params_decoder, sentence, env,
              first, previous):
    """Greedily decode an action sequence for ``sentence`` starting in ``env``.

    At every step the decoder attends over the encoded sentence and over the
    character-encoded world state, then picks the highest-ranked output token
    that is either ``'<end>'`` or executable in the current environment.
    Decoding stops on ``'<end>'`` or at the tenth step.

    ``first`` is not used and the incoming ``previous`` is overwritten; both
    are kept for interface compatibility. ``char_encoder``, ``char2int``,
    ``vocab``, ``vocab_out``, ``_state``, ``execute`` and
    ``get_state_embed3`` are resolved from the enclosing module.

    :return: ``(generate, previous)`` -- the emitted tokens and the final
        encoder output expression
    """
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]

    # Encode every world element character by character and tag it with its
    # position embedding.
    sc_vector = []
    for position, world in enumerate(_state(env)):
        char_state = char_encoder.initial_state()
        for char in world:
            char_state = char_state.add_input(char_lookup[char2int[char]])
        sc_vector.append(
            dy.concatenate([char_state.output(), pos_lookup[position]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)

    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]

    unknown_id = None
    word_ids = []
    for token in (sentence + ' <end>').split():
        if token in vocab:
            word_ids.append(vocab.index(token))
        else:
            if unknown_id is None:
                unknown_id = vocab.index('<unknown>')
            word_ids.append(unknown_id)

    s = encoder.initial_state()
    s_vector = []
    for word_id in word_ids:
        s = s.add_input(lookup[word_id])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    dy_s_vector = dy.concatenate(s_vector, d=1)

    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    _lookup = params_decoder["lookup"]

    generate = []
    input_word = "<start>"
    repeat = 0
    while True:
        # The result is not used below; the call is kept as in the original.
        dy_env = dy.inputTensor(get_state_embed3(env))
        repeat += 1
        word = vocab_out.index(input_word)

        weight = dy.softmax(dy.concatenate(
            [dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(dy.concatenate([
            char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
            for x in sc_vector
        ]))
        encode_state = dy_sc_vector * weight_char
        encode_output = dy_s_vector * weight

        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)

        # Ascending ranking; ranking[-k] is the k-th most probable token.
        ranking = np.argsort(probs.vec_value())

        # Walk down the top 49 candidates until one is '<end>' or executes
        # in the current environment; fall back to the best one otherwise.
        for top in range(1, 50):
            candidate = vocab_out[ranking[-top]]
            if candidate == '<end>':
                break
            if candidate == '<start>':
                continue
            if str(execute(env, [candidate])) == 'None':
                continue
            break
        else:
            top = 1

        input_word = vocab_out[ranking[-top]]
        if input_word == '<end>' or repeat >= 10:
            break

        generate.append(input_word)
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'

    generate = [token for token in generate if token != '<start>']
    previous = s.output()
    return generate, previous
def __call__(self, x, tm1s=None, test=False):
    """Run the attention GRU decoder for one step (test) or a sequence.

    :param x: with ``test`` a single input embedding; otherwise an iterable
        of input embeddings, one per step
    :param tm1s: with ``test``, the pair ``(s_tm1, c_tm1)`` of previous
        state and context; unused otherwise
    :param test: selects single-step mode
    :return: with ``test``, ``(s_t, c_t, y_t)`` where ``y_t`` is a softmax;
        otherwise the list of un-normalised outputs ``Wo * m_t`` per step
    """

    def attend_and_maxout(w_tm1, s_tm1, s_t):
        # Attention scores use the *previous* state s_tm1.
        e_t = dy.pick(
            self.va * dy.tanh(
                dy.colwise_add(self.Ua * self.hp, self.Wa * s_tm1)),
            0)
        a_t = dy.softmax(e_t)
        # Weighted sum written out element by element (the original notes
        # a memory error with `self.hp * a_t`).
        c_t = dy.esum([
            dy.cmult(a_t_i, h_i)
            for a_t_i, h_i in zip(a_t, dy.transpose(self.hp))
        ])
        r_t = dy.concatenate_cols([
            Wr_j * w_tm1 + Ur_j * c_t + Vr_j * s_t
            for Wr_j, Ur_j, Vr_j in zip(self.Wr, self.Ur, self.Vr)
        ])
        # Maxout over the stacked pieces.
        return c_t, dy.max_dim(r_t, d=1)

    if test:
        s_tm1 = tm1s[0]
        c_tm1 = tm1s[1]
        s_t = self.GRUBuilder.initial_state().set_s([s_tm1]).add_input(
            dy.concatenate([x, c_tm1])).output()
        c_t, m_t = attend_and_maxout(x, s_tm1, s_t)
        return s_t, c_t, dy.softmax(self.Wo * m_t)

    s_tm1 = self.s_0
    c_tm1 = self.c_0
    gru_state = self.GRUBuilder.initial_state().set_s([s_tm1])
    y = []
    for w_tm1 in x:
        gru_state = gru_state.add_input(dy.concatenate([w_tm1, c_tm1]))
        s_t = gru_state.output()
        c_t, m_t = attend_and_maxout(w_tm1, s_tm1, s_t)
        y.append(self.Wo * m_t)
        # t -> tm1
        s_tm1, c_tm1 = s_t, c_t
    return y
def generate_beam(self, src1, src2):
    """Beam-search decode and return the best output string.

    Decoding runs for at most ``int(len(src1) * 1.1)`` steps with a beam of
    ``self.beam_size``. A hypothesis is complete once it emits EOS at any
    step after the first. If nothing completes, the best-scoring unfinished
    hypothesis is used.

    :param src1: first source sequence
    :param src2: second source sequence (attended only when
        ``self.single_source`` is false)
    :return: ``(text, p_gens)`` -- the decoded string with EOS removed and
        surrounding whitespace stripped, and ``p_gens[1:-1]`` of the winner
    """
    src1_mat, src2_mat, src1_w1dt, src2_w1dt, decoder_state = \
        self.encoder_forward(src1, src2)

    eos_id = self.tgt_vocab.str2int(EOS)

    hypothesis_list = [
        Hypothesis(
            text_list=[eos_id],
            decoder_state=decoder_state,
            c1_t=dy.vecInput(2 * HIDDEN_DIM),
            c2_t=dy.vecInput(2 * HIDDEN_DIM),
            prev_coverage=self.get_coverage(
                a_t=dy.vecInput(len(src1)),
                training=False,
                prev_coverage=dy.vecInput(len(src1)),
            ),
            score=0.0,
            p_gens=[],
        )
    ]
    completed_list = []

    for t in range(int(len(src1) * 1.1)):
        new_hyp_list = []
        new_hyp_scores = []
        for hyp in hypothesis_list:
            last_output_embeddings = self.tgt_lookup[hyp.text_list[-1]]
            a_t, c1_t = self.attend(
                src1_mat, hyp.decoder_state, src1_w1dt,
                self.att1_w2, self.att1_v, hyp.prev_coverage)
            if not self.single_source:
                _, c2_t = self.attend(
                    src2_mat, hyp.decoder_state, src2_w1dt,
                    self.att2_w2, self.att2_v, None)
            else:
                c2_t = dy.vecInput(2 * HIDDEN_DIM)

            x_t = dy.concatenate([c1_t, c2_t, last_output_embeddings])
            decoder_state = hyp.decoder_state.add_input(x_t)
            probs = dy.softmax(
                self.dec_w * decoder_state.output() + self.dec_b)
            probs, cur_p_gen = self.get_pointergen_probs(
                c1_t, decoder_state, x_t, a_t, probs, src1)
            probs = probs.npvalue()

            for ind in range(len(probs)):
                text_list = hyp.text_list + [ind]
                p_gens = hyp.p_gens + [cur_p_gen]
                # Length-normalisation hook; exponent 0.0 makes it a no-op.
                score = (hyp.score + math.log(probs[ind])) / (
                    len(text_list) ** 0.0)
                # NOTE(review): the arguments do not depend on `ind`; if
                # get_coverage has no side effects this call can be hoisted
                # above the loop -- confirm before changing.
                coverage = self.get_coverage(
                    a_t, hyp.prev_coverage, training=False)
                new_hyp_list.append(
                    Hypothesis(
                        text_list=text_list,
                        decoder_state=decoder_state,
                        c1_t=c1_t,
                        c2_t=c2_t,
                        prev_coverage=coverage,
                        score=score,
                        p_gens=p_gens,
                    ))
                new_hyp_scores.append(score)

        # Keep the beam_size best expansions (unordered).
        top_inds = np.argpartition(
            np.array(new_hyp_scores), -self.beam_size)[-self.beam_size:]
        new_hyp_list = np.array(new_hyp_list)[top_inds]

        hypothesis_list = []
        for new_hyp in new_hyp_list:
            if new_hyp.text_list[-1] == eos_id and t > 0:
                completed_list.append(new_hyp)
            else:
                hypothesis_list.append(new_hyp)

        if len(completed_list) >= self.beam_size:
            break

    if len(completed_list) == 0:
        # sorted() returns a new list; the result must be kept, otherwise
        # the fallback picks whichever hypothesis happens to be first.
        hypothesis_list = sorted(
            hypothesis_list, key=lambda x: x.score, reverse=True)
        completed_list = [hypothesis_list[0]]

    for hyp in completed_list:
        hyp.text_list = [self.tgt_vocab.int2str(i) for i in hyp.text_list]

    top_hyp = sorted(completed_list, key=lambda x: x.score, reverse=True)[0]
    return ("".join(top_hyp.text_list).replace(EOS, "").strip(),
            top_hyp.p_gens[1:-1])
def __call__(self, s_t, h_matrix):
    """Return the attention context of ``h_matrix`` for the query ``s_t``.

    Scores are ``v * tanh(W1 * h_matrix + W2 * s_t)``; their transposed
    softmax weights ``h_matrix`` to produce the context.

    :param s_t: query expression
    :param h_matrix: expression attended over
    :return: ``h_matrix * softmax(transpose(scores))``
    """
    projected = self.W1 * h_matrix + self.W2 * s_t
    scores = self.v * dy.tanh(projected)
    weights = dy.softmax(dy.transpose(scores))
    return h_matrix * weights
def process_one_instance(builder, model, model_parameters, instance,
                         path_cache, update=True, dropout=0.0,
                         x_y_vectors=None, num_hidden_layers=0):
    """
    Return the network output for a single term-pair, computed from the
    count-weighted average of its path embeddings.

    Note: an empty ``instance`` is modified in place -- ``EMPTY_PATH`` is
    added with count 1.

    :param builder: the LSTM builder
    :param model: the LSTM model (not used in this function)
    :param model_parameters: the model parameters
    :param instance: a Counter object with paths
    :param path_cache: the cache for path embeddings
    :param update: whether to update the lemma embeddings
    :param dropout: word dropout rate
    :param x_y_vectors: the current word vectors for x and y
    :param num_hidden_layers: the number of hidden layers for the term-pair
        classification network
    :return: the softmax output expression for the term-pair
    """
    W1 = dy.parameter(model_parameters['W1'])
    b1 = dy.parameter(model_parameters['b1'])
    W2 = None
    b2 = None
    if num_hidden_layers == 1:
        W2 = dy.parameter(model_parameters['W2'])
        b2 = dy.parameter(model_parameters['b2'])

    lemma_lookup = model_parameters['lemma_lookup']
    pos_lookup = model_parameters['pos_lookup']
    dep_lookup = model_parameters['dep_lookup']
    dir_lookup = model_parameters['dir_lookup']

    # Add the empty path so there is always something to average.
    if len(instance) == 0:
        instance[EMPTY_PATH] = 1

    # Compute the averaged path. sum()/values()/items() behave the same on
    # Python 2 and 3, unlike reduce()/itervalues()/iteritems().
    num_paths = sum(instance.values())
    path_embeddings = [
        get_path_embedding_from_cache(
            path_cache, builder, lemma_lookup, pos_lookup, dep_lookup,
            dir_lookup, path, update, dropout) * count
        for path, count in instance.items()
    ]
    input_vec = dy.esum(path_embeddings) * (1.0 / num_paths)

    # Concatenate x and y embeddings
    if x_y_vectors is not None:
        x_vector = dy.lookup(lemma_lookup, x_y_vectors[0])
        y_vector = dy.lookup(lemma_lookup, x_y_vectors[1])
        input_vec = dy.concatenate([x_vector, input_vec, y_vector])

    h = W1 * input_vec + b1
    if num_hidden_layers == 1:
        h = W2 * dy.tanh(h) + b2

    return dy.softmax(h)
def do_one_sentence(encoder, decoder, params_encoder, params_decoder,
                    sentence, output, env, first, previous):
    """Build the training loss for one sentence and record greedy predictions.

    The decoder is fed its *own* previous prediction (not the gold token),
    while the loss at each step is the negative log-probability of the gold
    token ``output[index]``. The loop ends when the gold token is
    ``'<end>'``.

    ``first`` is not used and the incoming ``previous`` is overwritten; both
    are kept for interface compatibility. ``char_encoder``, ``char2int``,
    ``vocab``, ``vocab_out``, ``_state``, ``execute`` and
    ``get_state_embed3`` are resolved from the enclosing module.

    :return: ``(loss, generate, previous)`` -- summed loss expression, the
        predicted tokens, and the final encoder output expression
    """
    pos_lookup = params_encoder["pos_lookup"]
    char_lookup = params_encoder["char_lookup"]
    char_v = params_decoder["attention_v"]
    char_w1 = params_decoder["attention_wc"]
    char_w2 = params_decoder["attention_bc"]

    # Encode every world element character by character and tag it with its
    # position embedding.
    sc_vector = []
    for position, world in enumerate(_state(env)):
        char_state = char_encoder.initial_state()
        for char in world:
            char_state = char_state.add_input(char_lookup[char2int[char]])
        sc_vector.append(
            dy.concatenate([char_state.output(), pos_lookup[position]]))
    dy_sc_vector = dy.concatenate(sc_vector, d=1)

    lookup = params_encoder["lookup"]
    attention_w = params_decoder["attention_w"]
    attention_b = params_decoder["attention_b"]

    unknown_id = None
    word_ids = []
    for token in (sentence + ' <end>').split(' '):
        if token in vocab:
            word_ids.append(vocab.index(token))
        else:
            if unknown_id is None:
                unknown_id = vocab.index('<unknown>')
            word_ids.append(unknown_id)

    s = encoder.initial_state()
    s_vector = []
    for word_id in word_ids:
        s = s.add_input(lookup[word_id])
        s_vector.append(dy.softmax(attention_w * s.output() + attention_b))
    dy_s_vector = dy.concatenate(s_vector, d=1)

    _s = decoder.initial_state(s.s())
    R = params_decoder["R"]
    bias = params_decoder["bias"]
    _lookup = params_decoder["lookup"]

    loss = []
    generate = []
    index = 1
    input_word = "<start>"
    while True:
        # The result is not used below; the call is kept as in the original.
        dy_env = dy.inputTensor(get_state_embed3(env))
        word = vocab_out.index(input_word)
        gold = output[index]
        gt_y = vocab_out.index(gold)

        weight = dy.softmax(dy.concatenate(
            [dy.dot_product(x, _s.output()) for x in s_vector]))
        weight_char = dy.softmax(dy.concatenate([
            char_v * dy.tanh(char_w1 * x + char_w2 * _s.output())
            for x in sc_vector
        ]))
        encode_output = dy_s_vector * weight
        encode_state = dy_sc_vector * weight_char

        _s = _s.add_input(
            dy.concatenate([_lookup[word], encode_output, encode_state]))
        probs = dy.softmax(R * _s.output() + bias)

        # Best token, skipping '<start>' in favour of the runner-up.
        ranking = np.argsort(probs.npvalue())
        prediction = ranking[-1]
        if vocab_out[prediction] == '<start>':
            prediction = ranking[-2]
        predicted_word = vocab_out[prediction]

        generate.append(predicted_word)
        loss.append(-dy.log(dy.pick(probs, gt_y)))

        if gold == '<end>':
            break
        index += 1

        input_word = predicted_word
        if input_word == '<end>':
            # A predicted '<end>' is not executed against the environment.
            continue
        env = str(execute(env, [input_word]))
        if env == 'None':
            env = '1:_ 2:_ 3:_ 4:_ 5:_ 6:_ 7:_'

    loss = dy.esum(loss)
    generate = [token for token in generate if token != '<start>']
    previous = s.output()
    return loss, generate, previous
def compute_decoder_batch_loss(self, encoded_inputs, input_masks,
                               output_word_ids, output_masks, batch_size):
    """Build the summed decoder loss for a batch of output sequences.

    The decoder uses input feeding: each step's input is the embedding of
    the previous gold symbol concatenated with the previous attention
    output vector.

    :param encoded_inputs: encoder states, forwarded to ``self.attend``
    :param input_masks: input masks, forwarded to ``self.attend``
    :param output_word_ids: per-timestep lists of gold symbol ids
    :param output_masks: per-timestep masks (may be falsy to disable
        masking); a step is masked only when its last entry is not 1
    :param batch_size: number of sequences in the batch
    :return: loss expression summed over timesteps and batch elements
    """
    # Load the parameters into the current graph.
    for name in ('readout', 'bias', 'w_c', 'u_a', 'v_a', 'w_a'):
        setattr(self, name, dn.parameter(self.params[name]))

    s_0 = self.decoder_rnn.initial_state()

    # Initial "input feeding" vectors and begin-of-sequence embeddings.
    init_input_feeding = dn.lookup_batch(self.init_lookup, [0] * batch_size)
    begin_ids = [self.y2int[common.BEGIN_SEQ]] * batch_size
    init_feedback = dn.lookup_batch(self.output_lookup, begin_ids)

    s = s_0.add_input(dn.concatenate([init_feedback, init_input_feeding]))

    losses = []
    for i, step_word_ids in enumerate(output_word_ids):
        decoder_rnn_output = s.output()
        attention_output_vector, alphas = self.attend(
            encoded_inputs, decoder_rnn_output, input_masks)

        # h = readout * attention_output_vector + bias
        h = dn.affine_transform(
            [self.bias, self.readout, attention_output_vector])

        if self.diverse:
            # Encourage diversity by punishing highly confident predictions.
            # TODO: support batching - esp. w.r.t. scalar inputs
            soft = dn.softmax(dn.tanh(h))
            gold_nll = dn.pick_batch(-dn.log(soft), step_word_ids)
            gold_prob = dn.pick_batch(soft, step_word_ids)
            batch_loss = gold_nll \
                - dn.log(dn.scalarInput(1) - gold_prob) \
                - dn.log(dn.scalarInput(4))
        else:
            batch_loss = dn.pickneglogsoftmax_batch(h, step_word_ids)

        # Mask the loss if at least one sentence is shorter.
        if output_masks and output_masks[i][-1] != 1:
            mask_expr = dn.reshape(
                dn.inputVector(output_masks[i]), (1, ), batch_size)
            batch_loss = batch_loss * mask_expr

        # Prepare the next step: gold feedback + attention output.
        feedback_embeddings = dn.lookup_batch(
            self.output_lookup, step_word_ids)
        s = s.add_input(
            dn.concatenate([feedback_embeddings, attention_output_vector]))

        losses.append(batch_loss)

    return dn.sum_batches(dn.esum(losses))
def predict_beamsearch(self, encoder, input_seq):
    """Beam-search decode ``input_seq``.

    :param encoder: object whose ``encode_batch`` encodes the input
    :param input_seq: input symbol sequence
    :return: ``[]`` for an empty ``input_seq``; otherwise
        ``(nbest_seqs, alphas_mtx)`` where ``nbest_seqs`` is a list of
        ``(sequence, probability)`` pairs and ``alphas_mtx`` holds the
        attention weights collected when ``self.plot`` is set
    """
    # NOTE(review): the empty-input return is a bare list while the normal
    # return is a 2-tuple; kept as-is for backward compatibility.
    if len(input_seq) == 0:
        return []

    dn.renew_cg()
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])

    alphas_mtx = []

    # encode input sequence
    blstm_outputs, input_masks = encoder.encode_batch([input_seq])

    # complete sequences and their probabilities
    final_states = []

    s_0 = self.decoder_rnn.initial_state()

    # beam step index -> (sequence, probability, decoder state, attn_vector)
    beam = {-1: [([common.BEGIN_SEQ], 1.0, s_0, self.init_lookup[0])]}
    i = 0
    max_len = self.max_prediction_len

    # expand while under the length limit and there are beams to expand
    while (max_len is None or i < max_len) and len(beam[i - 1]) > 0:
        new_hypos = []
        for hypothesis in beam[i - 1]:
            prefix_seq, prefix_prob, prefix_decoder, prefix_attn = hypothesis
            last_hypo_symbol = prefix_seq[-1]

            # cannot expand finished sequences
            if last_hypo_symbol == common.END_SEQ:
                continue

            try:
                prev_output_vec = self.output_lookup[
                    self.y2int[last_hypo_symbol]]
            except KeyError:
                # Not a known symbol. The single-argument call form prints
                # the same text on Python 2 and Python 3.
                print('impossible to expand, key error: ' + str(
                    last_hypo_symbol))
                continue

            decoder_input = dn.concatenate([prev_output_vec, prefix_attn])
            s = prefix_decoder.add_input(decoder_input)
            decoder_rnn_output = s.output()

            attention_output_vector, alphas = self.attend(
                blstm_outputs, decoder_rnn_output)

            # TODO: add attention weights properly to allow building the
            # attention matrix for the best path
            if self.plot:
                alphas_mtx.append(alphas.vec_value())

            # h = readout * attention_output_vector + bias
            h = dn.affine_transform(
                [self.bias, self.readout, attention_output_vector])

            # TODO: understand why diverse needs tanh before softmax
            if self.diverse:
                h = dn.tanh(h)
            probs_val = dn.softmax(h).npvalue()

            # TODO: maybe should choose nbest from all expansions and not
            # only from nbest of each hypothesis?
            n_best_indices = common.argmax(probs_val, self.beam_size)
            for index in n_best_indices:
                p = probs_val[index]
                new_seq = prefix_seq + [self.int2y[index]]
                new_prob = prefix_prob * p

                if new_seq[-1] == common.END_SEQ:
                    # Complete sequence: drop BEGIN_SEQ and END_SEQ.
                    final_states.append((new_seq[1:-1], new_prob))
                elif max_len is not None and i == max_len - 1:
                    # Cut off at the length limit: there is no END_SEQ to
                    # remove, so only BEGIN_SEQ is dropped and the last
                    # predicted symbol is kept.
                    final_states.append((new_seq[1:], new_prob))
                else:
                    new_hypos.append(
                        (new_seq, new_prob, s, attention_output_vector))

        # keep the most probable expansions from all hypotheses
        new_probs = np.array([hypo[1] for hypo in new_hypos])
        argmax_indices = common.argmax(new_probs, self.beam_size)
        beam[i] = [new_hypos[l] for l in argmax_indices]
        i += 1

    # get nbest results from final states found in search
    final_probs = np.array([state[1] for state in final_states])
    argmax_indices = common.argmax(final_probs, self.beam_size)
    nbest_seqs = [final_states[l] for l in argmax_indices]

    return nbest_seqs, alphas_mtx
def predict_greedy(self, encoder, input_seq):
    """Greedily decode ``input_seq``, one arg-max symbol per step.

    Decoding stops when END_SEQ is predicted or, if
    ``self.max_prediction_len`` is not ``None``, after that many steps.

    :param encoder: object whose ``encode_batch`` encodes the input
    :param input_seq: input symbol sequence
    :return: ``[]`` for an empty ``input_seq``; otherwise
        ``(predicted_sequence, alphas_mtx)`` where the sequence excludes
        END_SEQ and ``alphas_mtx`` holds the attention weights collected
        when ``self.plot`` is set
    """
    # Checked before any graph work, as predict_beamsearch does.
    # NOTE(review): the empty-input return is a bare list while the normal
    # return is a 2-tuple; kept as-is for backward compatibility.
    if len(input_seq) == 0:
        return []

    dn.renew_cg()
    self.readout = dn.parameter(self.params['readout'])
    self.bias = dn.parameter(self.params['bias'])
    self.w_c = dn.parameter(self.params['w_c'])
    self.u_a = dn.parameter(self.params['u_a'])
    self.v_a = dn.parameter(self.params['v_a'])
    self.w_a = dn.parameter(self.params['w_a'])

    alphas_mtx = []

    # encode input sequence
    blstm_outputs, input_masks = encoder.encode_batch([input_seq])

    s = self.decoder_rnn.initial_state()

    # First decoder input: BEGIN_SEQ embedding + special padding vector.
    prev_output_vec = dn.concatenate([
        self.output_lookup[self.y2int[common.BEGIN_SEQ]],
        self.init_lookup[0]
    ])
    predicted_sequence = []
    i = 0

    while (self.max_prediction_len is None) or (i < self.max_prediction_len):
        s = s.add_input(prev_output_vec)
        decoder_rnn_output = s.output()

        attention_output_vector, alphas = self.attend(
            blstm_outputs, decoder_rnn_output)

        if self.plot:
            alphas_mtx.append(alphas.vec_value())

        # h = readout * attention_output_vector + bias
        h = dn.affine_transform(
            [self.bias, self.readout, attention_output_vector])

        # TODO: understand why diverse needs tanh before softmax
        if self.diverse:
            h = dn.tanh(h)
        probs = dn.softmax(h)

        # greedy choice
        next_element_index = np.argmax(probs.npvalue())
        predicted_sequence.append(self.int2y[next_element_index])

        if predicted_sequence[-1] == common.END_SEQ:
            break

        # "feedback" for the next step
        prev_output_vec = dn.concatenate([
            self.output_lookup[next_element_index],
            attention_output_vector
        ])
        i += 1

    # Remove the END_SEQ symbol only if it is actually there: when decoding
    # stops at the length limit the last element is a real prediction.
    if predicted_sequence and predicted_sequence[-1] == common.END_SEQ:
        predicted_sequence = predicted_sequence[:-1]

    return predicted_sequence, alphas_mtx
def __call__(self, x=None, t=None, test=False):
    """Language model with a 'bow' or 'attention' encoder.

    With ``test`` a single softmax output is computed from the context
    window ``t``, using encoder quantities stored on ``self`` (``W_enc`` for
    'bow'; ``xt`` and ``xb`` for 'attention'). Otherwise one un-normalised
    output is produced for each window of ``self.c`` consecutive symbols of
    ``t``.

    :param x: source symbol ids (read only when ``test`` is false)
    :param t: target symbol ids
    :param test: selects single-step mode
    :return: with ``test`` the softmax expression ``y_t``; otherwise the
        list of per-window output expressions
    """

    def lm_hidden(window_embs):
        # Neural language model hidden layer over a context window.
        return dy.tanh(self.U * dy.concatenate(window_embs))

    if test:
        tt_embs = [dy.lookup(self.E, t_t) for t_t in t]

        if self.encoder_type == 'bow':
            h = lm_hidden(tt_embs)
            # Output with softmax
            y_t = dy.softmax(self.V * h + self.W_enc)

        elif self.encoder_type == 'attention':
            ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]
            h = lm_hidden(tt_embs)
            ttp_c = dy.concatenate(ttp_embs)
            p = dy.softmax(self.xt * self.P * ttp_c)  # Attention weight
            enc = self.xb * p  # Context vector
            # Output with softmax
            y_t = dy.softmax(self.V * h + self.W * enc)

        return y_t

    xt_embs = [dy.lookup(self.F, x_t) for x_t in x]
    tt_embs = [dy.lookup(self.E, t_t) for t_t in t]
    n_windows = len(t) - self.c + 1
    y = []

    if self.encoder_type == 'bow':
        # Bag-of-words encoder: the averaged source embedding is shared by
        # every window.
        W_enc = self.W * dy.average(xt_embs)
        for start in range(n_windows):
            h = lm_hidden(tt_embs[start:start + self.c])
            # Output without softmax
            y.append(self.V * h + W_enc)

    elif self.encoder_type == 'attention':
        # Local sums of source embeddings around each position, divided by
        # q.
        smoothed = []
        for pos in range(len(x)):
            lo = max(pos - self.q, 0)
            hi = min(len(x), pos + self.q + 1)
            smoothed.append(dy.esum(xt_embs[lo:hi]) / self.q)
        xb = dy.concatenate(smoothed, d=1)
        xt = dy.transpose(dy.concatenate(xt_embs, d=1))
        ttp_embs = [dy.lookup(self.G, t_t) for t_t in t]

        for start in range(n_windows):
            stop = start + self.c
            h = lm_hidden(tt_embs[start:stop])
            ttp_c = dy.concatenate(ttp_embs[start:stop])
            p = dy.softmax(xt * self.P * ttp_c)  # Attention weight
            enc = xb * p  # Context vector
            # Output without softmax
            y.append(self.V * h + self.W * enc)

    return y
def classify_single(self, embedding):
    """Return the vocabulary entry with the highest softmax probability.

    :param embedding: input forwarded to ``self.get_graph``
    :return: ``vocab.getw`` of the arg-max index of the softmax output
    """
    probabilities = dy.softmax(self.get_graph(embedding)).npvalue()
    best_index = np.argmax(probabilities)
    return vocab.getw(best_index)