def copy_src_probs_pick(token_type, token_literal):
    # Probability mass the copy attention assigns to the source positions
    # holding exactly this literal of the given token type.
    if token_type not in copy_atts:
        return dy.scalarInput(0.0)
    selected_indexes = copy_history[token_type][token_literal]
    if len(selected_indexes) == 0:
        return dy.scalarInput(0.0)
    probs = copy_src_probs(token_type)
    return dy.sum_elems(dy.select_rows(probs, selected_indexes))
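# A minimal standalone sketch (not from the original source) of the pattern
# copy_src_probs_pick relies on: the total probability assigned to a set of
# source positions is the sum of the corresponding rows of the probability
# vector. The values below are made up for illustration.
import dynet as dy

dy.renew_cg()
probs = dy.softmax(dy.inputVector([1.0, 2.0, 0.5, 0.1]))
positions = [0, 2]
mass = dy.sum_elems(dy.select_rows(probs, positions))
print(mass.value())  # probability mass on positions 0 and 2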
def embed_word(self, word):
    if self.tied:
        # Tied embeddings: reuse the final output layer's weight matrix
        # (vocab x dim) and take the word's row, transposed to a column.
        word_embs = self.final_mlp.layers[-1].w
        word_emb = dy.select_rows(word_embs, [word])
        word_emb = dy.transpose(word_emb)
    else:
        word_emb = dy.lookup(self.word_embs, word)
    return word_emb
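# A hypothetical sketch of the weight-tying idea used in embed_word above:
# the output projection matrix (vocab x dim) doubles as the embedding table,
# so a word's embedding is the matching row, transposed into a column vector.
# Sizes and parameter names here are assumptions, not from the original model.
import dynet as dy

pc = dy.ParameterCollection()
vocab_size, emb_dim = 100, 16
W_out = pc.add_parameters((vocab_size, emb_dim))  # shared output/embedding weights

dy.renew_cg()
W = dy.parameter(W_out)
word_id = 7
word_emb = dy.transpose(dy.select_rows(W, [word_id]))  # (emb_dim, 1) column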
def copy_src_probs_map(token_type, lazy=False):
    # Map each copyable literal of this token type to the probability mass
    # on its source positions; with lazy=True the values stay DyNet
    # expressions instead of being evaluated to floats.
    if token_type not in copy_atts:
        return {}
    literal_history = copy_history[token_type]
    if all(len(history) == 0 for history in literal_history.values()):
        return {}
    probs = copy_src_probs(token_type)
    if lazy:
        return {
            literal: dy.sum_elems(dy.select_rows(probs, history))
            for literal, history in literal_history.items()
            if len(history) > 0
        }
    return {
        literal: dy.sum_elems(dy.select_rows(probs, history)).value()
        for literal, history in literal_history.items()
        if len(history) > 0
    }
def cal_context(self, s, selected=None):
    ws = self.cal_scores(s)
    if selected is None:
        return self.es_matrix * ws, ws
    # Restrict attention to the selected positions and renormalize the
    # weights so they sum to one over that subset.
    selected_ws = dy.select_rows(ws, selected)
    selected_ws = dy.cdiv(selected_ws, dy.sum_elems(selected_ws))
    return dy.concatenate_cols(
        [es[index] for index in selected]) * selected_ws, ws
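# A small sketch (values assumed) of the "selected" branch of cal_context:
# attention weights are restricted to a subset of rows and renormalized so
# they again sum to one over that subset.
import dynet as dy

dy.renew_cg()
ws = dy.softmax(dy.inputVector([0.1, 0.5, 0.2, 0.9]))  # full attention weights
selected = [1, 3]
selected_ws = dy.select_rows(ws, selected)
selected_ws = dy.cdiv(selected_ws, dy.sum_elems(selected_ws))
print(selected_ws.npvalue())  # sums to 1 over the selected positions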
def split(x, dim=1):
    # Split a matrix expression into a list of its rows (dim=0) or
    # columns (dim=1).
    head_shape, batch_size = x.dim()
    res = []
    if dim == 0:
        for i in range(head_shape[0]):
            res.append(dy.select_rows(x, [i]))
    elif dim == 1:
        for i in range(head_shape[1]):
            res.append(dy.select_cols(x, [i]))
    return res
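# Usage sketch (input assumed) for split() above: a 3x4 matrix splits into
# three (1, 4) row slices with dim=0, or four (3, 1) column slices with dim=1.
import numpy as np
import dynet as dy

dy.renew_cg()
m = dy.inputTensor(np.arange(12.0).reshape(3, 4))
rows = split(m, dim=0)  # list of 3 expressions, each a single row
cols = split(m, dim=1)  # list of 4 expressions, each a single column
print(rows[0].npvalue(), cols[0].npvalue())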
def embed_word(self, word):
    if self.tied:
        # Tied embeddings: the word vector is the corresponding row of the
        # output MLP's final weight matrix, transposed to a column.
        word_embs = self.output_mlp.layers[-1].w
        word_emb = dy.select_rows(word_embs, [word])
        word_emb = dy.transpose(word_emb)
    else:
        word_emb = dy.lookup(self.word_embs, word)
    # Normalize word vectors to have length one
    # word_emb_norm = dy.pow(dy.dot_product(word_emb, word_emb), self.exp)
    # word_emb = word_emb * word_emb_norm
    return word_emb
def _lm_model_step(m, beam_indices, tag_indices):
    m["beam_lm_states"] = [
        m["beam_lm_states"][b_idx].add_input(t2e[t_idx])
        for (b_idx, t_idx) in izip(beam_indices, tag_indices)
    ]
    m["beam_lm_hs"] = dy.concatenate_cols(
        [x.output() for x in m["beam_lm_states"]])
    m["idx"] = m["idx"] + 1
    if cfg["accumulate_scores"]:
        beam_size_prev, num_tags = m["scores"].dim()[0]
        scores_flat = dy.reshape(m["scores"], (beam_size_prev * num_tags, 1))
        # Column-major reshape: entry (b, t) of the (beam x tags) score matrix
        # lands at flat row b + t * beam_size_prev (assumes the index arrays
        # support elementwise arithmetic, e.g. numpy arrays).
        m["acc_scores"] = dy.select_rows(
            scores_flat, beam_indices + tag_indices * beam_size_prev)
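# Sketch (shapes and indices assumed) of the flat-index arithmetic in
# _lm_model_step above: DyNet reshapes column-major, so entry (b, t) of a
# (beam x tags) score matrix ends up at flat row b + t * beam.
import numpy as np
import dynet as dy

dy.renew_cg()
beam, tags = 2, 3
scores = dy.inputTensor(np.arange(6.0).reshape(beam, tags))
flat = dy.reshape(scores, (beam * tags, 1))
b_idx = np.array([0, 1])
t_idx = np.array([2, 1])
picked = dy.select_rows(flat, (b_idx + t_idx * beam).tolist())
print(picked.npvalue())  # scores[0, 2] and scores[1, 1]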
def from_input_prob(self, selected_indexes, neg=False):
    assert type(selected_indexes) == set
    # Re-index: keep only the positions whose original index was selected.
    selected_indexes = [
        index for index, old_index in enumerate(input_num_indexes)
        if old_index in selected_indexes
    ]
    if len(selected_indexes) == 0:
        return dy.scalarInput(0.0), dy.zeros(decoder.arg_dim)
    signed_h = self._signed_h(neg=neg)
    input_ref, probs = input_atts.cal_context(signed_h, selected_indexes)
    with parameters(decoder.neg_input_embed,
                    decoder.pos_input_embed) as (neg_input_embed,
                                                 pos_input_embed):
        input_ref = dy.concatenate([
            input_ref, neg_input_embed if neg else pos_input_embed
        ])
    return dy.sum_elems(dy.select_rows(probs, selected_indexes)), input_ref
def from_exprs_prob(self, selected_indexes, neg=False):
    assert type(selected_indexes) == set
    # Re-index: keep only the positions whose original index was selected.
    selected_indexes = [
        index for index, old_index in enumerate(self.exprs_num_indexs)
        if old_index in selected_indexes
    ]
    if len(selected_indexes) == 0:
        return dy.scalarInput(0.0), dy.zeros(decoder.arg_dim)
    ht = dy.tanh(decoder.h2ht(self.s.output()))
    signed_h = self._signed_h(ht, neg)
    exprs_ref, probs = self.expr_atts.cal_context(signed_h, selected_indexes)
    with parameters(decoder.neg_exprs_embed,
                    decoder.pos_exprs_embed) as (neg_exprs_embed,
                                                 pos_exprs_embed):
        exprs_ref = dy.concatenate([
            exprs_ref, neg_exprs_embed if neg else pos_exprs_embed
        ])
    return dy.sum_elems(dy.select_rows(probs, selected_indexes)), exprs_ref
def attend(input_vectors, state, params, dropout_amount=0.):
    """Attends on some input vectors given a state and attention parameters.

    Inputs:
        input_vectors (list of dy.Expression): Vectors to attend on.
        state (dy.Expression): A state (query).
        params (dy.Expression): Attentional weights to transform the state
            before computing attention.
        dropout_amount (float, optional): The amount of dropout to apply
            after transforming the state.

    Returns:
        dy.Expression representing the weighted sum of input vectors given
        the computed attentional weights.
    """
    projected_state = dy.transpose(
        dy.reshape(state, (1, state.dim()[0][0])) * params)
    projected_state = dy.dropout(projected_state, dropout_amount)
    attention_weights = dy.select_rows(
        dy.transpose(projected_state) * dy.concatenate_cols(input_vectors),
        [0])[0]
    context = dy.concatenate_cols(input_vectors) * dy.softmax(
        attention_weights)
    return context, dy.softmax(attention_weights)
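# Hypothetical call to attend() above: three 4-dimensional "encoder" vectors,
# a 4-dimensional query state, and a 4x4 attention parameter matrix. All
# names and sizes are assumptions for illustration only.
import dynet as dy

pc = dy.ParameterCollection()
W_att = pc.add_parameters((4, 4))

dy.renew_cg()
inputs = [dy.inputVector([0.1 * i, 0.2, 0.3, 0.4]) for i in range(3)]
state = dy.inputVector([0.5, 0.1, 0.0, 0.2])
context, weights = attend(inputs, state, dy.parameter(W_att))
print(context.npvalue(), weights.npvalue())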
def get_rlstm_output(self, hypothesis, word2int, P_mat_in, prem_seq_len,
                     improvement):
    lookup = self.params["lookup"]  # get lookup parameters
    hypo_seq = [lookup[word2int.get(i)] for i in hypothesis]  # get embeddings of each word
    # get initial state
    fw_s0 = self.fw_hypo_builder.initial_state()
    bw_s0 = self.bw_hypo_builder.initial_state()
    # will get the last state each time
    fw_s = fw_s0
    bw_s = bw_s0
    # get fw parameter expressions
    fw_At_prev = dy.parameter(self.params["fw_A_t0"])
    fw_Wp = dy.parameter(self.params["fw_Wp"])
    fw_Wm = dy.parameter(self.params["fw_Wm"])
    fw_Wc = dy.parameter(self.params["fw_Wc"])
    fw_Walpha = dy.parameter(self.params["fw_Walpha"])
    # get bw parameter expressions
    bw_At_prev = dy.parameter(self.params["bw_A_t0"])
    bw_Wp = dy.parameter(self.params["bw_Wp"])
    bw_Wm = dy.parameter(self.params["bw_Wm"])
    bw_Wc = dy.parameter(self.params["bw_Wc"])
    bw_Walpha = dy.parameter(self.params["bw_Walpha"])
    # create mask for the attend vector to take into account only the
    # length of the current sequence
    if prem_seq_len < self.max_seq_len:
        mask = dy.concatenate([
            dy.ones(prem_seq_len),
            dy.zeros(self.max_seq_len - prem_seq_len)
        ])
        # bw_mask = dy.concatenate([dy.zeros(self.max_seq_len-prem_seq_len), dy.ones(prem_seq_len)])
    else:
        mask = dy.ones(prem_seq_len)
        # bw_mask = dy.ones(prem_seq_len)
    # calculate forward & backward mask
    At_mask_fw = dy.cmult(fw_At_prev, mask)
    At_mask_bw = dy.cmult(bw_At_prev, mask)
    if improvement == "2" or improvement == "3":
        if prem_seq_len < self.max_seq_len:
            bw_mask = dy.concatenate([
                dy.zeros(self.max_seq_len - prem_seq_len),
                dy.ones(prem_seq_len)
            ])
        else:
            bw_mask = dy.ones(prem_seq_len)
        At_mask_bw = dy.cmult(bw_At_prev, bw_mask)

    if improvement == "2" or improvement == "3":
        P_mat = P_mat_in[0]
    else:
        P_mat = P_mat_in
    idx = 0
    fw_output_vec = []
    # calculate the new output with the attention of the fw lstm
    for word in hypo_seq:
        fw_s = fw_s.add_input(word)  # add input to the network
        h_t = fw_s.h()[0]  # get the output vector of the current timestep
        # get the output gate value:
        Weights = self.fw_hypo_builder.get_parameter_expressions()
        Wox = dy.select_rows(
            Weights[0][0], range(self.params_size * 2, self.params_size * 3))
        Woh = dy.select_rows(
            Weights[0][1], range(self.params_size * 2, self.params_size * 3))
        bo = dy.select_rows(
            Weights[0][2], range(self.params_size * 2, self.params_size * 3))
        if idx == 0:
            out_gate = dy.logistic(Wox * word + bo)
        else:
            h_t_prev = fw_s.prev().h()[0]
            out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)
        # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
        # m dim: params_size x 1
        mt = P_mat * At_mask_fw
        # get the new out vector
        m_gated = dy.cmult(dy.tanh(mt), out_gate)
        h_t_new = h_t + m_gated
        fw_output_vec.append(h_t_new)
        # calculate alpha
        alpha = dy.colwise_add(fw_Wp * P_mat, fw_Wm * mt)
        if idx > 0:
            s_t_prev = fw_s.prev().s()[0]
            alpha = dy.colwise_add(alpha, fw_Wc * s_t_prev)
        if improvement == "1" or improvement == "3":
            alpha = dy.tanh(alpha)
        # compute the next At
        At_fw = dy.transpose(dy.transpose(fw_Walpha) * alpha)
        At_fw_exp = dy.exp(At_fw)
        At_fw_exp_mask = dy.cmult(At_fw_exp, mask)
        At_mask_fw = dy.cdiv(At_fw_exp_mask, dy.sum_elems(At_fw_exp_mask))
        idx += 1

    if improvement == "2" or improvement == "3":
        P_mat = P_mat_in[1]
    else:
        P_mat = P_mat_in
    idx = 0
    bw_output_vec = []
    # calculate the new output with the attention of the bw lstm
    for word in reversed(hypo_seq):
        bw_s = bw_s.add_input(word)  # add input to the network
        h_t = bw_s.h()[0]  # get the output vector of the current timestep
        # get the output gate value:
        Weights = self.bw_hypo_builder.get_parameter_expressions()
        Wox = dy.select_rows(
            Weights[0][0], range(self.params_size * 2, self.params_size * 3))
        Woh = dy.select_rows(
            Weights[0][1], range(self.params_size * 2, self.params_size * 3))
        bo = dy.select_rows(
            Weights[0][2], range(self.params_size * 2, self.params_size * 3))
        if idx == 0:
            out_gate = dy.logistic(Wox * word + bo)
        else:
            h_t_prev = bw_s.prev().h()[0]
            out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)
        # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
        # m dim: params_size x 1
        mt = P_mat * At_mask_bw
        # get the new out vector
        m_gated = dy.cmult(dy.tanh(mt), out_gate)
        h_t_new = h_t + m_gated
        bw_output_vec.append(h_t_new)
        # calculate alpha
        alpha = dy.colwise_add(bw_Wp * P_mat, bw_Wm * mt)
        if idx > 0:
            s_t_prev = bw_s.prev().s()[0]
            alpha = dy.colwise_add(alpha, bw_Wc * s_t_prev)
        if improvement == "1" or improvement == "3":
            alpha = dy.tanh(alpha)
        # compute the next At
        At_bw = dy.transpose(dy.transpose(bw_Walpha) * alpha)
        At_bw_exp = dy.exp(At_bw)
        At_bw_exp_mask = dy.cmult(At_bw_exp, mask)
        At_mask_bw = dy.cdiv(At_bw_exp_mask, dy.sum_elems(At_bw_exp_mask))
        idx += 1
    return fw_output_vec, bw_output_vec