def flatten_triple(action_scores, location_scores, argument_scores):
    """ Flattens three score vectors by summing over all possibilities. """
    num_actions = action_scores.dim()[0][0]
    num_locations = location_scores.dim()[0][0]
    num_arguments = argument_scores.dim()[0][0]

    expanded_arguments = dy.reshape(argument_scores, (num_arguments, 1)) \
        * dy.ones((1, num_locations))
    expanded_locations = dy.ones((num_arguments, 1)) \
        * dy.reshape(location_scores, (1, num_locations))

    # num_locations x num_arguments
    location_and_argument_scores = expanded_locations + expanded_arguments

    location_and_argument_expanded = \
        dy.reshape(location_and_argument_scores,
                   (num_locations * num_arguments, 1)) * dy.ones((1, num_actions))
    expanded_actions = dy.ones((num_arguments * num_locations, 1)) \
        * dy.reshape(action_scores, (1, num_actions))
    final_scores = location_and_argument_expanded + expanded_actions

    # num_actions * num_locations x num_arguments
    final_scores = dy.reshape(final_scores,
                              (num_actions * num_locations * num_arguments, 1))
    return final_scores
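# Minimal usage sketch for flatten_triple above (illustrative sizes and values only;
# assumes dynet is imported as dy and a fresh computation graph).
import dynet as dy

dy.renew_cg()
action_scores = dy.inputVector([0.1, 0.2])          # 2 actions
location_scores = dy.inputVector([0.3, 0.4, 0.5])   # 3 locations
argument_scores = dy.inputVector([0.6, 0.7])        # 2 arguments
joint = flatten_triple(action_scores, location_scores, argument_scores)
# one summed score per (action, location, argument) triple
assert joint.dim()[0] == (2 * 3 * 2, 1)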
def init_sequence(self, test=False):
    self.test = test
    if not test:
        self.dropout_mask_x = dy.dropout(dy.ones((self.n_in, )), self.dropout_x)
        self.dropout_mask_h = dy.dropout(dy.ones((self.n_hidden, )), self.dropout_h)
def set_dropouts(self, input_drop=0, recur_drop=0):
    self.input_drop = input_drop
    self.recur_drop = recur_drop
    self.input_drop_mask = dy.dropout(dy.ones(self.input_size), self.input_drop)
    self.recur_drop_mask = dy.dropout(dy.ones(self.recur_size), self.recur_drop)
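# Minimal sketch of the mask trick used above (values illustrative): applying
# dy.dropout to a vector of ones yields a mask of zeros and 1/(1-p) entries that
# can be reused, e.g. via dy.cmult, across time steps for variational dropout.
import dynet as dy

dy.renew_cg()
mask = dy.dropout(dy.ones((5,)), 0.4)
print(mask.npvalue())  # e.g. [1.667, 0., 1.667, 1.667, 0.]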
def cal_scores(self, src_encodings):
    src_len = len(src_encodings)

    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size

    W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
    b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
    W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
    b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

    W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
    b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
    W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
    b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)

    U_label_1 = [dy.parameter(x) for x in self.U_label_1]
    u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
    u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
    b_label = [dy.parameter(x) for x in self.b_label]

    h_arc_head = dy.rectify(dy.affine_transform(
        [b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings]))  # n_arc_ml_units, src_len, bs
    h_arc_dep = dy.rectify(dy.affine_transform(
        [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
    h_label_head = dy.rectify(dy.affine_transform(
        [b_label_hidden_to_head, W_label_hidden_to_head, src_encodings]))
    h_label_dep = dy.rectify(dy.affine_transform(
        [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))

    h_arc_head_transpose = dy.transpose(h_arc_head)
    h_label_head_transpose = dy.transpose(h_label_head)

    s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

    s_label = []
    for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label):
        e1 = h_label_head_transpose * U_1 * h_label_dep
        e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
        e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(e1 + e2 + e3 + b)
    return s_arc, s_label
def l2_normalize(vector):
    square_sum = dy.sqrt(
        dy.bmax(
            dy.sum_elems(dy.square(vector)),
            np.finfo(float).eps * dy.ones((1))[0],
        ))
    return dy.cdiv(vector, square_sum)
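# Minimal usage sketch for l2_normalize above (assumes numpy as np and dynet as dy;
# the printed values are what the math implies if the division broadcasts as used here).
import dynet as dy
import numpy as np

dy.renew_cg()
v = dy.inputVector([3.0, 4.0])
print(l2_normalize(v).npvalue())  # roughly [0.6, 0.8]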
def beam_decode(self, encodings, input_len=10, beam_size=1):
    batch_size = 1
    self.__dec.init_params(encodings, batch_size, self.__train_flag)
    context = dy.zeros((self.__enc.output_dim, ))
    beams = [Beam(self.__dec.dec_state, context, [self.__trg_sos], 0.0)]
    for i in xrange(int(min(self.__max_len, input_len * 1.5))):
        new_beams = []
        p_list = []
        for b in beams:
            if b.words[-1] == self.__trg_eos:
                p_list.append(dy.ones((self.__trg_vsize, )))
                continue
            hidden, embs, b.state = self.__dec.next([b.words[-1]], b.context,
                                                    self.__train_flag, b.state)
            b.context, _ = self.attend(encodings, hidden)
            score = self.__dec.score(hidden, b.context, embs, self.__train_flag)
            p_list.append(dy.softmax(score))
        p_list = dy.concatenate_to_batch(p_list).npvalue().T.reshape(-1, self.__trg_vsize)
        for p, b in zip(p_list, beams):
            p = p.flatten() / p.sum()
            kbest = np.argsort(p)
            if b.words[-1] == self.__trg_eos:
                new_beams.append(Beam(b.state, b.context, b.words, b.log_prob))
            else:
                for next_word in kbest[-beam_size:]:
                    new_beams.append(Beam(b.state, b.context,
                                          b.words + [next_word],
                                          b.log_prob + np.log(p[next_word])))
        beams = sorted(new_beams, key=lambda b: b.log_prob)[-beam_size:]
        if beams[-1].words[-1] == self.__trg_eos:
            break
    return beams[-1].words
def __init__(self, model, input_size, recur_size, forget_bias=0.0):
    self.input_size = input_size
    self.recur_size = recur_size
    self.input_drop_mask = dy.ones(self.input_size)
    self.recur_drop_mask = dy.ones(self.recur_size)
    self.forget_bias = forget_bias
    self.cell_previous = None
    self.hidden_previous = None
    self.init = False
    self.input_drop = 0
    self.recur_drop = 0

    Saxe_initializer = Saxe.Orthogonal()
    gates_init = Saxe_initializer((self.recur_size, self.input_size + self.recur_size))
    gates_init = np.concatenate([gates_init] * 4)
    self.WXH = model.add_parameters((self.recur_size * 4, self.input_size + self.recur_size),
                                    init=dy.NumpyInitializer(gates_init))
    self.b = model.add_parameters((self.recur_size * 4), init=dy.ConstInitializer(0))
def __call__(self, X):
    d_x = X.dim()[0][0]
    d_y = X.dim()[0][1]
    g = dy.ones((d_x, d_y))
    b = dy.zeros((d_x, d_y))

    Y = []
    for attention in self.attention:
        Y.append(attention(X))
    Y = dy.esum(Y)

    Y = dy.layer_norm(X + Y, g, b)
    Y = dy.layer_norm(Y + dy.transpose(self.feedforward(dy.transpose(Y))), g, b)
    return Y
def _lm_model_scores(m):
    # assert not (cfg["use_beam_bilstm"] or cfg["use_beam_mlp"])
    idx = m["idx"]
    cur_beam_size = len(m["beam_lm_states"])
    q = dy.reshape(m["i_embs"][idx], (input_dim, 1)) * dy.ones((1, cur_beam_size))
    x = dy.concatenate([m["beam_lm_hs"], q])
    scores = W * x + b
    scores = dy.transpose(scores)
    if cfg["accumulate_scores"]:
        scores = m["acc_scores"] + scores
    m["scores"] = scores
    return scores
def train(self, train_path):
    with open(train_path, "r") as train:
        shuffledData = list(read_data(train))
        random.shuffle(shuffledData)

        for iPair, (sentence, lf) in enumerate(shuffledData):
            print(iPair, sentence)

            # I-Context Encoding
            lstm_forward = self.context_encoder[0].initial_state()
            for entry in sentence:
                lstm_forward = lstm_forward.add_input(
                    self.wlookup[self.w2i[entry] if entry in self.w2i else self.w2i["_UNK"]])
            hidden_context = lstm_forward.h()

            init_h = [dy.ones(self.ldims)]
            state = self.logical_form_decoder.initial_state()
            state.set_h(init_h)

            dy.renew_cg()
def GetQDScore(self, qwds, qw2v, qvecs, dwds, dw2v, dvecs, extra, train=False):
    nq = len(qvecs)
    nd = len(dvecs)

    qgl = [self.W_gate.expr() * dy.concatenate([qv, dy.constant(1, self.idf_val(qw))])
           for qv, qw in zip(qvecs, qwds)]
    qgates = dy.softmax(dy.concatenate(qgl))

    sims = []
    for qv in qvecs:
        dsims = []
        for dv in dvecs:
            dsims.append(self.Cosine(qv, dv))
        sims.append(dsims)

    w2v_sims = []
    for qv in qw2v:
        dsims = []
        for dv in dw2v:
            dsims.append(self.Cosine(qv, dv))
        w2v_sims.append(dsims)

    matches = []
    for qw in qwds:
        dmatch = []
        for dw in dwds:
            dmatch.append(dy.ones(1) if qw == dw else dy.zeros(1))
        matches.append(dmatch)

    qscores = self.GetPOSIT(qvecs, sims, w2v_sims, matches)

    # Final scores and ultimate classifier.
    qterm_score = dy.dot_product(dy.concatenate(qscores), qgates)
    fin_score = (self.W_final.expr() * dy.concatenate([qterm_score, extra]))
    return fin_score
def test_set_s(self):
    dy.renew_cg()
    init_s = [dy.ones(10), dy.ones(10), dy.ones(10), dy.ones(10)]
    state = self.rnn.initial_state()
    state.set_s(init_s)
def test_initial_state_vec(self):
    dy.renew_cg()
    init_s = [dy.ones(10), dy.ones(10), dy.ones(10), dy.ones(10)]
    self.rnn.initial_state(init_s)
def set_dropout_masks(self, batch_size):
    self.input_drop_mask = dy.dropout(dy.ones((self.input_size), batch_size), self.input_drop)
    self.recur_drop_mask = dy.dropout(dy.ones((self.recur_size), batch_size), self.recur_drop)
def calc_loss(sent, epsilon=0.0):
    # dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent[0]
    tags = sent[1]

    # initialize the LSTM
    init_state_src = lstm_encode.initial_state()

    # get the output of the first LSTM
    src_output = init_state_src.add_inputs([embed[x] for x in src])[-1].output()

    # Now compute mean and standard deviation of source hidden state.
    W_mu_tweet = dy.parameter(W_mu_tweet_p)
    V_mu_tweet = dy.parameter(V_mu_tweet_p)
    b_mu_tweet = dy.parameter(b_mu_tweet_p)
    W_sig_tweet = dy.parameter(W_sig_tweet_p)
    V_sig_tweet = dy.parameter(V_sig_tweet_p)
    b_sig_tweet = dy.parameter(b_sig_tweet_p)

    # Compute tweet encoding
    mu_tweet = dy.dropout(mlp(src_output, W_mu_tweet, V_mu_tweet, b_mu_tweet), DROPOUT)
    log_var_tweet = dy.dropout(mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet), DROPOUT)

    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)
    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)

    # Compute tag encoding
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )), (NUM_TAGS, ))
    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag), DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag), DROPOUT)

    # Combine encodings for mean and diagonal covariance
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)
    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)

    # Slowly phase out getting both inputs
    if random.random() < epsilon:
        mask = dy.zeros(HIDDEN_DIM)
    else:
        mask = dy.ones(HIDDEN_DIM)

    if random.random() < 0.5:
        mu_tweet = dy.cmult(mu_tweet, mask)
        log_var_tweet = dy.cmult(log_var_tweet, mask)
    else:
        mu_tag = dy.cmult(mu_tag, mask)
        log_var_tag = dy.cmult(log_var_tag, mask)

    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform([b_sig, W_sig, dy.concatenate([log_var_tweet, log_var_tag])])

    # KL-Divergence loss computation
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var))

    z = reparameterize(mu, log_var)

    # now step through the output sentence
    all_losses = []

    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = src[0]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)

    for next_word in src[1:]:
        # feed the previous word's embedding into the decoder state
        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))

        # Slowly phase out teacher forcing (this may be slow??)
        if random.random() < epsilon:
            p = dy.softmax(s).npvalue()
            prev_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        else:
            prev_word = next_word

    softmax_loss = dy.esum(all_losses)

    W_hidden = dy.parameter(W_hidden_p)
    b_hidden = dy.parameter(b_hidden_p)
    W_out = dy.parameter(W_tag_output_p)
    b_out = dy.parameter(b_tag_output_p)

    h = dy.dropout(dy.tanh(b_hidden + W_hidden * z), DROPOUT)
    o = dy.logistic(b_out + W_out * h)
    crossentropy_loss = dy.binary_log_loss(o, tags_tensor)

    return kl_loss, softmax_loss, crossentropy_loss
def transduce(self, h_below: 'expression_seqs.ExpressionSequence', h_above, z_below) -> 'expression_seqs.ExpressionSequence':
    if self.c is None:
        self.c = dy.zeroes(dim=(self.hidden_dim, ))  #?? does (hidden,) take care of batch_size?
    if self.h is None:
        self.h = dy.zeroes(dim=(self.hidden_dim, ))
    if self.z is None:
        self.z = dy.ones(dim=(1, ))

    W_1l_r = dy.parameter(self.p_W_1l_r)
    bias = dy.parameter(self.p_bias)
    h = self.h  # previous hidden state of this layer (an expression, not a parameter)

    s_recur = W_1l_r * h  # matrix multiply is *, element-wise is dy.cmult
    if not self.last_layer:
        W_2l_td = dy.parameter(self.p_W_2l_td)
        W_0l_bu = dy.parameter(self.p_W_0l_bu)
        s_bottomup = W_0l_bu * h_below  #?? this is becoming (2049,). does it need to be (2049,1) to do scalar * matrix?
        s_topdown = W_2l_td * h_above
    else:
        s_topdown = dy.zeroes(s_recur.dim()[0][0], )  #?? this gets the shape e.g. ((5, 1), 1). do i actually want batch_size as well?
        s_bottomup = W_1l_r * h
    s_bottomup = dy.cmult(z_below, s_bottomup)  # to handle batched scalar * matrix -> e.g. (1x10, 2049x10)
    s_topdown = dy.cmult(self.z, s_topdown)  # will be zeros if last_layer. is this right, or should z=1 in this case ??

    fslice = s_recur + s_topdown + s_bottomup + bias  #?? checkme. bias has same shape as s_recur et al? [4*hidden+1, batch_size]?

    i_ft = dy.pick_range(fslice, 0, self.hidden_dim)
    i_it = dy.pick_range(fslice, self.hidden_dim, self.hidden_dim * 2)
    i_ot = dy.pick_range(fslice, self.hidden_dim * 2, self.hidden_dim * 3)
    i_gt = dy.pick_range(fslice, self.hidden_dim * 3, self.hidden_dim * 4)

    f_t = dy.logistic(i_ft + 1.0)  # +1.0 bc a paper said it was better to init that way (matthias)
    i_t = dy.logistic(i_it)
    o_t = dy.logistic(i_ot)
    g_t = dy.tanh(i_gt)

    # z * normal_update + (1-z)*copy: ie, when z_below is 0, z_new = z (copied prev timestamp).
    # when z_below is 1, z_new = dy.round etc
    # hier = True
    # z_tmp = dy.pick_range(fslice, self.hidden_dim*4, self.hidden_dim*4+1)
    # z_tilde = dy.logistic(z_tmp)  # original: hard sigmoid + slope annealing (a)
    # z_new = dy.cmult(1-z_below, self.z) + dy.cmult(z_below, dy.round(z_tilde, gradient_mode="straight_through_gradient"))

    # hier = False
    z_tmp = dy.pick_range(fslice, self.hidden_dim * 4, self.hidden_dim * 4 + 1)
    z_tilde = dy.logistic(z_tmp)  # original: hard sigmoid + slope annealing (a)
    z_new = dy.round(z_tilde, gradient_mode="straight_through_gradient")  # use straight-through estimator for gradient: step fn forward, hard sigmoid backward

    # z = z_l,t-1
    # z_below = z_l-1,t

    # if self.z.value() == 1:  # FLUSH
    #     c_new = dy.cmult(i_t, g_t)
    #     h_new = dy.cmult(o_t, dy.tanh(c_new))
    # elif z_below.value() == 0:  # COPY

    # if flush removed, only copy or normal update
    # when z_below is 0, c_new and h_new are self.c and self.h. when z_below is 1, c_new, h_new = normal update
    c_new = dy.cmult((1 - z_below), self.c) + dy.cmult(z_below, (dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)))
    h_new = dy.cmult((1 - z_below), self.h) + dy.cmult(z_below, dy.cmult(o_t, dy.tanh(c_new)))

    # if z_below.value() == 0:  # COPY
    #     c_new = self.c
    #     h_new = self.h
    # else:  # UPDATE
    #     c_new = dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)
    #     h_new = dy.cmult(o_t, dy.tanh(c_new))

    self.c = c_new
    self.h = h_new
    self.z = z_new

    return h_new, z_new
def transduce(self, xs: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
    batch_size = xs[0][0].dim()[1]

    h_bot = []
    h_mid = []
    h_top = []

    z_bot = []
    z_mid = []
    z_top = []

    self.top_layer.h = None
    self.top_layer.c = None
    self.top_layer.z = None
    self.mid_layer.h = None
    self.mid_layer.c = None
    self.mid_layer.z = None
    self.bottom_layer.h = None
    self.bottom_layer.c = None
    self.bottom_layer.z = None

    #?? checkme. want to init z to ones? (cherry paper)
    z_one = dy.ones(1, batch_size=batch_size)
    h_bot.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))  # indices for timesteps are +1
    h_mid.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))
    h_top.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))

    for i, x_t in enumerate(xs):
        h_t_bot, z_t_bot = self.bottom_layer.transduce(
            h_below=x_t, h_above=h_mid[i], z_below=z_one)  # uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
        h_t_mid, z_t_mid = self.mid_layer.transduce(
            h_below=h_t_bot, h_above=h_top[i], z_below=z_t_bot)  # uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
        h_t_top, z_t_top = self.top_layer.transduce(
            h_below=h_t_mid, h_above=None, z_below=z_t_mid)  # uses z_t_bot and h_t_bot from previous layer call, h_t_top and z_t_top from previous time step (saved in hmlstmcell)

        h_bot.append(h_t_bot)
        z_bot.append(z_t_bot)
        h_mid.append(h_t_mid)
        z_mid.append(z_t_mid)
        h_top.append(h_t_top)
        z_top.append(z_t_top)

    # # gated output module
    # # sigmoid
    # W_layer = dy.parameters(dim=(len(self.modules), hidden_dim))  # needs to be moved to init? num layers by hidden_dim
    # h_cat = dy.transpose(dy.concatenate([h_bot, h_mid, h_top]))
    # dotted = dy.dot_product(e1, e2)
    # gates = dy.logistic(dotted)
    # # relu
    # # om = dy.relu()

    # final state is last hidden state from top layer
    self._final_states = [transducers.FinalTransducerState(h_top[-1])]
    fin_xs = expression_seqs.ExpressionSequence(expr_list=h_top[1:])
    return fin_xs  # removes the init zeros to make it same length as seq
def decode(self, vectors, tag_vectors, output, lang_id, weight, teacher_prob=1.0):
    output = [self.EOS] + list(output) + [self.EOS]
    output = [self.char2int[c] for c in output]

    N = len(vectors)

    input_mat = dy.concatenate_cols(vectors)
    w1dt = None
    input_mat = dy.dropout(input_mat, self.DROPOUT_PROB)

    tag_input_mat = dy.concatenate_cols(tag_vectors)
    tag_w1dt = None

    last_output_embeddings = self.output_lookup[self.char2int[self.EOS]]
    s = self.dec_lstm.initial_state().add_input(
        dy.concatenate([vectors[-1], tag_vectors[-1], last_output_embeddings]))
    loss = []

    prev_att = dy.zeros(5)

    if self.USE_ATT_REG:
        total_att = dy.zeros(N)
    if self.USE_TAG_ATT_REG:
        total_tag_att = dy.zeros(len(tag_vectors))

    for char in output:
        # w1dt can be computed and cached once for the entire decoding phase
        w1dt = w1dt or self.attention_w1 * input_mat
        tag_w1dt = tag_w1dt or self.tag_attention_w1 * tag_input_mat

        state = dy.concatenate(list(s.s()))

        tag_att_weights = self.attend_tags(state, tag_w1dt)
        tag_context = tag_input_mat * tag_att_weights
        tag_context2 = dy.concatenate([tag_context, tag_context])

        new_state = state + tag_context2

        att_weights = self.attend_with_prev(new_state, w1dt, prev_att)
        context = input_mat * att_weights

        best_ic = np.argmax(att_weights.vec_value())
        context = input_mat * att_weights

        startt = min(best_ic - 2, N - 6)
        if startt < 0:
            startt = 0
        endd = startt + 5
        if N < 5:
            prev_att = dy.concatenate([att_weights] + [dy.zeros(1)] * (5 - N))
        else:
            prev_att = att_weights[startt:endd]
        # if prev_att.dim()[0][0] != 5:
        #     print(prev_att.dim())

        if self.USE_ATT_REG:
            total_att = total_att + att_weights
        if self.USE_TAG_ATT_REG:
            total_tag_att = total_tag_att + tag_att_weights

        vector = dy.concatenate([context, tag_context, last_output_embeddings])
        s = s.add_input(vector)
        s_out = dy.dropout(s.output(), self.DROPOUT_PROB)
        out_vector = self.decoder_w * s_out + self.decoder_b
        probs = dy.softmax(out_vector)

        if teacher_prob == 1:
            last_output_embeddings = self.output_lookup[char]
        else:
            if random() > teacher_prob:
                out_char = np.argmax(probs.npvalue())
                last_output_embeddings = self.output_lookup[out_char]
            else:
                last_output_embeddings = self.output_lookup[char]

        loss.append(-dy.log(dy.pick(probs, char)))

    loss = dy.esum(loss) * weight

    if self.PREDICT_LANG:
        last_enc_state = vectors[-1]
        adv_state = dy.flip_gradient(last_enc_state)
        pred_lang = dy.transpose(dy.transpose(adv_state) * self.lang_class_w)
        lang_probs = dy.softmax(pred_lang)
        lang_loss_1 = -dy.log(dy.pick(lang_probs, lang_id))

        first_enc_state = vectors[0]
        adv_state2 = dy.flip_gradient(first_enc_state)
        pred_lang2 = dy.transpose(dy.transpose(adv_state2) * self.lang_class_w)
        lang_probs2 = dy.softmax(pred_lang2)
        lang_loss_2 = -dy.log(dy.pick(lang_probs2, lang_id))

        loss += lang_loss_1 + lang_loss_2

    if self.USE_ATT_REG:
        loss += dy.huber_distance(dy.ones(N), total_att)
    if self.USE_TAG_ATT_REG:
        loss += dy.huber_distance(dy.ones(len(tag_vectors)), total_tag_att)

    return loss
def l2_normalize(x):
    epsilon = np.finfo(float).eps * dy.ones(x.dim()[0])
    norm = dy.sqrt(dy.sum_elems(dy.square(x)))
    sign = dy.cdiv(x, dy.bmax(dy.abs(x), epsilon))
    return dy.cdiv(dy.cmult(sign, dy.bmax(dy.abs(x), epsilon)), dy.bmax(norm, epsilon[0]))
def __call__(self, x: dy.Expression, att_mask: np.ndarray, batch_mask: np.ndarray, p: numbers.Real):
    """
    x: expression of dimensions (input_dim, time) x batch
    att_mask: numpy array of dimensions (time, time); pre-transposed
    batch_mask: numpy array of dimensions (batch, time)
    p: dropout prob
    """
    sent_len = x.dim()[0][1]
    batch_size = x[0].dim()[1]

    if self.downsample_factor > 1:
        if sent_len % self.downsample_factor != 0:
            raise ValueError(
                "For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                "Configure batcher accordingly.")
        if batch_mask is not None:
            batch_mask = batch_mask[:, ::self.downsample_factor]
        sent_len_out = sent_len // self.downsample_factor
        sent_len = sent_len_out
        out_mask = x.mask
        if self.downsample_factor > 1 and out_mask is not None:
            out_mask = out_mask.lin_subsampled(reduce_factor=self.downsample_factor)
        x = ExpressionSequence(expr_tensor=dy.reshape(
            x.as_tensor(),
            (x.dim()[0][0] * self.downsample_factor,
             x.dim()[0][1] / self.downsample_factor),
            batch_size=batch_size),
                               mask=out_mask)
        residual = SAAMTimeDistributed()(x)
    else:
        residual = SAAMTimeDistributed()(x)
        sent_len_out = sent_len
    if self.model_dim != self.input_dim * self.downsample_factor:
        residual = self.res_shortcut.transform(residual)

    # Concatenate all the words together for doing vectorized affine transform
    if self.kq_pos_encoding_type is None:
        kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
        key_up = self.shape_projection(
            dy.pick_range(kvq_lin, 0, self.head_count * self.dim_per_head), batch_size)
        value_up = self.shape_projection(
            dy.pick_range(kvq_lin, self.head_count * self.dim_per_head,
                          2 * self.head_count * self.dim_per_head), batch_size)
        query_up = self.shape_projection(
            dy.pick_range(kvq_lin, 2 * self.head_count * self.dim_per_head,
                          3 * self.head_count * self.dim_per_head), batch_size)
    else:
        assert self.kq_pos_encoding_type == "embedding"
        encoding = self.kq_positional_embedder.embed_sent(sent_len).as_tensor()
        kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
            ExpressionSequence(expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
        key_up = self.shape_projection(
            dy.pick_range(kq_lin, 0, self.head_count * self.dim_per_head), batch_size)
        query_up = self.shape_projection(
            dy.pick_range(kq_lin, self.head_count * self.dim_per_head,
                          2 * self.head_count * self.dim_per_head), batch_size)
        v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
        value_up = self.shape_projection(v_lin, batch_size)

    if self.cross_pos_encoding_type:
        assert self.cross_pos_encoding_type == "embedding"
        emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0, sent_len)
        emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0, sent_len)
        key_up = dy.reshape(key_up, (sent_len, self.dim_per_head, self.head_count),
                            batch_size=batch_size)
        key_up = dy.concatenate_cols([dy.cmult(key_up, emb1), dy.cmult(key_up, emb2)])
        key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2),
                            batch_size=self.head_count * batch_size)
        query_up = dy.reshape(query_up, (sent_len, self.dim_per_head, self.head_count),
                              batch_size=batch_size)
        query_up = dy.concatenate_cols([dy.cmult(query_up, emb2), dy.cmult(query_up, -emb1)])
        query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2),
                              batch_size=self.head_count * batch_size)

    scaled = query_up * dy.transpose(
        key_up / math.sqrt(self.dim_per_head))  # scale before the matrix multiplication to save memory

    # Apply Mask here
    if not self.ignore_masks:
        if att_mask is not None:
            att_mask_inp = att_mask * -100.0
            if self.downsample_factor > 1:
                att_mask_inp = att_mask_inp[::self.downsample_factor, ::self.downsample_factor]
            scaled += dy.inputTensor(att_mask_inp)
        if batch_mask is not None:
            # reshape (batch, time) -> (time, head_count*batch), then *-100
            inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                            (sent_len, self.head_count, batch_size)),
                            (1, sent_len, self.head_count * batch_size)) \
                  * -100
            mask_expr = dy.inputTensor(inp, batched=True)
            scaled += mask_expr
        if self.diag_gauss_mask:
            diag_growing = np.zeros((sent_len, sent_len, self.head_count))
            for i in range(sent_len):
                for j in range(sent_len):
                    diag_growing[i, j, :] = -(i - j)**2 / 2.0
            e_diag_gauss_mask = dy.inputTensor(diag_growing)
            e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
            if self.square_mask_std:
                e_sigma = dy.square(e_sigma)
            e_sigma_sq_inv = dy.cdiv(dy.ones(e_sigma.dim()[0], batch_size=batch_size),
                                     dy.square(e_sigma))
            e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask, e_sigma_sq_inv)
            scaled += dy.reshape(e_diag_gauss_mask_final, (sent_len, sent_len),
                                 batch_size=batch_size * self.head_count)

    # Computing Softmax here.
    attn = dy.softmax(scaled, d=1)
    if LOG_ATTENTION:
        yaml_logger.info({"key": "selfatt_mat_ax0",
                          "value": np.average(attn.value(), axis=0).dumps(),
                          "desc": self.desc})
        yaml_logger.info({"key": "selfatt_mat_ax1",
                          "value": np.average(attn.value(), axis=1).dumps(),
                          "desc": self.desc})
        yaml_logger.info({"key": "selfatt_mat_ax0_ent",
                          "value": entropy(attn.value()).dumps(),
                          "desc": self.desc})
        yaml_logger.info({"key": "selfatt_mat_ax1_ent",
                          "value": entropy(attn.value().transpose()).dumps(),
                          "desc": self.desc})

    self.select_att_head = 0
    if self.select_att_head is not None:
        attn = dy.reshape(attn, (sent_len, sent_len, self.head_count), batch_size=batch_size)
        sel_mask = np.zeros((1, 1, self.head_count))
        sel_mask[0, 0, self.select_att_head] = 1.0
        attn = dy.cmult(attn, dy.inputTensor(sel_mask))
        attn = dy.reshape(attn, (sent_len, sent_len), batch_size=self.head_count * batch_size)

    # Applying dropout to attention
    if p > 0.0:
        drop_attn = dy.dropout(attn, p)
    else:
        drop_attn = attn

    # Computing weighted attention score
    attn_prod = drop_attn * value_up

    # Reshaping the attn_prod to input query dimensions
    out = dy.reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count),
                     batch_size=batch_size)
    out = dy.transpose(out)
    out = dy.reshape(out, (self.model_dim, ), batch_size=batch_size * sent_len_out)
    # out = dy.reshape_transpose_reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), (self.model_dim,), pre_batch_size=batch_size, post_batch_size=batch_size*sent_len_out)

    if self.plot_attention:
        from sklearn.metrics.pairwise import cosine_similarity
        assert batch_size == 1
        mats = []
        for i in range(attn.dim()[1]):
            mats.append(dy.pick_batch_elem(attn, i).npvalue())
            self.plot_att_mat(
                mats[-1],
                "{}.sent_{}.head_{}.png".format(self.plot_attention, self.plot_attention_counter, i),
                300)
        avg_mat = np.average(mats, axis=0)
        self.plot_att_mat(
            avg_mat,
            "{}.sent_{}.head_avg.png".format(self.plot_attention, self.plot_attention_counter),
            300)
        cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
        self.plot_att_mat(
            cosim_before,
            "{}.sent_{}.cosim_before.png".format(self.plot_attention, self.plot_attention_counter),
            600)
        cosim_after = cosine_similarity(out.npvalue().T)
        self.plot_att_mat(
            cosim_after,
            "{}.sent_{}.cosim_after.png".format(self.plot_attention, self.plot_attention_counter),
            600)
        self.plot_attention_counter += 1

    # Adding dropout and layer normalization
    if p > 0.0:
        res = dy.dropout(out, p) + residual
    else:
        res = out + residual
    ret = self.layer_norm.transform(res)
    return ret
def cal_scores(self, src_encodings, predict=False):
    src_len = len(src_encodings)

    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size
    batch_size = src_encodings.dim()[1]

    W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
    b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
    W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
    b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)

    W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
    b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
    W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
    b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)

    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)

    U_label_1 = [dy.parameter(x) for x in self.U_label_1]
    u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
    u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
    b_label = [dy.parameter(x) for x in self.b_label]

    if predict:
        h_arc_head = self.leaky_ReLu(dy.affine_transform(
            [b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings]))  # n_arc_ml_units, src_len, bs
        h_arc_dep = self.leaky_ReLu(dy.affine_transform(
            [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
        h_label_head = self.leaky_ReLu(dy.affine_transform(
            [b_label_hidden_to_head, W_label_hidden_to_head, src_encodings]))
        h_label_dep = self.leaky_ReLu(dy.affine_transform(
            [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings]))
    else:
        src_encodings = dy.dropout_dim(src_encodings, 1, self.arc_mlp_dropout)

        h_arc_head = dy.dropout_dim(
            self.leaky_ReLu(dy.affine_transform(
                [b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings])),
            1, self.arc_mlp_dropout)  # n_arc_ml_units, src_len, bs
        h_arc_dep = dy.dropout_dim(
            self.leaky_ReLu(dy.affine_transform(
                [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings])),
            1, self.arc_mlp_dropout)
        h_label_head = dy.dropout_dim(
            self.leaky_ReLu(dy.affine_transform(
                [b_label_hidden_to_head, W_label_hidden_to_head, src_encodings])),
            1, self.label_mlp_dropout)
        h_label_dep = dy.dropout_dim(
            self.leaky_ReLu(dy.affine_transform(
                [b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings])),
            1, self.label_mlp_dropout)

    h_arc_head_transpose = dy.transpose(h_arc_head)
    h_label_head_transpose = dy.transpose(h_label_head)

    s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

    s_label = []
    for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2, b_label):
        e1 = h_label_head_transpose * U_1 * h_label_dep
        e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
        e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(e1 + e2 + e3 + b)
    return s_arc, s_label
def attend(self, context, x):
    context_emb = dy.esum(context)
    weights = dy.softmax(dy.ones((len(context), )))
    return context_emb, weights
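# Note on attend above: softmax over a constant vector gives uniform weights, so
# this is a uniform-attention baseline that ignores the query x. Minimal check
# (length illustrative):
import dynet as dy

dy.renew_cg()
print(dy.softmax(dy.ones((4,))).npvalue())  # [0.25, 0.25, 0.25, 0.25]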
def l2_normalize(x):
    square_sum = dynet.sqrt(dynet.bmax(dynet.sum_elems(dynet.square(x)),
                                       np.finfo(float).eps * dynet.ones((1))[0]))
    return dynet.cdiv(x, square_sum)
def layer_norm(xs):
    head_shape, batch_size = xs[0].dim()
    g = dy.ones(head_shape)
    b = dy.zeros(head_shape)
    return [dy.layer_norm(x, g, b) for x in xs]
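# Minimal usage sketch for layer_norm above (shapes illustrative): with g = ones
# and b = zeros, dy.layer_norm standardizes each vector to zero mean and unit variance.
import dynet as dy

dy.renew_cg()
xs = [dy.inputVector([1.0, 2.0, 3.0]), dy.inputVector([2.0, 4.0, 6.0])]
for y in layer_norm(xs):
    print(y.npvalue())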
def get_rlstm_output(self, hypothesis, word2int, P_mat_in, prem_seq_len, improvement):
    lookup = self.params["lookup"]  # get lookup parameters
    hypo_seq = [lookup[word2int.get(i)] for i in hypothesis]  # get embeddings of each word

    # get initial state
    fw_s0 = self.fw_hypo_builder.initial_state()
    bw_s0 = self.bw_hypo_builder.initial_state()

    # will get the last state each time
    fw_s = fw_s0
    bw_s = bw_s0

    # get fw parameter expressions
    fw_At_prev = dy.parameter(self.params["fw_A_t0"])
    fw_Wp = dy.parameter(self.params["fw_Wp"])
    fw_Wm = dy.parameter(self.params["fw_Wm"])
    fw_Wc = dy.parameter(self.params["fw_Wc"])
    fw_Walpha = dy.parameter(self.params["fw_Walpha"])

    bw_At_prev = dy.parameter(self.params["bw_A_t0"])
    bw_Wp = dy.parameter(self.params["bw_Wp"])
    bw_Wm = dy.parameter(self.params["bw_Wm"])
    bw_Wc = dy.parameter(self.params["bw_Wc"])
    bw_Walpha = dy.parameter(self.params["bw_Walpha"])

    # create mask for the attend vector to take into account only the length of the current sequence
    if prem_seq_len < self.max_seq_len:
        mask = dy.concatenate([dy.ones(prem_seq_len), dy.zeros(self.max_seq_len - prem_seq_len)])
        # bw_mask = dy.concatenate([dy.zeros(self.max_seq_len-prem_seq_len), dy.ones(prem_seq_len)])
    else:
        mask = dy.ones(prem_seq_len)
        # bw_mask = dy.ones(prem_seq_len)

    # calculate forward & backward mask
    At_mask_fw = dy.cmult(fw_At_prev, mask)
    At_mask_bw = dy.cmult(bw_At_prev, mask)

    if improvement == "2" or improvement == "3":
        if prem_seq_len < self.max_seq_len:
            bw_mask = dy.concatenate([dy.zeros(self.max_seq_len - prem_seq_len), dy.ones(prem_seq_len)])
        else:
            bw_mask = dy.ones(prem_seq_len)
        At_mask_bw = dy.cmult(bw_At_prev, bw_mask)

    if improvement == "2" or improvement == "3":
        P_mat = P_mat_in[0]
    else:
        P_mat = P_mat_in

    idx = 0
    fw_output_vec = []
    # calculate the new output with the attention of the fw lstm
    for word in hypo_seq:
        fw_s = fw_s.add_input(word)  # add input to the network
        h_t = fw_s.h()[0]  # get the output vector of the current timestep

        # get the output gate value:
        Weights = self.fw_hypo_builder.get_parameter_expressions()
        Wox = dy.select_rows(Weights[0][0], range(self.params_size * 2, self.params_size * 3))
        Woh = dy.select_rows(Weights[0][1], range(self.params_size * 2, self.params_size * 3))
        bo = dy.select_rows(Weights[0][2], range(self.params_size * 2, self.params_size * 3))
        if idx == 0:
            out_gate = dy.logistic(Wox * word + bo)
        else:
            h_t_prev = fw_s.prev().h()[0]
            out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)

        # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
        # m dim: params_size x 1
        mt = P_mat * At_mask_fw

        # get the new out vector
        m_gated = dy.cmult(dy.tanh(mt), out_gate)
        h_t_new = h_t + m_gated
        fw_output_vec.append(h_t_new)

        # calculate alpha
        alpha = dy.colwise_add(fw_Wp * P_mat, fw_Wm * mt)
        if idx > 0:
            s_t_prev = fw_s.prev().s()[0]
            alpha = dy.colwise_add(alpha, fw_Wc * s_t_prev)
        if improvement == "1" or improvement == "3":
            alpha = dy.tanh(alpha)

        # compute the next At
        At_fw = dy.transpose(dy.transpose(fw_Walpha) * alpha)
        At_fw_exp = dy.exp(At_fw)
        At_fw_exp_mask = dy.cmult(At_fw_exp, mask)
        At_mask_fw = dy.cdiv(At_fw_exp_mask, dy.sum_elems(At_fw_exp_mask))

        idx += 1

    if improvement == "2" or improvement == "3":
        P_mat = P_mat_in[1]
    else:
        P_mat = P_mat_in

    idx = 0
    bw_output_vec = []
    # calculate the new output with the attention of the bw lstm
    for word in reversed(hypo_seq):
        bw_s = bw_s.add_input(word)  # add input to the network
        h_t = bw_s.h()[0]  # get the output vector of the current timestep

        # get the output gate value:
        Weights = self.bw_hypo_builder.get_parameter_expressions()
        Wox = dy.select_rows(Weights[0][0], range(self.params_size * 2, self.params_size * 3))
        Woh = dy.select_rows(Weights[0][1], range(self.params_size * 2, self.params_size * 3))
        bo = dy.select_rows(Weights[0][2], range(self.params_size * 2, self.params_size * 3))
        if idx == 0:
            out_gate = dy.logistic(Wox * word + bo)
        else:
            h_t_prev = bw_s.prev().h()[0]
            out_gate = dy.logistic(Wox * word + Woh * h_t_prev + bo)

        # matrix multiplication - [params_size x max_len_seq] x [max_len_seq x 1]
        # m dim: params_size x 1
        mt = P_mat * At_mask_bw

        # get the new out vector
        m_gated = dy.cmult(dy.tanh(mt), out_gate)
        h_t_new = h_t + m_gated
        bw_output_vec.append(h_t_new)

        # calculate alpha
        alpha = dy.colwise_add(bw_Wp * P_mat, bw_Wm * mt)
        if idx > 0:
            s_t_prev = bw_s.prev().s()[0]
            alpha = dy.colwise_add(alpha, bw_Wc * s_t_prev)
        if improvement == "1" or improvement == "3":
            alpha = dy.tanh(alpha)

        # compute the next At
        At_bw = dy.transpose(dy.transpose(bw_Walpha) * alpha)
        At_bw_exp = dy.exp(At_bw)
        At_bw_exp_mask = dy.cmult(At_bw_exp, mask)
        At_mask_bw = dy.cdiv(At_bw_exp_mask, dy.sum_elems(At_bw_exp_mask))

        idx += 1

    return fw_output_vec, bw_output_vec