def calc_sent_loss(sent):
    """Build the negative-sampling loss expression for one sentence.

    Renews the DyNet computation graph, then scores the embedding at every
    position against its context window (positives) and against randomly
    sampled words (negatives).

    :param sent: sequence of word ids used to index ``W_w_p``
    :return: expression summing the per-position losses
    """
    dy.renew_cg()

    word_embs = [W_w_p[word_id] for word_id in sent]

    # A single draw supplies K negatives for each of the 2*N context slots at
    # every position of the sentence.
    sampled_negatives = np.random.choice(
        nwords, size=2 * N * K * len(word_embs), replace=True,
        p=word_probabilities)

    position_losses = []
    for pos, center_emb in enumerate(word_embs):
        lo = pos * K * 2 * N
        hi = (pos + 1) * K * 2 * N
        negatives = sampled_negatives[lo:hi]

        # Context ids on both sides; slots outside the sentence fall back to S.
        left_context = [sent[k] if k >= 0 else S
                        for k in range(pos - N, pos)]
        right_context = [sent[k] if k < len(sent) else S
                         for k in range(pos + 1, pos + N + 1)]
        positives = left_context + right_context

        neg_scores = dy.dot_product(
            center_emb, dy.lookup_batch(W_c_p, negatives))
        neg_loss = -dy.log(dy.logistic(-neg_scores))
        pos_scores = dy.dot_product(
            center_emb, dy.lookup_batch(W_c_p, positives))
        pos_loss = -dy.log(dy.logistic(pos_scores))

        position_losses.append(
            dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))

    return dy.esum(position_losses)
def GetQDScore(self, qwords, qreps, dwords, dreps, extra):
    """Score a query against a document using per-term attention.

    Every query term attends over the document term representations; the
    attended summary is multiplied element-wise with the query term and fed
    through a two-layer scorer. The per-term scores are then mixed with
    softmax gates computed from the query representations and
    ``self.idf_val``, and combined with ``extra`` in a final linear layer.

    :param qwords: query words, aligned with ``qreps``
    :param qreps: query term representations (DyNet expressions)
    :param dwords: document words (not used by this method)
    :param dreps: document term representations (DyNet expressions)
    :param extra: expression concatenated with the pooled term score before
        the final layer
    :return: expression holding the final score
    """
    # Parameter expressions do not depend on the query term, so they are
    # loaded once instead of once per term.
    W_gate = self.W_gate.expr()
    b_term = self.b_term.expr()
    W_term = self.W_term.expr()
    b_term2 = self.b_term2.expr()
    W_term2 = self.W_term2.expr()

    gate_logits = [
        W_gate * dy.concatenate([qv, dy.constant(1, self.idf_val(qw))])
        for qv, qw in zip(qreps, qwords)
    ]
    qgates = dy.softmax(dy.concatenate(gate_logits))

    qscores = []
    for qrep in qreps:
        # Soft-align the query term with the document terms.
        att_scores = [dy.dot_product(qrep, drep) for drep in dreps]
        att_probs = dy.softmax(dy.concatenate(att_scores))
        doc_rep = dy.esum([v * p for p, v in zip(att_probs, dreps)])

        input_vec = dy.cmult(qrep, doc_rep)
        layer = utils.leaky_relu(b_term + W_term * input_vec)
        qscores.append(b_term2 + W_term2 * layer)

    # Final scores and ultimate classifier.
    qterm_score = dy.dot_product(dy.concatenate(qscores), qgates)
    return (self.b_final.expr() +
            self.W_final.expr() * dy.concatenate([qterm_score, extra]))
def beam_search(self, char_seq, truth=None, mu=0.):
    """Segment ``char_seq`` into words with beam search.

    ``agenda[i]`` is the beam of partial segmentations that cover the first
    ``i`` characters. A hypothesis ending at ``idx`` is built by appending a
    word of length ``wlen`` to a hypothesis from ``agenda[idx - wlen]``.

    :param char_seq: sequence of characters to segment
    :param truth: optional reference word lengths; when given, a margin of
        ``mu * wlen`` is added to every candidate word whose length differs
        from ``truth[idx - 1]``
    :param mu: margin scale, only used when ``truth`` is given
    :return: with ``truth``, the score expression of the best hypothesis in
        the last agenda; otherwise the list of agendas
    """
    beam_size = self.options['beam_size']
    start_agenda = Agenda(beam_size)
    init_state = self.params['lstm'].initial_state().add_input(
        self.param_exprs['<bos>'])
    init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() +
                     self.param_exprs['pb'])
    init_score = dy.scalarInput(0.)
    start_agenda.push(Sentence(score=init_score.scalar_value(),
                               score_expr=init_score,
                               LSTMState=init_state,
                               y=init_y,
                               prevState=None,
                               wlen=None))
    agenda = [start_agenda]

    # From left to right, character by character.
    for idx, _ in enumerate(char_seq, 1):
        now = Agenda(self.options['beam_size'])
        # `range` instead of `xrange`: works on both Python 2 and Python 3.
        for wlen in range(1, min(idx, self.options['max_word_len']) + 1):
            # Candidate word vector for the last `wlen` characters.
            word = self.word_repr(char_seq[idx - wlen:idx])
            word_score = dy.dot_product(word, self.param_exprs['U'])
            # Join the word to every hypothesis that ends where it starts.
            for sent in agenda[idx - wlen]:
                if truth is not None:
                    margin = dy.scalarInput(
                        mu * wlen if truth[idx - 1] != wlen else 0.)
                    score = (margin + sent.score_expr +
                             dy.dot_product(sent.y, word) + word_score)
                else:
                    score = (sent.score_expr +
                             dy.dot_product(sent.y, word) + word_score)

                # Only extend the LSTM for hypotheses the beam will keep.
                if now.happy_with(score.scalar_value()):
                    new_state = sent.LSTMState.add_input(word)
                    new_y = dy.tanh(
                        self.param_exprs['pW'] * new_state.output() +
                        self.param_exprs['pb'])
                    now.push(Sentence(score=score.scalar_value(),
                                      score_expr=score,
                                      LSTMState=new_state,
                                      y=new_y,
                                      prevState=sent,
                                      wlen=wlen))
        agenda.append(now)

    if truth is not None:
        return agenda[-1].max().score_expr
    return agenda
def __call__(self, x, y):
    """Return ``bias + x_bias . x + y_bias . y + U(x, y)``.

    :param x: first input, passed to ``dot_product`` and ``self.U``
    :param y: second input, passed to ``dot_product`` and ``self.U``
    :return: the sum of the bias, the two linear terms and ``self.U(x, y)``
    """
    score = self.bias + dot_product(self.x_bias, x)
    score = score + dot_product(self.y_bias, y)
    return score + self.U(x, y)
def decomp_attend(self, vecsA, vecsB):
    """Score two vector sequences with attend / compare / aggregate steps.

    Attend: ``expE[i][j] = exp(Fq[i] . Fc[j])``. ``beta[i]`` is the average
    of ``Fc`` weighted by row ``i`` of ``expE``; ``alpha[j]`` is the average
    of ``Fq`` weighted by column ``j`` of ``expE``.
    Compare: each vector is concatenated with its aligned summary and passed
    through a logistic.
    Aggregate: both sides are summed and the two sums are dot-multiplied.

    :param vecsA: list of expressions for the first sequence (``Fq``)
    :param vecsB: list of expressions for the second sequence (``Fc``)
    :return: scalar expression ``v1 . v2``
    """
    # The original word vectors are used directly; no transform F is applied.
    Fq = vecsA
    Fc = vecsB
    rows = range(len(Fq))
    cols = range(len(Fc))

    expE = [[dt.exp(dt.dot_product(fq, fc)) for fc in Fc] for fq in Fq]

    # Reciprocals of the row sums and of the column sums of expE.
    invSumExpEi = [
        dt.pow(dt.esum(expE[i]), dt.scalarInput(-1)) for i in rows
    ]
    invSumExpEj = [
        dt.pow(dt.esum([expE[i][j] for i in rows]), dt.scalarInput(-1))
        for j in cols
    ]

    # beta[i]: the part of Fc that is softly aligned to Fq[i].
    beta = [
        dt.esum([Fc[j] * expE[i][j] for j in cols]) * invSumExpEi[i]
        for i in rows
    ]

    # alpha[j]: the part of Fq that is softly aligned to Fc[j]. The weighted
    # vectors must come from Fq; weighting Fc[j] itself would cancel against
    # the normaliser and return Fc[j] unchanged.
    alpha = [
        dt.esum([Fq[i] * expE[i][j] for i in rows]) * invSumExpEj[j]
        for j in cols
    ]

    # Compare (a logistic stands in for the comparison function G).
    v1i = [dt.logistic(dt.concatenate([Fq[i], beta[i]])) for i in rows]
    v2j = [dt.logistic(dt.concatenate([Fc[j], alpha[j]])) for j in cols]

    # Aggregate
    v1 = dt.esum(v1i)
    v2 = dt.esum(v2j)
    return dt.dot_product(v1, v2)
def truth_score(self, word_seq):
    """Score a given segmentation with the LSTM language model.

    :param word_seq: sequence of words, each accepted by ``self.word_repr``
    :return: scalar expression summing, for every word, the dot product of
        the word vector with the prediction made from the preceding hidden
        state and the dot product of the word vector with ``U``
    """
    exprs = self.param_exprs
    inputs = [exprs['<bos>']]
    inputs.extend([self.word_repr(word) for word in word_seq])

    hidden_states = self.params['lstm'].initial_state().transduce(inputs)

    total = dy.scalarInput(0.)
    # Each hidden state is paired with the word that follows it.
    for hidden, word_vec in zip(hidden_states[:-1], inputs[1:]):
        predicted = dy.tanh(exprs['pW'] * hidden + exprs['pb'])
        total = (total + dy.dot_product(predicted, word_vec) +
                 dy.dot_product(word_vec, exprs['U']))
    return total
def greedy_search(self, char_seq, truth=None, mu=0.):
    """Segment ``char_seq`` greedily, keeping one hypothesis per prefix.

    ``agenda[i]`` is the best-scoring ``Sentence`` covering the first ``i``
    characters. When ``truth`` is given, dropout is applied to the character
    and word vectors, a margin of ``mu * wlen`` is added to candidate words
    whose length differs from ``truth[idx - 1]``, and the hypothesis that
    follows the reference lengths is tracked next to the best one.

    :param char_seq: sequence of characters to segment
    :param truth: optional reference word lengths indexed by character
    :param mu: margin scale, only used when ``truth`` is given
    :return: without ``truth``, the list ``agenda``. With ``truth``, the
        best score expression minus the golden score expression, returned
        as soon as the best hypothesis is not golden at a position where
        ``truth[idx - 1] > 0``, or after the last character otherwise.
    """
    training = truth is not None
    exprs = self.param_exprs

    start_state = self.params['lstm'].initial_state().add_input(
        exprs['<bos>'])
    start_y = dy.tanh(exprs['pW'] * start_state.output() + exprs['pb'])
    zero = dy.scalarInput(0.)
    root = Sentence(score=zero.scalar_value(), score_expr=zero,
                    LSTMState=start_state, y=start_y, prevState=None,
                    wlen=None, golden=True)

    # Character embeddings, with dropout only when a reference is supplied.
    if training:
        cembs = [
            dy.dropout(dy.lookup(self.params['embed'], char),
                       self.options['dropout_rate'])
            for char in char_seq
        ]
    else:
        cembs = [dy.lookup(self.params['embed'], char) for char in char_seq]

    agenda = [root]

    # From left to right, character by character.
    for idx, _ in enumerate(char_seq, 1):
        best = None
        longest = min(idx, self.options['max_word_len'])
        for wlen in range(1, longest + 1):
            # Candidate: hypothesis ending at `start` plus the word after it.
            start = idx - wlen
            word = self.word_repr(char_seq[start:idx], cembs[start:idx])
            prev = agenda[start]

            if training:
                word = dy.dropout(word, self.options['dropout_rate'])

            word_score = dy.dot_product(word, exprs['U'])

            if training:
                is_golden = prev.golden and truth[idx - 1] == wlen
                margin = dy.scalarInput(
                    mu * wlen if truth[idx - 1] != wlen else 0.)
                score = (margin + prev.score_expr +
                         dy.dot_product(prev.y, word) + word_score)
            else:
                is_golden = False
                score = (prev.score_expr +
                         dy.dot_product(prev.y, word) + word_score)

            improves = (best is None or best.score < score.scalar_value())
            if not (is_golden or improves):
                continue

            next_state = prev.LSTMState.add_input(word)
            next_y = dy.tanh(exprs['pW'] * next_state.output() + exprs['pb'])
            candidate = Sentence(score=score.scalar_value(),
                                 score_expr=score, LSTMState=next_state,
                                 y=next_y, prevState=prev, wlen=wlen,
                                 golden=is_golden)
            if improves:
                best = candidate
            if is_golden:
                golden_sent = candidate

        agenda.append(best)
        # Stop early once the search has left the reference segmentation.
        if training and truth[idx - 1] > 0 and (not best.golden):
            return (best.score_expr - golden_sent.score_expr)

    if training:
        return (best.score_expr - golden_sent.score_expr)

    return agenda
def truth_score(self, word_seq):
    """Return the model score expression of the segmentation ``word_seq``.

    The word vectors, preceded by the ``<bos>`` expression, are run through
    the LSTM. Every word is scored against the prediction computed from the
    hidden state before it, plus a term from the vector ``U``.

    :param word_seq: sequence of words, each accepted by ``self.word_repr``
    :return: scalar expression with the accumulated score
    """
    word_vecs = [self.word_repr(word) for word in word_seq]
    wembs = [self.param_exprs['<bos>']] + word_vecs

    lstm_start = self.params['lstm'].initial_state()
    hidden_states = lstm_start.transduce(wembs)

    score = dy.scalarInput(0.)
    for prev_hidden, word_vec in zip(hidden_states[:-1], wembs[1:]):
        prediction = dy.tanh(
            self.param_exprs['pW'] * prev_hidden + self.param_exprs['pb'])
        context_term = dy.dot_product(prediction, word_vec)
        word_term = dy.dot_product(word_vec, self.param_exprs['U'])
        score = score + context_term + word_term
    return score
def __call__(self, x, y):
    """Compute a bilinear score of ``x`` and ``y`` with linear and bias terms.

    The scalar ``x^T W y + w_x . x + w_y . y`` is placed in the last slot of
    a vector whose other ``n_out - 1`` slots are zero, and ``b`` is added.

    :param x: first input expression
    :param y: second input expression
    :return: expression of the padded score plus bias
    """
    W = dy.parameter(self.W)
    w_x = dy.parameter(self.w_x)
    w_y = dy.parameter(self.w_y)
    b = dy.parameter(self.b)

    score = dy.transpose(x) * W * y
    score = score + dy.dot_product(w_x, x)
    score = score + dy.dot_product(w_y, y)

    # Zero-fill every output slot except the last one, which gets the score.
    zero_slots = [dy.scalarInput(0)] * (self.n_out - 1)
    padded = dy.concatenate(zero_slots + [score])
    return padded + b
def beam_search(self, char_seq, truth=None, mu=0.):
    """Run beam search over word segmentations of ``char_seq``.

    The beam for position ``idx`` is filled by extending hypotheses from the
    beams at ``idx - wlen`` with a word made of the last ``wlen`` characters,
    for every admissible ``wlen``.

    :param char_seq: sequence of characters to segment
    :param truth: optional reference word lengths; candidates whose length
        differs from ``truth[idx - 1]`` get a margin of ``mu * wlen`` added
    :param mu: margin scale, only used when ``truth`` is given
    :return: the best final score expression when ``truth`` is given,
        otherwise the list of agendas (one per prefix length)
    """
    exprs = self.param_exprs
    options = self.options

    first_state = self.params['lstm'].initial_state().add_input(
        exprs['<bos>'])
    first_y = dy.tanh(exprs['pW'] * first_state.output() + exprs['pb'])
    first_score = dy.scalarInput(0.)

    start_agenda = Agenda(options['beam_size'])
    start_agenda.push(
        Sentence(score=first_score.scalar_value(),
                 score_expr=first_score,
                 LSTMState=first_state,
                 y=first_y,
                 prevState=None,
                 wlen=None))
    agenda = [start_agenda]

    # From left to right, character by character.
    for idx, _ in enumerate(char_seq, 1):
        now = Agenda(options['beam_size'])
        # `range` replaces `xrange`, which does not exist on Python 3.
        for wlen in range(1, min(idx, options['max_word_len']) + 1):
            # Candidate word vector.
            word = self.word_repr(char_seq[idx - wlen:idx])
            word_score = dy.dot_product(word, exprs['U'])
            for sent in agenda[idx - wlen]:
                # Join segmentation: previous hypothesis + candidate word.
                if truth is not None:
                    margin = dy.scalarInput(
                        mu * wlen if truth[idx - 1] != wlen else 0.)
                    score = (margin + sent.score_expr +
                             dy.dot_product(sent.y, word) + word_score)
                else:
                    score = (sent.score_expr +
                             dy.dot_product(sent.y, word) + word_score)

                if not now.happy_with(score.scalar_value()):
                    continue

                new_state = sent.LSTMState.add_input(word)
                new_y = dy.tanh(
                    exprs['pW'] * new_state.output() + exprs['pb'])
                now.push(
                    Sentence(score=score.scalar_value(),
                             score_expr=score,
                             LSTMState=new_state,
                             y=new_y,
                             prevState=sent,
                             wlen=wlen))
        agenda.append(now)

    if truth is not None:
        return agenda[-1].max().score_expr
    return agenda
def test_update(self):
    """Check loss value, gradients and one trainer update on the fixtures."""
    ones = np.ones((10, 10))

    def check_close(actual, expected, shown):
        # `shown` is what the failure message prints; it is not always the
        # same array slice that is being compared.
        self.assertTrue(np.allclose(actual, expected),
                        msg=np.array_str(shown))

    dy.renew_cg()
    a = self.p1 * self.lp1[1]
    b = self.p2 * self.lp2[1]
    loss = dy.dot_product(a, b) / 100
    self.assertEqual(loss.scalar_value(), 10, msg=str(loss.scalar_value()))
    loss.backward()

    # Check the gradients
    check_close(self.p1.grad_as_array(), 0.1 * ones,
                self.p1.grad_as_array())
    check_close(self.p2.grad_as_array(), 0.1 * ones,
                self.p2.grad_as_array())
    check_close(self.lp1.grad_as_array()[1], ones[0],
                self.lp1.grad_as_array())
    check_close(self.lp2.grad_as_array()[1], ones[0],
                self.lp2.grad_as_array())

    self.trainer.update()

    # Check the updated parameters
    check_close(self.p1.as_array(), ones * 0.99, self.p1.as_array())
    check_close(self.p2.as_array(), ones * 0.99, self.p2.as_array())
    check_close(self.lp1.as_array()[1], ones[0] * 0.9,
                self.lp1.as_array()[1])
    check_close(self.lp2.as_array()[1], ones[0] * 0.9,
                self.lp2.as_array())
def intra_sent_attend(self, vecs):
    """Apply attention within one sequence, with a distance-indexed bias.

    Each vector is projected with ``SelIntraFW`` and ``tanh``. The weight
    between positions ``i`` and ``j`` is the exponential of the dot product
    of their projections plus a bias looked up by ``i - j`` shifted by half
    of ``DIST_BIAS_DIM``. Each position receives the weight-normalised sum
    of the original vectors, which is concatenated with the original vector
    and projected with ``SelIntraHW`` and ``tanh``.

    :param vecs: list of vector expressions
    :return: list of expressions, one per input vector
    """
    numVecs = len(vecs)
    positions = range(numVecs)
    # Offset for the bias lookup; constant for the whole call.
    # NOTE(review): sequences longer than the bias table would index outside
    # it - confirm the callers bound the sequence length.
    bias_offset = int(config.d["DIST_BIAS_DIM"] / 2)

    fVecs = [dt.tanh(self.SelIntraFW * v) for v in vecs]

    expE = [
        [
            dt.exp(dt.dot_product(fq, fc) +
                   self.SelIntraBias[i - j + bias_offset])
            for j, fc in enumerate(fVecs)
        ]
        for i, fq in enumerate(fVecs)
    ]

    # Reciprocal of each row sum, used to normalise the weights.
    invSumExpE = [
        dt.pow(dt.esum(expE[i]), dt.scalarInput(-1)) for i in positions
    ]

    alpha = [
        dt.esum([vecs[j] * expE[i][j] for j in positions]) * invSumExpE[i]
        for i in positions
    ]

    return [
        dt.tanh(self.SelIntraHW * dt.concatenate([v, a]))
        for v, a in zip(vecs, alpha)
    ]
def test_gradient_sanity(self):
    """Calling ``gradient_callable`` after forward only raises RuntimeError."""
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    y = dy.inputTensor(self.v2)
    product = dy.dot_product(x, y)
    # Only the forward pass is run; no backward pass happens before the call.
    product.forward()
    with self.assertRaises(RuntimeError):
        gradient_callable(x)
def test_gradient_sanity(self):
    """Expect RuntimeError from ``gradient_callable`` when no backward ran."""
    dy.renew_cg()
    lhs = dy.inputTensor(self.v1)
    rhs = dy.inputTensor(self.v2)
    dy.dot_product(lhs, rhs).forward()
    self.assertRaises(RuntimeError, gradient_callable, lhs)
def __call__(self, htA, HO, transform_flag=True):
    """Summarise the sequence ``HO`` with attention conditioned on ``htA``.

    :param htA: expression the attention is conditioned on
    :param HO: list of expressions, one per sequence position; each is
        reshaped to ``(1, 2 * self.dim_opi)`` below
    :param transform_flag: when true, each element of ``HO`` gets a residual
        ``rectify(W_A * htA + W_O * hiO + b)`` added before scoring
    :return: tuple of the attention-weighted summary reshaped to
        ``(2 * self.dim_opi,)`` and the attention weights as a numpy array
    """
    seq_len = len(HO)
    transformed = []
    raw_weights = []
    for hiO in HO:
        if transform_flag:
            hiO_hat = hiO + dy.rectify(
                self.W_A * htA + self.W_O * hiO + self.b)
        else:
            hiO_hat = hiO
        pair = dy.concatenate([htA, hiO_hat])
        raw_weights.append(dy.tanh(dy.dot_product(self.W_concat, pair)))
        transformed.append(hiO_hat)

    # Stack the per-position vectors as rows of one matrix.
    row_shape = (1, 2 * self.dim_opi)
    HO_hat = dy.concatenate(
        [dy.reshape(ele, d=row_shape) for ele in transformed])

    # Normalise the scores over the sequence (length: seq_len).
    Weights = dy.softmax(dy.concatenate(raw_weights))
    Weights_np = Weights.npvalue()

    ho_summary_t = dy.reshape(Weights, (1, seq_len)) * HO_hat
    return dy.reshape(ho_summary_t, (2 * self.dim_opi,)), Weights_np
def recurrence(self, xt, hmtm1, cmtm1, h_tilde_tm1, dropout_flag):
    """
    One step of an LSTM with truncated self-attention over its memory
    :param xt: current input, shape: (n_in)
    :param hmtm1: hidden memory [htm1, ..., h1], shape: (n_steps, n_out)
    :param cmtm1: cell memory, shape: (n_steps, n_out)
    :param h_tilde_tm1: previous hidden summary, shape: (n_out, )
    :param dropout_flag: whether to apply dropout to ``W * xt``
    :return: updated hidden memory, updated cell memory, and the hidden
        summary computed in this step
    """
    n = self.n_out

    # Score every memory slot against the input and the previous summary.
    slot_scores = [
        dy.dot_product(
            self.u,
            dy.tanh(self.W_h * hmtm1[i] + self.W_x * xt +
                    self.W_htilde * h_tilde_tm1))
        for i in range(self.n_steps)
    ]
    # normalize the attention score
    score = dy.softmax(dy.concatenate(slot_scores))

    # Attention-weighted summaries of the hidden and cell memories.
    h_tilde_t = dy.reshape(dy.transpose(score) * hmtm1, d=(n,))
    c_tilde_t = dy.transpose(score) * cmtm1

    Wx = self.W * xt
    if dropout_flag:
        # perform partial dropout over the lstm
        Wx = dy.dropout(Wx, self.dropout_rate)
    Uh = self.U * h_tilde_t

    # Pre-activations of the gates and the candidate, shape: (4*n_out)
    pre = Wx + Uh + self.b
    it = dy.logistic(pre[:n])
    ft = dy.logistic(pre[n:2 * n])
    ot = dy.logistic(pre[2 * n:3 * n])
    c_hat = dy.tanh(pre[3 * n:])

    ct = dy.cmult(ft, dy.reshape(c_tilde_t, d=(n,))) + dy.cmult(it, c_hat)
    ht = dy.cmult(ot, dy.tanh(ct))

    # Drop the first memory row and append the new state as the last row.
    hmt = dy.concatenate([hmtm1[1:], dy.reshape(ht, (1, n))])
    cmt = dy.concatenate([cmtm1[1:], dy.reshape(ct, (1, n))])
    return hmt, cmt, h_tilde_t
def recurrence(self, xt, hmtm1, h_history_tm1, dropout_flag):
    """
    One step of a GRU-style cell with attention over its hidden memory
    :param xt: input vector at the time step t
    :param hmtm1: hidden memories in previous n_steps steps
    :param h_history_tm1: previous history summary
    :param dropout_flag: whether to apply dropout to the input matrices W_x*
    :return: updated hidden memory and the history summary of this step
    """
    if dropout_flag:
        # perform partial dropout, i.e., add dropout over the matrices W_x*
        def input_weight(matrix):
            return dy.dropout(matrix, self.dropout_rate)
    else:
        def input_weight(matrix):
            return matrix

    slot_scores = [
        dy.dot_product(
            self.u,
            dy.tanh(self.W_h * hmtm1[i] + self.W_x * xt +
                    self.W_htilde * h_history_tm1))
        for i in range(self.n_steps)
    ]
    # normalize the attention score
    score = dy.softmax(dy.concatenate(slot_scores))

    # shape: (1, n_out), history of [h[t-n_steps-1], ..., h[t-2]]
    h_history_t = dy.reshape(dy.transpose(score) * hmtm1[:-1],
                             d=(self.n_out,))
    htm1 = hmtm1[-1]
    h_tilde_t = htm1 + dy.rectify(h_history_t)

    rt = dy.logistic(input_weight(self.W_xr) * xt +
                     self.W_hr * h_tilde_t + self.br)
    zt = dy.logistic(input_weight(self.W_xz) * xt +
                     self.W_hz * h_tilde_t + self.bz)
    ht_hat = dy.tanh(input_weight(self.W_xh) * xt +
                     self.W_hh * dy.cmult(rt, h_tilde_t) + self.bh)
    ht = dy.cmult(zt, h_tilde_t) + dy.cmult((1.0 - zt), ht_hat)

    # Drop the first memory row and append the new state as the last row.
    hmt = dy.concatenate([hmtm1[1:], dy.reshape(ht, (1, self.n_out))])
    return hmt, h_history_t
def test_update(self):
    """Check loss value, gradients and one trainer update on the fixtures.

    The expected values are the ones this test has always asserted: a loss
    of 10, gradients of 0.1 on ``p1``/``p2`` and 1 on row 1 of the lookup
    parameters, then 0.99 and 0.9 respectively after ``trainer.update()``.
    """
    ones = np.ones((10, 10))

    dy.renew_cg()
    pp1 = dy.parameter(self.p1)
    pp2 = dy.parameter(self.p2)

    a = pp1 * self.lp1[1]
    b = pp2 * self.lp2[1]
    loss = dy.dot_product(a, b) / 100
    self.assertEqual(loss.scalar_value(), 10, msg=str(loss.scalar_value()))
    loss.backward()

    # Gradients
    self.assertTrue(np.allclose(self.p1.grad_as_array(), 0.1 * ones),
                    msg=np.array_str(self.p1.grad_as_array()))
    self.assertTrue(np.allclose(self.p2.grad_as_array(), 0.1 * ones),
                    msg=np.array_str(self.p2.grad_as_array()))
    self.assertTrue(np.allclose(self.lp1.grad_as_array()[1], ones[0]),
                    msg=np.array_str(self.lp1.grad_as_array()))
    self.assertTrue(np.allclose(self.lp2.grad_as_array()[1], ones[0]),
                    msg=np.array_str(self.lp2.grad_as_array()))

    self.trainer.update()

    # Updated parameters
    self.assertTrue(np.allclose(self.p1.as_array(), ones * 0.99),
                    msg=np.array_str(self.p1.as_array()))
    self.assertTrue(np.allclose(self.p2.as_array(), ones * 0.99),
                    msg=np.array_str(self.p2.as_array()))
    self.assertTrue(np.allclose(self.lp1.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp1.as_array()[1]))
    self.assertTrue(np.allclose(self.lp2.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp2.as_array()))
def _get_prob_of_each_word_at_every_pos(self, x, w2i, model_params):
    """Return a softmax over context positions scored against the question.

    :param x: mapping with the keys ``"context"`` and ``"question"``, each a
        sequence of words
    :param w2i: word-to-index mapping, passed through to ``self._word_rep``
    :param model_params: mapping holding the four RNN builders
        ``c_fwdRnn``, ``c_bwdRnn``, ``q_fwdRnn`` and ``q_bwdRnn``
    :return: expression with one probability per context position
    """
    def embed(words):
        return [self._word_rep(w, w2i, model_params) for w in words]

    context = x["context"]
    question = x["question"]

    # encode the context
    c_fwd_start = model_params["c_fwdRnn"].initial_state()
    c_bwd_start = model_params["c_bwdRnn"].initial_state()
    c_wemb = embed(context)
    c_fwd_states = c_fwd_start.transduce(c_wemb)
    c_bwd_states = c_bwd_start.transduce(reversed(c_wemb))

    # Bidirectional state per context position; the backward outputs are
    # reversed so that both directions line up by position.
    c_bi = [
        dy.concatenate([fwd, bwd])
        for fwd, bwd in zip(c_fwd_states, reversed(c_bwd_states))
    ]

    # encode the question, keeping only the final state of each direction
    q_fwd_start = model_params["q_fwdRnn"].initial_state()
    q_bwd_start = model_params["q_bwdRnn"].initial_state()
    q_wemb = embed(question)
    q_fwd_last = q_fwd_start.transduce(q_wemb)[-1]
    q_bwd_last = q_bwd_start.transduce(reversed(q_wemb))[-1]
    q_bi = dy.concatenate([q_fwd_last, q_bwd_last])

    # for each word in the context, calculate its probability to be the
    # answer
    position_scores = [
        dy.dot_product(c_bi[i], q_bi) for i in range(len(context))
    ]
    return dy.softmax(dy.concatenate(position_scores))
def build_graph(self, x):
    """Build the logit expression: three conv branches, max-pool, linear.

    :param x: input expression whose ``dim()`` is ``((n, d), batch)``
    :return: expression ``dot_product(pool, W) + b``
    """
    branches = (
        ('conv_W_1', 'conv_b_1', 'channel_1'),
        ('conv_W_2', 'conv_b_2', 'channel_2'),
        ('conv_W_3', 'conv_b_3', 'channel_3'),
    )
    filters = [
        (dy.parameter(self.params[w_key]), dy.parameter(self.params[b_key]))
        for w_key, b_key, _ in branches
    ]
    W = dy.parameter(self.params['W'])
    b = dy.parameter(self.params['b'])

    (n, d), _ = x.dim()
    x = dy.reshape(x, (1, n, d))

    # One-dimensional convolutional network
    conv_outputs = [
        dy.tanh(dy.conv2d_bias(x, conv_W, conv_b, (1, 1), is_valid=False))
        for conv_W, conv_b in filters
    ]
    pooled = [
        dy.max_dim(dy.reshape(conv, (n, self.options[channel_key])))
        for conv, (_, _, channel_key) in zip(conv_outputs, branches)
    ]

    # Fully connected classification
    pool = dy.concatenate(pooled, 0)
    logit = dy.dot_product(pool, W) + b
    return logit
def get_morph_analysis_scores(self, morph_analysis_representations,
                              context_representations):
    """Score the candidate analyses of every word against its context.

    Implements (10) and (11) in Shen et al. "The Role of Context ...": the
    context vector goes through an affine layer and ``tanh``, and each
    candidate analysis is scored by its dot product with the result.

    :param morph_analysis_representations: for each word position, a list
        of expressions, one per candidate analysis
    :param context_representations: one context expression per word position
    :return: list with one softmax expression per word position
    """
    def transform_context(context):
        return dynet.tanh(
            dynet.affine_transform([
                self.transform_context_layer_b.expr(),
                self.transform_context_layer_W.expr(), context
            ]))

    if self.parameters['debug'] == 1:
        print(("morph_analysis_representations",
               morph_analysis_representations))
        print(("context_representations", context_representations))

    morph_analysis_scores = []
    for word_pos, context in enumerate(context_representations):
        # The transformed context is identical for every candidate of this
        # word, so it is built once instead of once per candidate.
        transformed = transform_context(context)
        candidate_scores = [
            dynet.dot_product(morph_analysis_representation, transformed)
            for morph_analysis_representation
            in morph_analysis_representations[word_pos]
        ]
        morph_analysis_scores.append(
            dynet.softmax(dynet.concatenate(candidate_scores)))
    return morph_analysis_scores
def test_update(self):
    """Verify the loss, the gradients and the effect of one trainer update.

    Asserts a loss of 10, gradients of 0.1 on ``p1``/``p2`` and 1 on row 1
    of both lookup parameters, and the values 0.99 and 0.9 respectively
    after ``trainer.update()``.
    """
    ones = np.ones((10, 10))

    dy.renew_cg()
    pp1 = dy.parameter(self.p1)
    pp2 = dy.parameter(self.p2)

    a = pp1 * self.lp1[1]
    b = pp2 * self.lp2[1]
    loss = dy.dot_product(a, b) / 100
    self.assertEqual(loss.scalar_value(), 10, msg=str(loss.scalar_value()))
    loss.backward()

    # Dense parameters: compare the whole gradient array.
    for param in (self.p1, self.p2):
        self.assertTrue(np.allclose(param.grad_as_array(), 0.1 * ones),
                        msg=np.array_str(param.grad_as_array()))
    # Lookup parameters: only row 1 took part in the loss.
    for lookup in (self.lp1, self.lp2):
        self.assertTrue(np.allclose(lookup.grad_as_array()[1], ones[0]),
                        msg=np.array_str(lookup.grad_as_array()))

    self.trainer.update()

    for param in (self.p1, self.p2):
        self.assertTrue(np.allclose(param.as_array(), ones * 0.99),
                        msg=np.array_str(param.as_array()))
    self.assertTrue(np.allclose(self.lp1.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp1.as_array()[1]))
    self.assertTrue(np.allclose(self.lp2.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp2.as_array()))
def test_update(self):
    """Verify the loss, the gradients and the effect of one trainer update."""
    ones = np.ones((10, 10))
    dense_params = (self.p1, self.p2)
    lookup_params = (self.lp1, self.lp2)

    dy.renew_cg()
    a = self.p1 * self.lp1[1]
    b = self.p2 * self.lp2[1]
    loss = dy.dot_product(a, b) / 100
    self.assertEqual(loss.scalar_value(), 10, msg=str(loss.scalar_value()))
    loss.backward()

    # Check the gradients
    for param in dense_params:
        self.assertTrue(np.allclose(param.grad_as_array(), 0.1 * ones),
                        msg=np.array_str(param.grad_as_array()))
    for lookup in lookup_params:
        self.assertTrue(np.allclose(lookup.grad_as_array()[1], ones[0]),
                        msg=np.array_str(lookup.grad_as_array()))

    self.trainer.update()

    # Check the updated parameters
    for param in dense_params:
        self.assertTrue(np.allclose(param.as_array(), ones * 0.99),
                        msg=np.array_str(param.as_array()))
    # The two lookup checks print different slices on failure, so they are
    # kept as separate assertions.
    self.assertTrue(np.allclose(self.lp1.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp1.as_array()[1]))
    self.assertTrue(np.allclose(self.lp2.as_array()[1], ones[0] * 0.9),
                    msg=np.array_str(self.lp2.as_array()))
def score_expression(self, qwVecs, qwAvgVec, qLSTMVec, colnameVec,
                     colWdVecs):
    """Return the four score expressions computed for one column.

    :param qwVecs: question word vectors, passed to ``AvgMaxScore``
    :param qwAvgVec: averaged question vector, passed to ``AvgScore``
    :param qLSTMVec: LSTM question vector, passed to ``AvgScore``
    :param colnameVec: column name vector
    :param colWdVecs: column word vectors, passed to ``AvgMaxScore``
    :return: list ``[prior, max, avg, lstm]`` of score expressions
    """
    prior_score = dt.dot_product(self.ColW, colnameVec)
    max_score = AvgMaxScore(qwVecs, colWdVecs)
    avg_score = AvgScore(qwAvgVec, colnameVec)
    lstm_score = AvgScore(qLSTMVec, colnameVec)
    return [prior_score, max_score, avg_score, lstm_score]
def test_gradient(self):
    """After a full backward pass, the gradient of ``x . y`` w.r.t. ``x`` is ``y``."""
    dy.renew_cg()
    x = dy.inputTensor(self.v1)
    y = dy.inputTensor(self.v2)
    l = dy.dot_product(x, y)
    l.forward()
    l.backward(full=True)
    # The message has one placeholder per argument; with only three, the
    # last two values were silently dropped from the failure output.
    self.assertTrue(
        np.allclose(x.gradient(), self.v2),
        msg="{}\n{}\n{}\n{}\n{}".format(l.value(), x.gradient(), self.v2,
                                        y.gradient(), self.v2))
def determine_coverage_by_name(self, qwVecs, avgVec):
    """Return ``None``; question-coverage scoring is currently disabled.

    The function has always returned ``None`` before doing any work. The
    disabled draft that used to follow the return computed the dot product
    of every question word vector with ``avgVec`` and applied a softmax to
    those values; it could never run and has been removed.

    :param qwVecs: question word vectors (unused while disabled)
    :param avgVec: averaged vector (unused while disabled)
    :return: always ``None``
    """
    return None
def get_all_next(total_p, _, state, interp, last_op_name, last_arg_ref,
                 last_arg_num, num_candidates, expr_nums_pos, expr_vals,
                 trace):
    """Yield every successor of one search item.

    The previous instruction ``(last_op_name, last_arg_num)`` is first
    applied to a copy of the interpreter and to the decoder state, then one
    successor is yielded per valid next instruction. Inputs are not mutated:
    the trace, candidate set, position map and value list are copied before
    they are extended.

    ``decoder`` and ``input_nums_pos`` are read from the enclosing scope.

    :param total_p: accumulated log-probability of the item
    :param _: ignored (the tie-breaker slot of the item)
    :param state: decoder state
    :param interp: interpreter holding the expression under construction
    :param last_op_name: name of the instruction to apply, or ``None`` for
        the initial item; ``'exit'`` yields nothing
    :param last_arg_ref: argument reference of that instruction
    :param last_arg_num: numeric argument of that instruction
    :param num_candidates: set of numbers that may be loaded
    :param expr_nums_pos: mapping from value to the steps that produced it
    :param expr_vals: values of the expressions finished so far
    :param trace: list of ``(op_name, arg_num)`` pairs applied so far
    :return: generator of tuples in the same layout as the parameters
    """
    if last_op_name == 'exit':
        return

    if last_op_name is not None:
        trace = trace + [(last_op_name, last_arg_num)]
        interp = Interpreter(interp)
        end_expr, expr_val = interp.next_op(last_op_name, last_arg_num)
        if end_expr:
            # A finished expression whose value is not a real number ends
            # this branch. Only conversion failures are caught, so that
            # interrupts and unrelated errors still propagate.
            try:
                expr_val = float(expr_val)
            except (TypeError, ValueError, ArithmeticError):
                return
            expr_nums_pos = defaultdict(set, expr_nums_pos)
            expr_nums_pos[expr_val].add(state.step)
            num_candidates = num_candidates | {expr_val, -expr_val}
            expr_vals = expr_vals + [expr_val]
            interp = Interpreter()
        state = state.next_state(expr_val, last_op_name, last_arg_ref)

    p_op = dy.log(state.op_probs()).npvalue()
    for op_id, op_name in decoder.opid2name.items():
        if op_name not in interp.valid_ops:
            continue
        op_p = p_op[op_id]

        if op_name != 'load':
            instruct_p = op_p
            yield (total_p + instruct_p, np.random.uniform(), state, interp,
                   op_name, None, None, num_candidates, expr_nums_pos,
                   expr_vals, trace)
            continue

        # 'load': one successor per candidate number. Each number may come
        # from the prior, the input or an earlier expression, either as is
        # or negated.
        copy_p = state.copy_probs()
        for arg_num in num_candidates:
            from_pos_prior_p, pos_prior_ref = state.from_prior_prob(arg_num)
            from_neg_prior_p, neg_prior_ref = state.from_prior_prob(
                -arg_num, True)
            from_pos_input_p, pos_input_ref = state.from_input_prob(
                input_nums_pos[arg_num])
            from_neg_input_p, neg_input_ref = state.from_input_prob(
                input_nums_pos[-arg_num], True)
            from_pos_exprs_p, pos_exprs_ref = state.from_exprs_prob(
                expr_nums_pos[arg_num])
            from_neg_exprs_p, neg_exprs_ref = state.from_exprs_prob(
                expr_nums_pos[-arg_num], True)

            from_p = dy.concatenate([
                from_pos_prior_p, from_neg_prior_p, from_pos_input_p,
                from_neg_input_p, from_pos_exprs_p, from_neg_exprs_p
            ])
            arg_ref = (dy.concatenate_cols([
                pos_prior_ref, neg_prior_ref, pos_input_ref, neg_input_ref,
                pos_exprs_ref, neg_exprs_ref
            ]) * copy_p)

            instruct_p = (
                op_p + dy.log(dy.dot_product(copy_p, from_p))).value()
            # Skip candidates whose log-probability is infinite.
            if not math.isinf(instruct_p):
                yield (total_p + instruct_p, np.random.uniform(), state,
                       interp, op_name, arg_ref, arg_num, num_candidates,
                       expr_nums_pos, expr_vals, trace)
def __call__(self, dec_state, enc_states):
    """Return softmax attention weights of ``dec_state`` over ``enc_states``.

    :param dec_state: decoder state expression
    :param enc_states: iterable of encoder state expressions
    :return: expression with one weight per encoder state
    """
    w = dy.parameter(self.W)
    # Bilinear score: (W * enc) . dec for every encoder state.
    scores = [
        dy.dot_product(w * enc_state, dec_state) for enc_state in enc_states
    ]
    return dy.softmax(dy.concatenate(scores))
def __call__(self, state_dec, states_enc):
    """Compute normalised attention weights over the encoder states.

    :param state_dec: decoder state expression
    :param states_enc: iterable of encoder state expressions
    :return: softmax expression with one entry per encoder state
    """
    projection = dy.parameter(self.W)

    def score(state_enc):
        return dy.dot_product(projection * state_enc, state_dec)

    raw_weights = [score(state_enc) for state_enc in states_enc]
    return dy.softmax(dy.concatenate(raw_weights))
def __cosine_loss(self, pred, gold):
    """Return ``(1 - cos(pred, gold)) / 2`` as an expression.

    :param pred: predicted vector expression
    :param gold: reference vector expression
    :return: scalar expression of the loss
    """
    norm_product = dy.cmult(dy.l2_norm(pred), dy.l2_norm(gold))
    cosine = dy.cdiv(dy.dot_product(pred, gold), norm_product)
    two = dy.scalarInput(2)
    return dy.cdiv(1 - cosine, two)
def score_expression(self, qwVecs, numWdPos):
    """Score the words next to position ``numWdPos`` against ``self.OpW``.

    The vector used is the following word when ``numWdPos`` is 0, the first
    word when it is 1, and otherwise the average of the two words before
    the position.

    :param qwVecs: list of question word vector expressions
    :param numWdPos: index of the number word in the question
    :return: scalar expression of the dot product with ``self.OpW``
    """
    if numWdPos == 0:
        keyword_vec = qwVecs[numWdPos + 1]
    elif numWdPos == 1:
        keyword_vec = qwVecs[0]
    else:
        preceding = qwVecs[numWdPos - 2:numWdPos]
        keyword_vec = dt.average(preceding)
    return dt.dot_product(keyword_vec, self.OpW)
def calc_sent_loss(sent):
    """Return the negative-sampling loss expression of one sentence.

    A new computation graph is created. For every position the embedding of
    the word is scored against the ``2 * N`` surrounding words (positives)
    and against ``2 * N * K`` sampled words (negatives).

    :param sent: sequence of word ids used to index ``W_w_p``
    :return: expression summing the losses of all positions
    """
    dy.renew_cg()

    emb = [W_w_p[x] for x in sent]

    # Sample K negative words for each predicted word at each position
    all_neg_words = np.random.choice(nwords, size=2*N*K*len(emb),
                                     replace=True, p=word_probabilities)
    chunk = K * 2 * N

    def context_word(offset):
        # Positions outside the sentence are represented by S.
        return sent[offset] if 0 <= offset < len(sent) else S

    all_losses = []
    for i, my_emb in enumerate(emb):
        neg_words = all_neg_words[i * chunk:(i + 1) * chunk]
        window = list(range(i - N, i)) + list(range(i + 1, i + N + 1))
        pos_words = [context_word(offset) for offset in window]

        neg_loss = -dy.log(dy.logistic(
            -dy.dot_product(my_emb, dy.lookup_batch(W_c_p, neg_words))))
        pos_loss = -dy.log(dy.logistic(
            dy.dot_product(my_emb, dy.lookup_batch(W_c_p, pos_words))))
        all_losses.append(dy.sum_batches(neg_loss) + dy.sum_batches(pos_loss))

    return dy.esum(all_losses)
def cross_entropy_loss(self, score, next_word, cur_word):
    """Return the cross-entropy loss, optionally with label smoothing.

    With label smoothing enabled, the target log-probability is weighted by
    ``1 - eps`` and the remaining ``eps`` goes to either the uniform
    distribution (no language model) or the distribution returned by the
    language model for ``cur_word``.

    :param score: unnormalised scores over the vocabulary
    :param next_word: target word id, or a sequence of ids for a batch
    :param cur_word: current word(s), only passed to the language model
    :return: loss expression
    """
    if not self.__ls:
        # The smoothed branch is batched (`pick_batch`), so a sequence of
        # targets must also work here; a single id keeps the original call.
        if hasattr(next_word, '__len__'):
            return dy.pickneglogsoftmax_batch(score, next_word)
        return dy.pickneglogsoftmax(score, next_word)

    log_prob = dy.log_softmax(score)
    target_term = - dy.pick_batch(log_prob, next_word) * (1 - self.__ls_eps)
    if self.__lm is None:
        smoothing_term = dy.mean_elems(log_prob) * self.__ls_eps
    else:
        smoothing_term = dy.dot_product(
            self.__lm.next_expr(cur_word), log_prob) * self.__ls_eps
    return target_term - smoothing_term
def cross_entropy_loss(self, s, nw, cw):
    """Calculate the batched cross-entropy, optionally label-smoothed.

    :param s: unnormalised scores over the vocabulary
    :param nw: target word ids of the batch
    :param cw: current words (not used by this method)
    :return: loss expression
    """
    if not self.ls:
        return dy.pickneglogsoftmax_batch(s, nw)

    log_prob = dy.log_softmax(s)
    target_term = - dy.pick_batch(log_prob, nw) * (1 - self.ls_eps)
    if self.lm is None:
        # Smooth towards the uniform distribution.
        smoothing_term = dy.mean_elems(log_prob) * self.ls_eps
    else:
        # Smooth towards the distribution held in self.lm_e.
        smoothing_term = dy.dot_product(self.lm_e, log_prob) * self.ls_eps
    return target_term - smoothing_term
def bidirect_pass(x, p):
    """
    Feed one sentence through the two-layer biLSTM and score position ``p``
    :param x: a <list> of word indices
    :param p: index of the predicate position in ``x``
    :return: expression produced by the two-layer regression model
    """
    def run_bidirectional(fw_unit, bw_unit, sequence):
        # Run both directions and pair their outputs position by position.
        fw_start = fw_unit.initial_state()
        bw_start = bw_unit.initial_state()
        fw_outs = fw_start.transduce(sequence)
        bw_outs = bw_start.transduce(reversed(sequence))
        return [
            dy.concatenate([f, b]) for f, b in zip(fw_outs, reversed(bw_outs))
        ]

    # convert sequence of ints to sequence of embeddings; the embeddings
    # are looked up with update=False
    input_seq = [dy.lookup(embedding_parameters, i, update=False) for i in x]

    # convert Parameters to Expressions
    v1 = dy.parameter(pv1)
    b1 = dy.parameter(pb1)
    v2 = dy.parameter(pv2)
    b2 = dy.parameter(pb2)

    # first layer, then the second layer on top of its outputs
    second_input_seq = run_bidirectional(fw_RNN_unit, bw_RNN_unit, input_seq)
    bi = run_bidirectional(second_fw_RNN_unit, second_bw_RNN_unit,
                           second_input_seq)

    # hidden_state at the position of predicate
    bi_pred = bi[p]

    # a two-layer regression model
    hidden = dy.tanh(v1 * bi_pred + b1)
    outputs = dy.dot_product(v2, hidden) + b2
    return outputs
def action_in_state_context_bonuses(corpus, state, inputs, W_context_action,
                                    W_action, predict_invalid, past_states,
                                    past_actions):
    """Return a vector with one bonus per action in ``corpus.ACTIONS``.

    Actions that cannot be applied in ``state`` get a bonus of 0. When
    ``predict_invalid`` is true, one extra 0 entry is appended.

    :param corpus: provides ``ACTIONS``, ``valid_actions``, ``take_action``
        and ``embed_action_in_state_context``
    :param state: current state
    :param inputs: list of expressions, concatenated into one input vector
    :param W_context_action: weights applied to the action embedding before
        the dot product with the inputs
    :param W_action: weights applied to the action embedding directly
    :param predict_invalid: whether to append the extra entry
    :param past_states: passed through to the action embedding
    :param past_actions: passed through to the action embedding
    :return: expression concatenating all bonuses
    """
    all_inputs = dy.concatenate(inputs)
    # The scored actions may include ones that are invalid in this state
    # (unconstrained model), so the valid set is computed up front.
    valid_actions = set(corpus.valid_actions(state))

    def bonus_for(action):
        if action not in valid_actions:
            return dy.scalarInput(0)
        next_state = corpus.take_action(state, action)
        embedded_action_sc = dy.inputVector(
            corpus.embed_action_in_state_context(
                action, state, next_state, past_states, past_actions))
        return (dy.dot_product(W_context_action * embedded_action_sc,
                               all_inputs) +
                W_action * embedded_action_sc)

    bonuses = [bonus_for(action) for action in corpus.ACTIONS]
    if predict_invalid:
        bonuses.append(dy.scalarInput(0))
    return dy.concatenate(bonuses)
def test_set_updated(self):
    """Check ``set_updated``/``is_updated`` and that frozen parameters stay put."""
    def expect_flags(p1, p2, lp1, lp2):
        # Assert the update flag of each parameter, in a fixed order.
        pairs = ((self.p1, p1), (self.p2, p2), (self.lp1, lp1),
                 (self.lp2, lp2))
        for param, expected in pairs:
            if expected:
                self.assertTrue(param.is_updated())
            else:
                self.assertFalse(param.is_updated())

    # Freeze two of the four parameters; the others are left untouched.
    self.p2.set_updated(False)
    self.lp1.set_updated(False)
    expect_flags(True, False, False, True)

    # Set all four flags explicitly to the same configuration.
    self.p1.set_updated(True)
    self.p2.set_updated(False)
    self.lp1.set_updated(False)
    self.lp2.set_updated(True)
    expect_flags(True, False, False, True)

    # Flip every flag.
    self.p1.set_updated(False)
    self.p2.set_updated(True)
    self.lp1.set_updated(True)
    self.lp2.set_updated(False)
    expect_flags(False, True, True, False)

    dy.renew_cg()
    a = self.p1 * self.lp1[1]
    b = self.p2 * self.lp2[1]
    loss = dy.dot_product(a, b) / 100
    loss.backward()
    self.trainer.update()

    # The two frozen parameters are compared with arrays of ones.
    ones = np.ones((10, 10))
    self.assertTrue(np.allclose(self.p1.as_array(), ones),
                    msg=np.array_str(self.p1.as_array()))
    self.assertTrue(np.allclose(self.lp2.as_array()[1], ones[0]),
                    msg=np.array_str(self.lp2.as_array()))
# It can be improved by following the speed tricks covered in class:
# 1) Don't repeat operations.
# 2) Minimize the number of operations.
# 3) Minimize the number of CPU-GPU memory copies, make them earlier.
# NOTE: this script is the slow baseline those tricks refer to, so the
# repeated work below is left in place.

# Create the model
model = dy.ParameterCollection()
trainer = dy.SimpleSGDTrainer(model)
W = model.add_parameters((100, 100))

# Create the "training data". The x and y draws alternate, so the order of
# the random draws must not be changed.
x_vecs = []
y_vecs = []
for i in range(10):
    x_vecs.append(np.random.rand(100))
    y_vecs.append(np.random.rand(100))

# Do the processing
for my_iter in range(1000):
    dy.renew_cg()
    total = 0
    # Accumulate y^T W x over every (x, y) pair, x-major.
    for x, y in ((x_vec, y_vec) for x_vec in x_vecs for y_vec in y_vecs):
        x_exp = dy.inputTensor(x)
        y_exp = dy.inputTensor(y)
        total = total + dy.dot_product(W * x_exp, y_exp)
    total.forward()
    total.backward()
    trainer.update()