def _gen(self, node):
    # input word embedding
    wv_t = sigmoid(self.Wemb_np[node.wordid, :])
    # attention over slot-value pairs
    b_t = np.zeros((node.sv.shape[0]))
    for j in range(node.sv.shape[0]):
        b_t[j] = np.dot(
            tanh(np.dot(np.concatenate([wv_t, node.h, node.sv[j]], axis=0),
                        self.Wha_np)),
            self.Vha_np)
    b_t = softmax(b_t)
    sv_emb_t = np.dot(b_t, node.sv)
    da_emb_t = tanh(node.a + sv_emb_t)
    # compute ig, fg, og together and slice it
    gates_t = np.dot(np.concatenate([wv_t, node.h, da_emb_t], axis=0),
                     self.Wgate_np)
    ig = sigmoid(gates_t[:self.dh])
    fg = sigmoid(gates_t[self.dh:self.dh * 2])
    og = sigmoid(gates_t[self.dh * 2:self.dh * 3])
    cx_t = tanh(gates_t[self.dh * 3:])
    # update lstm internal state
    c_t = np.multiply(ig, cx_t) + np.multiply(fg, node.c)
    # obtain new hidden layer
    h_t = np.multiply(og, tanh(c_t))
    # compute output distribution over target words
    o_t = softmax(np.dot(h_t, self.Who_np))
    # make sure we won't sample the unknown word
    o_t[0] = 0.0
    selected_words = np.argsort(o_t)[::-1][:self.beamwidth].tolist()
    # return results
    return selected_words, o_t[selected_words], c_t, h_t
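# The methods in this excerpt call sigmoid / tanh / softmax as free
# functions rather than numpy built-ins; their definitions live elsewhere
# in the module. A minimal sketch, assuming they follow the usual
# numerically-stabilised definitions:

import numpy as np

def sigmoid(x):
    # elementwise logistic function
    return 1.0 / (1.0 + np.exp(-x))

def tanh(x):
    # thin wrapper around numpy's tanh so it can be swapped out uniformly
    return np.tanh(x)

def softmax(x):
    # subtract the max for numerical stability before exponentiating
    e = np.exp(x - np.max(x))
    return e / np.sum(e)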
def _sample_from_posterior(self, belief_t, degree_t, intent_t,
                           masked_source_t, masked_target_t):
    # Posterior
    # response encoding
    target_intent_t = bidirectional_read(self.tfEncoder, self.tbEncoder,
                                         masked_target_t)
    source_intent_t = bidirectional_read(self.sfEncoder, self.sbEncoder,
                                         masked_source_t)
    # posterior parameterisation
    q_logit_t = np.dot(
        tanh(np.dot(belief_t, self.Wq1_backup) +
             np.dot(degree_t, self.Wq2_backup) +
             np.dot(source_intent_t, self.Wq3_backup) +
             np.dot(target_intent_t, self.Wq4_backup)),
        self.Wq5_backup)
    # sampling from a scaled posterior
    sortedIndex = np.argsort(q_logit_t)[::-1][:self.topN]
    topN_posterior_t = softmax(q_logit_t[sortedIndex])
    z_t = sortedIndex[np.argmax(
        np.random.multinomial(n=1, pvals=topN_posterior_t))]
    #z_t = sortedIndex[0]
    z_t = np.expand_dims(z_t, axis=0)
    print sortedIndex[:3]
    print softmax(q_logit_t)[sortedIndex][:3]
    print 'Posterior : %s' % sortedIndex
    print 'probability: %s' % topN_posterior_t
    return z_t, softmax(q_logit_t)
def _gen(self, node):
    # input word embedding
    wv_t = sigmoid(self.Wemb_np[node.wordid, :])
    # compute ig, fg, og together and slice it
    gates_t = np.dot(np.concatenate([wv_t, node.h, node.sv], axis=0),
                     self.Wgate_np)
    ig = sigmoid(gates_t[:self.dh])
    fg = sigmoid(gates_t[self.dh:self.dh * 2])
    og = sigmoid(gates_t[self.dh * 2:self.dh * 3])
    # compute reading gate rg
    rg = sigmoid(np.dot(np.concatenate([wv_t, node.h, node.sv], axis=0),
                        self.Wrgate_np))
    # compute proposed cell value
    cx_t = np.tanh(np.dot(np.concatenate([wv_t, node.h], axis=0),
                          self.Wcx_np))
    # update DA 1-hot vector
    sv_t = np.multiply(rg, node.sv)
    # update lstm internal state
    c_t = np.multiply(ig, cx_t) + \
          np.multiply(fg, node.c) + \
          tanh(np.dot(np.concatenate([node.a, sv_t], axis=0), self.Wfc_np))
    # obtain new hidden layer
    h_t = np.multiply(og, tanh(c_t))
    # compute output distribution over target words
    o_t = softmax(np.dot(h_t, self.Who_np))
    # make sure we won't sample the unknown word
    o_t[0] = 0.0
    selected_words = np.argsort(o_t)[::-1][:self.beamwidth].tolist()
    # return results
    return selected_words, o_t[selected_words], sv_t, c_t, h_t
def track(self, b_jm1, ngs_j, ngt_j):
    # pad a dummy zero row for out-of-range n-gram indices
    Wfbs = np.concatenate([self.Wfbs_backup,
                           np.zeros_like(self.Wfbs_backup[-1:, :])], axis=0)
    Wfbt = np.concatenate([self.Wfbt_backup,
                           np.zeros_like(self.Wfbt_backup[-1:, :])], axis=0)
    # new belief
    g_j = np.zeros(self.dbm1)
    for v in range(self.dbm1):
        ngsidx = ngs_j[v]
        ngtidx = ngt_j[v]
        fembs_v = np.sum(Wfbs[ngsidx, :], axis=0)
        fembt_v = np.sum(Wfbt[ngtidx, :], axis=0)
        g_jv = np.dot(
            self.Whb_backup,
            sigmoid(fembs_v + fembt_v +
                    b_jm1[v] * self.Wrec_backup +
                    b_jm1[-1] * self.Wnon_backup +
                    self.B0_backup))
        g_j[v] = g_jv
    g_j = np.concatenate([g_j, self.B_backup], axis=0)
    b_j = softmax(g_j)
    return b_j
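# The "padding dummy" trick used throughout the trackers: appending a zero
# row to a feature matrix lets position lists include the last (padded)
# index as a harmless no-op, since summing the zero row adds nothing. A
# hypothetical standalone illustration with a 2x3 feature matrix:
W = np.arange(6, dtype=float).reshape(2, 3)
Wpad = np.concatenate([W, np.zeros_like(W[-1:, :])], axis=0)
feat = np.sum(Wpad[[0, 2], :], axis=0)   # row 2 is the zero dummy row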
def track(self, ms_j, mt_jm1, ssrcpos_js, vsrcpos_js, starpos_js,
          vtarpos_js):
    # cnn encoding
    ngms_j, uttms_j = self.sCNN.read(ms_j)
    ngmt_jm1, uttmt_jm1 = self.tCNN.read(mt_jm1)
    # padding dummy vector
    ngms_j = np.concatenate([ngms_j, np.zeros_like(ngms_j[-1:, :])], axis=0)
    ngmt_jm1 = np.concatenate([ngmt_jm1, np.zeros_like(ngmt_jm1[-1:, :])],
                              axis=0)
    # source features
    ssrcemb_js = np.sum(ngms_j[ssrcpos_js, :], axis=0)
    vsrcemb_js = np.sum(ngms_j[vsrcpos_js, :], axis=0)
    src_js = np.concatenate([ssrcemb_js, vsrcemb_js, uttms_j], axis=0)
    # target features
    staremb_js = np.sum(ngmt_jm1[starpos_js, :], axis=0)
    vtaremb_js = np.sum(ngmt_jm1[vtarpos_js, :], axis=0)
    tar_js = np.concatenate([staremb_js, vtaremb_js, uttmt_jm1], axis=0)
    # update g_j
    g_j = np.dot(
        self.Whb_backup,
        sigmoid(np.dot(src_js, self.Wfbs_backup) +
                np.dot(tar_js, self.Wfbt_backup) +
                self.B0_backup))
    # update b_j
    g_j = np.array([g_j, self.B_backup])
    b_j = softmax(g_j)
    return b_j
def _sample_from_prior(self, belief_t, degree_t, intent_t):
    # prior parameterisation
    hidden_t = tanh(np.dot(belief_t, self.Ws1_backup) +
                    np.dot(degree_t, self.Ws2_backup) +
                    np.dot(intent_t, self.Ws3_backup))
    p_logit_t = np.dot(
        tanh(np.dot(hidden_t, self.Wp1_backup) + self.bp1_backup),
        self.Wp2_backup)
    # sampling from prior
    sortedIndex = np.argsort(p_logit_t)[::-1][:self.topN]
    topN_prior_t = softmax(p_logit_t[sortedIndex])
    z_t = sortedIndex[np.argmax(
        np.random.multinomial(n=1, pvals=topN_prior_t))]
    z_t = np.expand_dims(z_t, axis=0)
    # choose the top N samples
    print 'Sample : %s' % z_t
    print 'Prior dist.: %s' % sortedIndex
    print 'probability: %s' % topN_prior_t
    print
    return z_t, softmax(p_logit_t)
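# Both _sample_from_prior and _sample_from_posterior use the same trick:
# keep only the topN logits, renormalise them with a softmax, and draw a
# single sample from that truncated distribution via np.random.multinomial.
# A hypothetical standalone illustration (the logit values are made up):
logits = np.array([2.0, 0.5, 1.0, -1.0])
top = np.argsort(logits)[::-1][:2]      # indices of the 2 largest logits
p = softmax(logits[top])                # renormalise over the shortlist
z = top[np.argmax(np.random.multinomial(n=1, pvals=p))]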
def decide(self, belief_t, degree_t, intent_t, ohidden_tjm1, wemb_tj):
    # embed
    degree_t = tanh(np.dot(degree_t, self.Ws2_backup))
    intent_t = tanh(np.dot(intent_t, self.Ws3_backup))
    # score bias
    score_t = np.dot(ohidden_tjm1, self.Wa1_backup) + \
              np.dot(wemb_tj, self.Wa2_backup) + \
              np.dot(belief_t, self.Wa3_backup)
    # attention mechanism
    atten_t = softmax(np.dot(sigmoid(score_t), self.Va1_backup))
    actEmb = tanh(np.dot(atten_t, belief_t) + degree_t + intent_t)
    return np.expand_dims(actEmb, axis=0)
def track(self, ngs_j, ngt_j):
    # padding dummy
    Wfbs = np.concatenate([self.Wfbs_backup,
                           np.zeros_like(self.Wfbs_backup[-1:, :])], axis=0)
    Wfbt = np.concatenate([self.Wfbt_backup,
                           np.zeros_like(self.Wfbt_backup[-1:, :])], axis=0)
    # new belief
    fembs_v = np.sum(Wfbs[ngs_j, :], axis=0)
    fembt_v = np.sum(Wfbt[ngt_j, :], axis=0)
    g_j = np.dot(self.Whb_backup,
                 sigmoid(fembs_v + fembt_v + self.B0_backup))
    g_j = np.array([g_j, self.B_backup])
    b_j = softmax(g_j)
    return b_j
def track(self, b_jm1, ms_j, mt_jm1, ssrcpos_js, vsrcpos_js, starpos_js,
          vtarpos_js):
    # cnn encoding
    ngms_j, uttms_j = self.sCNN.read(ms_j)
    ngmt_jm1, uttmt_jm1 = self.tCNN.read(mt_jm1)
    # padding dummy vector
    ngms_j = np.concatenate([ngms_j, np.zeros_like(ngms_j[-1:, :])], axis=0)
    ngmt_jm1 = np.concatenate([ngmt_jm1, np.zeros_like(ngmt_jm1[-1:, :])],
                              axis=0)
    # new belief
    g_j = np.zeros(self.dbm1)
    for v in range(self.dbm1):
        # source features
        ssrcemb_jsv = np.sum(ngms_j[ssrcpos_js[v], :], axis=0)
        vsrcemb_jsv = np.sum(ngms_j[vsrcpos_js[v], :], axis=0)
        src_jsv = np.concatenate([ssrcemb_jsv, vsrcemb_jsv, uttms_j], axis=0)
        # target features
        staremb_jsv = np.sum(ngmt_jm1[starpos_js[v], :], axis=0)
        vtaremb_jsv = np.sum(ngmt_jm1[vtarpos_js[v], :], axis=0)
        tar_jsv = np.concatenate([staremb_jsv, vtaremb_jsv, uttmt_jm1],
                                 axis=0)
        # update g_jv value
        g_jv = np.dot(
            self.Whb_backup,
            sigmoid(np.dot(src_jsv, self.Wfbs_backup) +
                    np.dot(tar_jsv, self.Wfbt_backup) +
                    b_jm1[v] * self.Wrec_backup +
                    b_jm1[-1] * self.Wnon_backup +
                    self.B0_backup))
        g_j[v] = g_jv
    # produce new belief b_j
    g_j = np.concatenate([g_j, self.B_backup], axis=0)
    b_j = softmax(g_j)
    return b_j
def _forwardpass(self, n, intent_t, belief_vec_t, degree_t, actEmb_t,
                 scoreTable):
    # forward pass
    in_j = sigmoid(self.Wemb_backup[n.wordid])
    # action embedding
    if self.ply == 'attention':
        actEmb_tj = self.policy.decide(belief_vec_t, degree_t, intent_t,
                                       n.h, in_j)[0]
    else:  # fixed action embedding
        actEmb_tj = actEmb_t
    # syntactic memory cell and gate
    # compute i, f, o, c together and slice it
    bundle_j = np.dot(in_j, self.oWgate_backup) + \
               np.dot(n.h, self.oUgate_backup)
    bundle_aj = np.dot(actEmb_tj, self.Wzh_backup)
    # input gate
    ig = sigmoid(bundle_j[:self.doh] +
                 bundle_aj[:self.doh] +
                 self.b_backup[:self.doh])
    # forget gate (use forget bias or not)
    fg = sigmoid(bundle_j[self.doh:self.doh * 2] +
                 bundle_aj[self.doh:self.doh * 2] +
                 self.b_backup[self.doh:self.doh * 2])
    # output gate
    og = sigmoid(bundle_j[self.doh * 2:self.doh * 3] +
                 bundle_aj[self.doh * 2:self.doh * 3] +
                 self.b_backup[self.doh * 2:self.doh * 3])
    # proposed memory cell
    # reading gate, memory cell, hidden layer
    if self.struct == 'lstm_cond':  # reading gate control signal
        rg = sigmoid(bundle_j[self.doh * 4:self.doh * 5] +
                     bundle_aj[self.doh * 4:self.doh * 5] +
                     self.b_backup[self.doh * 3:])
        cx_j = tanh(bundle_j[self.doh * 3:self.doh * 4])
        oc_j = np.multiply(ig, cx_j) + \
               np.multiply(fg, n.c) + \
               np.multiply(rg, tanh(bundle_aj[self.doh * 3:self.doh * 4]))
        oh_j = np.multiply(og, tanh(oc_j))
        o_j = softmax(np.dot(oh_j, self.Who_backup))
    elif self.struct == 'lstm_mix':  # two signals
        rg = sigmoid(bundle_j[self.doh * 4:self.doh * 5] +
                     bundle_aj[self.doh * 4:self.doh * 5] +
                     self.b_backup[self.doh * 3:])
        cx_j = tanh(bundle_j[self.doh * 3:self.doh * 4])
        oc_j = np.multiply(ig, cx_j) + \
               np.multiply(fg, n.c)
        oh_j = np.multiply(og, tanh(oc_j)) + \
               np.multiply(rg, tanh(bundle_aj[self.doh * 3:self.doh * 4]))
        o_j = softmax(np.dot(oh_j, self.Who_backup))
    elif self.struct == 'lstm_lm':  # lm style
        cx_j = tanh(bundle_j[self.doh * 3:self.doh * 4] +
                    bundle_aj[self.doh * 3:self.doh * 4])
        oc_j = np.multiply(ig, cx_j) + \
               np.multiply(fg, n.c)
        oh_j = np.multiply(og, tanh(oc_j))
        o_j = softmax(np.dot(oh_j, self.Who_backup))
    else:
        sys.exit('[ERROR]: Unseen decoder structure ' + self.struct)
    # compute output distribution, logp, and sample
    # make sure we won't sample the unknown word
    o_j[0] = 0.0
    selected_words = np.argsort(o_j)[::-1][:self.beamwidth]
    # expand nodes and add additional reward
    nextnodes = []
    for wid in selected_words:  # ignore <unk> token
        # loglikelihood of current word
        logp = np.log10(o_j[wid])
        # update repetition record for the new node
        new_record = deepcopy(n.record)
        if new_record['s'].has_key(wid):
            new_record['s'][wid] += 1
        if new_record['v'].has_key(wid):
            new_record['v'][wid] += 1
        # create new node and score it
        node = BeamSearchNode(oh_j, oc_j, n, wid,
                              n.logp + logp, n.leng + 1, new_record)
        # store nodes
        nextnodes.append(
            (-node.eval(self.repeat_penalty, self.token_reward,
                        scoreTable, self.alpha), node))
    return nextnodes
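# BeamSearchNode is referenced above but not defined in this excerpt. A
# minimal sketch reconstructed purely from how _forwardpass uses it; the
# real class and its eval() scoring live elsewhere in the repo, so the
# scoring rule below is a placeholder assumption, not the original:
class BeamSearchNode(object):

    def __init__(self, h, c, prevNode, wordid, logp, leng, record):
        self.h = h                # decoder hidden state at this node
        self.c = c                # decoder memory cell at this node
        self.prevNode = prevNode  # parent node, for backtracking a hypothesis
        self.wordid = wordid      # word chosen at this step
        self.logp = logp          # accumulated log-probability
        self.leng = leng          # hypothesis length so far
        self.record = record     # per-word repetition counts ('s'/'v' dicts)

    def eval(self, repeat_penalty, token_reward, scoreTable, alpha):
        # placeholder score: length-normalised log-probability; the original
        # additionally folds in repeat_penalty, token_reward and scoreTable
        return self.logp / float(max(self.leng, 1))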