def beam_search(self, char_seq, truth = None, mu =0.):
    """Beam search over word segmentations of `char_seq`.

    Each agenda[i] holds the beam of partial segmentations covering the first
    i characters.  When `truth` (gold word length at each end position) is
    given, a margin `mu * wlen` is added to wrong candidates (max-margin
    training) and the score expression of the best final hypothesis is
    returned; otherwise the full list of agendas is returned for decoding.
    """
    start_agenda = Agenda(self.options['beam_size'])
    # seed the beam with the <bos> state
    init_state = self.params['lstm'].initial_state().add_input(self.param_exprs['<bos>'])
    init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() + self.param_exprs['pb'])
    init_score = dy.scalarInput(0.)
    start_agenda.push(Sentence(score=init_score.scalar_value(),score_expr=init_score,LSTMState =init_state, y= init_y , prevState = None, wlen=None))
    agenda = [start_agenda]

    for idx, _ in enumerate(char_seq,1): # from left to right, character by character
        now = Agenda(self.options['beam_size'])
        for wlen in xrange(1,min(idx,self.options['max_word_len'])+1): # generate candidate word vectors
            # candidate word = the last `wlen` characters ending at position idx
            word = self.word_repr(char_seq[idx-wlen:idx])
            word_score = dy.dot_product(word,self.param_exprs['U'])
            for sent in agenda[idx-wlen]: # join segmentation
                if truth is not None:
                    # margin is added unless this candidate matches the gold word length
                    margin = dy.scalarInput(mu*wlen if truth[idx-1]!=wlen else 0.)
                    score = margin + sent.score_expr + dy.dot_product(sent.y, word) + word_score
                else:
                    score = sent.score_expr + dy.dot_product(sent.y, word) + word_score
                # happy_with forces evaluation; only extend hypotheses that fit the beam
                if now.happy_with(score.scalar_value()):
                    new_state = sent.LSTMState.add_input(word)
                    new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() + self.param_exprs['pb'])
                    now.push(Sentence(score=score.scalar_value(),score_expr=score,LSTMState=new_state,y=new_y, prevState=sent, wlen=wlen))
        agenda.append(now)

    if truth is not None:
        return agenda[-1].max().score_expr
    return agenda
def expr_for_tree(self, tree):
    """Recursively build the Tree-LSTM expression for `tree`.

    Leaves return word embeddings; unary nodes apply a leaf-input cell
    (no forget gate); binary nodes apply a two-child Tree-LSTM cell.
    Assumes the tree is binarized (at most two children per node).
    """
    if tree.isleaf():
        # unknown words fall back to index 0
        return self.E[self.w2i.get(tree.label,0)]
    if len(tree.children) == 1:
        assert(tree.children[0].isleaf())
        emb = self.expr_for_tree(tree.children[0])
        Wi,Wo,Wu = [dy.parameter(w) for w in self.WS]
        # the forget bias is unused in the unary case
        bi,bo,bu,_ = [dy.parameter(b) for b in self.BS]
        i = dy.logistic(Wi*emb + bi)
        o = dy.logistic(Wo*emb + bo)
        u = dy.tanh( Wu*emb + bu)
        c = dy.cmult(i,u)
        expr = dy.cmult(o,dy.tanh(c))
        return expr
    assert(len(tree.children) == 2),tree.children[0]
    e1 = self.expr_for_tree(tree.children[0])
    e2 = self.expr_for_tree(tree.children[1])
    Ui,Uo,Uu = [dy.parameter(u) for u in self.US]
    # per-child forget-gate weights; both share the same forget bias bf
    Uf1,Uf2 = [dy.parameter(u) for u in self.UFS]
    bi,bo,bu,bf = [dy.parameter(b) for b in self.BS]
    e = dy.concatenate([e1,e2])
    i = dy.logistic(Ui*e + bi)
    o = dy.logistic(Uo*e + bo)
    f1 = dy.logistic(Uf1*e1 + bf)
    f2 = dy.logistic(Uf2*e2 + bf)
    u = dy.tanh( Uu*e + bu)
    # NOTE(review): forget gates multiply the child *hidden* vectors e1/e2,
    # not child cell states — standard in this simplified variant.
    c = dy.cmult(i,u) + dy.cmult(f1,e1) + dy.cmult(f2,e2)
    h = dy.cmult(o,dy.tanh(c))
    expr = h
    return expr
def attend(blstm_outputs, h_t, W_c, v_a, W__a, U__a):
    """MLP (Bahdanau-style) attention with a Luong-style output projection.

    Args:
        blstm_outputs: list of encoder hidden-state expressions.
        h_t: current decoder hidden state.
        W_c: projection for the concatenated [h_t, context].
        v_a, W__a, U__a: attention MLP parameters.

    Returns:
        (h_output, alphas, W__a.value()): the attentional hidden state,
        the attention weights, and the (evaluated) W__a matrix.
    """
    # score each encoder state against the decoder state
    scores = [v_a * pc.tanh(W__a * h_t + U__a * h_input) for h_input in blstm_outputs]
    # normalize scores to attention weights
    alphas = pc.softmax(pc.concatenate(scores))
    # context vector: attention-weighted sum of encoder states
    c = pc.esum([h_input * pc.pick(alphas, j) for j, h_input in enumerate(blstm_outputs)])
    # attentional hidden state h~ (global attention, Luong et al. 2015)
    h_output = pc.tanh(W_c * pc.concatenate([h_t, c]))
    return h_output, alphas, W__a.value()
def __call__(self, x):
    """Apply a 2-layer (optionally 3-layer) MLP to `x` and return softmax probs."""
    W = dy.parameter(self.mw)
    b = dy.parameter(self.mb)
    W2 = dy.parameter(self.mw2)
    b2 = dy.parameter(self.mb2)
    mlp_output = W2 * (dy.tanh(W * x + b)) + b2
    if fDo_3_Layers:
        W3 = dy.parameter(self.mw3)
        b3 = dy.parameter(self.mb3)
        # BUG FIX: original referenced the non-existent `dy.mlpoutput`
        # (AttributeError at runtime); the third layer must consume the
        # second layer's output.
        mlp_output = W3 * (dy.tanh(mlp_output)) + b3
    return dy.softmax(mlp_output)
def generate(sent):
    """Greedily decode a translation of `sent` with a BiLSTM encoder + attention.

    Returns (target token list, attention matrix as a numpy array).
    """
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent
    #get the output of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x,y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]
    src_output = src_outputs[-1]
    #gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    # the source-side attention projection is input-independent: compute once
    fixed_attentional_component = w1_att_src * src_output_matrix
    #generate until a eos tag or max is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)
    for i in range(MAX_SENT_SIZE):
        #feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        probs = (-dy.log_softmax(s)).value()
        # BUG FIX: `probs` holds *negative* log-probabilities, so the most
        # likely word is the minimum; argmax picked the LEAST likely word.
        next_word = np.argmin(probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
def gate_and_next_vecs(self, ht1, ct1, xt):
    """Compute the LSTM gate activations plus the next cell/hidden vectors.

    Returns the gate dict from `gate_vecs` augmented with keys "c" and "h".
    """
    gates = self.gate_vecs(ht1, xt)
    new_cell = dy.cmult(ct1, gates["f"]) + dy.cmult(gates["ctilde"], gates["i"])
    new_hidden = dy.cmult(dy.tanh(new_cell), gates["o"])
    gates.update({"c": new_cell, "h": new_hidden})
    return gates
def calc_loss(sent): dy.renew_cg() # Transduce all batch elements with an LSTM src = sent[0] trg = sent[1] #initialize the LSTM init_state_src = LSTM_SRC_BUILDER.initial_state() #get the output of the first LSTM src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output() #now step through the output sentence all_losses = [] current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)]) prev_word = trg[0] W_sm = dy.parameter(W_sm_p) b_sm = dy.parameter(b_sm_p) for next_word in trg[1:]: #feed the current state into the current_state = current_state.add_input(LOOKUP_TRG[prev_word]) output_embedding = current_state.output() s = dy.affine_transform([b_sm, W_sm, output_embedding]) all_losses.append(dy.pickneglogsoftmax(s, next_word)) prev_word = next_word return dy.esum(all_losses)
def generate(sent):
    """Greedily decode a translation of `sent` with a plain encoder-decoder.

    Returns the list of generated target tokens (stops at EOS or MAX_SENT_SIZE).
    """
    dy.renew_cg()
    src = sent
    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()
    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output()
    #generate until a eos tag or max is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    for i in range(MAX_SENT_SIZE):
        #feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        probs = (-dy.log_softmax(s)).value()
        # BUG FIX: `probs` holds *negative* log-probabilities, so the most
        # likely word is the minimum; argmax picked the LEAST likely word.
        next_word = np.argmin(probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    # Gated (reset/update) combination of a word's character vectors into a
    # single ndims-dimensional word vector; parameters are per word length.
    wlen = len(char_seq)
    if 'rgW%d'%wlen not in self.param_exprs:
        # lazily cache the parameter expressions for this word length
        self.param_exprs['rgW%d'%wlen] = dy.parameter(self.params['reset_gate_W'][wlen-1])
        self.param_exprs['rgb%d'%wlen] = dy.parameter(self.params['reset_gate_b'][wlen-1])
        self.param_exprs['cW%d'%wlen] = dy.parameter(self.params['com_W'][wlen-1])
        self.param_exprs['cb%d'%wlen] = dy.parameter(self.params['com_b'][wlen-1])
        self.param_exprs['ugW%d'%wlen] = dy.parameter(self.params['update_gate_W'][wlen-1])
        self.param_exprs['ugb%d'%wlen] = dy.parameter(self.params['update_gate_b'][wlen-1])

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d'%wlen] * chars + self.param_exprs['rgb%d'%wlen])
    # comb = [composed candidate ; original characters], (wlen+1) slots per dim
    comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d'%wlen] * dy.cmult(reset_gate,chars) + self.param_exprs['cb%d'%wlen]),chars])
    update_logits = self.param_exprs['ugW%d'%wlen] * comb + self.param_exprs['ugb%d'%wlen]
    # per-dimension softmax over the (wlen+1) candidates (composed + each char)
    update_gate = dy.transpose(dy.concatenate_cols([dy.softmax(dy.pickrange(update_logits,i*(wlen+1),(i+1)*(wlen+1))) for i in xrange(self.options['ndims'])]))
    # The following implementation of the softmax function is not safe, but faster...
    #exp_update_logits = dy.exp(dy.reshape(update_logits,(self.options['ndims'],wlen+1)))
    #update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] *(wlen+1)))
    #assert (not np.isnan(update_gate.npvalue()).any())
    # word vector = per-dimension convex combination of the candidates
    word = dy.sum_cols(dy.cmult(update_gate,dy.reshape(comb,(self.options['ndims'],wlen+1))))
    return word
def word_repr(self, char_seq, cembs):
    # obtain the word representation when given its character sequence
    # `cembs` are the character embedding expressions; `char_seq` is the raw
    # character tuple used to look up known full-word embeddings.
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        # lazily cache the parameter expressions for this word length
        self.param_exprs['rgW%d' % wlen] = dy.parameter(
            self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(
            self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(
            self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(
            self.params['com_b'][wlen - 1])

    chars = dy.concatenate(cembs)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                             self.param_exprs['rgb%d' % wlen])
    word = dy.tanh(self.param_exprs['cW%d' % wlen] *
                   dy.cmult(reset_gate, chars) +
                   self.param_exprs['cb%d' % wlen])
    # for in-vocabulary words, average the composed vector with the word's
    # dedicated embedding
    if self.known_words is not None and tuple(
            char_seq) in self.known_words:
        return (word + dy.lookup(self.params['word_embed'],
                                 self.known_words[tuple(char_seq)])) / 2.
    return word
def calc_score_of_history(words):
    """Score the vocabulary given a fixed-size history of word ids."""
    embedded = dy.concatenate([W_emb[w] for w in words])
    hidden = dy.tanh(dy.affine_transform([b_h, W_h, embedded]))
    return dy.affine_transform([b_sm, W_sm, hidden])
def get_decode_loss(self, src_encodings, tgt_sents):
    # Batched attentional-decoder NLL with input feeding; returns the mean
    # per-sentence loss over the batch.
    W_s = dy.parameter(self.W_s)
    b_s = dy.parameter(self.b_s)
    W_h = dy.parameter(self.W_h)
    b_h = dy.parameter(self.b_h)
    W_y = dy.parameter(self.W_y)
    b_y = dy.parameter(self.b_y)
    tgt_words, tgt_masks = input_transpose(tgt_sents)
    batch_size = len(tgt_sents)

    # initial decoder state from the last encoder state
    s = self.dec_builder.initial_state(
        [dy.tanh(W_s * src_encodings[-1] + b_s)])
    # input-feeding context from the previous step; zero vector at t=0
    ctx_tm1 = dy.vecInput(self.args.hidden_size * 2)
    losses = []
    # start from <S>, until y_{T-1}
    for t, (y_ref_t, mask_t) in enumerate(zip(tgt_words[1:], tgt_masks[1:]), start=1):
        y_tm1_embed = dy.lookup_batch(self.tgt_lookup, tgt_words[t - 1])
        x = dy.concatenate([y_tm1_embed, ctx_tm1])
        s = s.add_input(x)
        h_t = s.output()
        ctx_t, alpha_t = self.attention(src_encodings, h_t, batch_size)
        # read_out = dy.tanh(W_h * dy.concatenate([h_t, ctx_t]) + b_h)
        read_out = dy.tanh(
            dy.affine_transform([b_h, W_h, dy.concatenate([h_t, ctx_t])]))
        if args.dropout > 0.:
            read_out = dy.dropout(read_out, args.dropout)
        y_t = W_y * read_out + b_y
        loss_t = dy.pickneglogsoftmax_batch(y_t, y_ref_t)
        # zero the loss at padded positions
        if 0 in mask_t:
            mask_expr = dy.inputVector(mask_t)
            mask_expr = dy.reshape(mask_expr, (1, ), batch_size)
            loss_t = loss_t * mask_expr
        losses.append(loss_t)
        ctx_tm1 = ctx_t
    loss = dy.esum(losses)
    loss = dy.sum_batches(loss) / batch_size
    return loss
def __call__(self, translator, dec_state, src, trg):
    # REINFORCE-style loss: sample a translation per batch element, score it
    # with the evaluation metric, and weight the sampled log-probs by
    # (baseline - reward); optionally also train a baseline predictor.
    # TODO: apply trg.mask ?
    samples = []
    logsofts = []
    self.bs = []
    done = [False for _ in range(len(trg))]
    for _ in range(self.sample_length):
        dec_state.context = translator.attender.calc_context(dec_state.rnn_state.output())
        if self.use_baseline:
            # baseline reward prediction from the (gradient-blocked) readout
            h_t = dy.tanh(translator.decoder.context_projector(dy.concatenate([dec_state.rnn_state.output(), dec_state.context])))
            self.bs.append(self.baseline(dy.nobackprop(h_t)))
        logsoft = dy.log_softmax(translator.decoder.get_scores(dec_state))
        sample = logsoft.tensor_value().categorical_sample_log_prob().as_numpy()[0]
        # Keep track of previously sampled EOS
        sample = [sample_i if not done_i else Vocab.ES for sample_i, done_i in zip(sample, done)]
        # Appending and feeding in the decoder
        logsoft = dy.pick_batch(logsoft, sample)
        logsofts.append(logsoft)
        samples.append(sample)
        dec_state = translator.decoder.add_input(dec_state, translator.trg_embedder.embed(xnmt.batcher.mark_as_batch(sample)))
        # Check if we are done.
        done = list(six.moves.map(lambda x: x == Vocab.ES, sample))
        if all(done):
            break
    samples = np.stack(samples, axis=1).tolist()
    self.eval_score = []
    for trg_i, sample_i in zip(trg, samples):
        # Removing EOS
        try:
            idx = sample_i.index(Vocab.ES)
            sample_i = sample_i[:idx]
        except ValueError:
            pass
        try:
            idx = trg_i.words.index(Vocab.ES)
            trg_i.words = trg_i.words[:idx]
        except ValueError:
            pass
        # Calculate the evaluation score
        score = 0 if not len(sample_i) else self.evaluation_metric.evaluate_fast(trg_i.words, sample_i)
        self.eval_score.append(score)
    self.true_score = dy.inputTensor(self.eval_score, batched=True)
    loss = LossBuilder()
    if self.use_baseline:
        # scale each step's log-prob by (baseline - reward)
        for i, (score, _) in enumerate(zip(self.bs, logsofts)):
            logsofts[i] = dy.cmult(logsofts[i], score - self.true_score)
        loss.add_loss("Reinforce", dy.sum_elems(dy.esum(logsofts)))
    else:
        loss.add_loss("Reinforce", dy.sum_elems(dy.cmult(-self.true_score, dy.esum(logsofts))))
    if self.use_baseline:
        # regression loss that pulls the baseline toward the observed reward
        baseline_loss = []
        for bs in self.bs:
            baseline_loss.append(dy.squared_distance(self.true_score, bs))
        loss.add_loss("Baseline", dy.sum_elems(dy.esum(baseline_loss)))
    return loss
def compute_embeddings(self, word, runtime=True):
    # Compute a word embedding from its characters using stacked BiLSTMs and
    # attention; returns (embedding expression, per-character BiLSTM outputs).
    # When runtime is False, recurrent dropout (0.33) is enabled for training.
    x_list = []
    if not isinstance(word, unicode):
        uniword = unicode(word, 'utf-8')
    else:
        import copy
        uniword = copy.deepcopy(word)
    # conflate all digits to '0'
    uniword = re.sub('\d', '0', uniword)
    for i in range(len(uniword)):
        char = uniword[i]
        # 3-dim one-hot casing feature appended to each char embedding
        if char.lower() == char and char.upper() == char:
            style_emb = dy.inputVector([1.0, 0.0, 0.0])  # does not support uppercase
        elif char.lower() == char:
            style_emb = dy.inputVector([0.0, 1.0, 0.0])  # is lowercased
        else:
            style_emb = dy.inputVector([0.0, 0.0, 1.0])  # is uppercased
        char = char.lower()
        if char in self.encodings.char2int:
            x_list.append(dy.concatenate([self.character_lookup[self.encodings.char2int[char]], style_emb]))
        else:
            x_list.append(dy.concatenate([self.character_lookup[self.encodings.char2int['<UNK>']], style_emb]))

    rnn_outputs = x_list
    rnn_states_fw = None
    rnn_states_bw = None
    # one forward/backward pair per BiLSTM layer; each layer consumes the
    # concatenated outputs of the previous one
    for rnn_fw, rnn_bw in zip(self.rnn_fw, self.rnn_bw):
        fw = []
        bw = []
        if runtime:
            rnn_fw.set_dropouts(0, 0)
            rnn_bw.set_dropouts(0, 0)
        else:
            rnn_fw.set_dropouts(0, 0.33)
            rnn_bw.set_dropouts(0, 0.33)
        rnn_fw = rnn_fw.initial_state()
        rnn_bw = rnn_bw.initial_state()
        rnn_states_fw = []
        rnn_states_bw = []
        for x in rnn_outputs:
            rnn_fw = rnn_fw.add_input(x)
            rnn_states_fw.append(rnn_fw)
            fw.append(rnn_states_fw[-1].output())
        for x in reversed(rnn_outputs):
            rnn_bw = rnn_bw.add_input(x)
            rnn_states_bw.append(rnn_bw)
            bw.append(rnn_states_bw[-1].output())
        rnn_outputs = []
        for x1, x2 in zip(fw, reversed(bw)):
            rnn_outputs.append(dy.concatenate([x1, x2]))

    # attend over the top layer using its final fw/bw states as queries
    attention = self._attend(rnn_outputs, rnn_states_fw[-1], rnn_states_bw[-1])
    pre_linear = dy.concatenate([fw[-1], bw[-1], attention])
    embedding = dy.tanh(self.linearW.expr() * pre_linear + self.linearB.expr())
    return embedding, rnn_outputs
def __call__(self, inputs, is_train=True):
    # Relation classifier over (NER features, constituent path, dependency
    # path); returns softmax probabilities over relation labels.
    ners, constituent_path, dep_path = inputs
    dy.renew_cg()
    #make ner a dynet expression
    ners_vec = dy.vecInput(LENGTH_OF_NER)
    ners_vec.set(ners)
    #get vector from lstm on constituent path
    if len(constituent_path) > 0:
        # the path alternates word / arrow tokens
        constituent_path = [
            self.word_embeds[x] if i % 2 == 0 else self.arrow_embeds[x]
            for i, x in enumerate(constituent_path)
        ]
        if is_train:
            constituent_path = [
                dy.dropout(x, self.dropout) for x in constituent_path
            ]
        lstm_init1 = self.constituent_lstm.initial_state()
        cons_vec = lstm_init1.transduce(constituent_path)[-1]
    else:
        # empty path -> zero vector of LSTM-output size
        cons_vec = dy.vecInput(self.lstm_dim)
    #get vector from lstm on dependency path
    if len(dep_path) > 0:
        dep_vec = []
        # the path cycles word / arrow / dependency-label tokens
        for i, x in enumerate(dep_path):
            if i % 3 == 0:
                dep_vec.append(self.word_embeds[x])
            elif i % 3 == 1:
                dep_vec.append(self.arrow_embeds[x])
            else:
                dep_vec.append(self.dep_embeds[x])
        if is_train:
            dep_vec = [dy.dropout(x, self.dropout) for x in dep_vec]
        lstm_init2 = self.dependency_lstm.initial_state()
        dep_vec = lstm_init2.transduce(dep_vec)[-1]
    else:
        dep_vec = dy.vecInput(self.lstm_dim)
    final_input = dy.concatenate([ners_vec, cons_vec, dep_vec])
    # two-hidden-layer MLP + softmax over labels
    return dy.softmax(self.W3 * dy.tanh(
        self.W2 * dy.tanh(self.W1 * final_input + self.b1) + self.b2) +
        self.b3)
def attend_with_prev(self, state, w1dt, prev_att):
    """Attention weights conditioned on the decoder state and previous attention."""
    state_component = self.attention_w2 * state
    prev_component = self.attention_w3 * prev_att
    combined = dy.colwise_add(dy.colwise_add(w1dt, state_component), prev_component)
    scores = dy.transpose(self.attention_v * dy.tanh(combined))
    return dy.softmax(scores)
def predict_output(self, x):
    """Sample an output index from a one-hidden-layer MLP's softmax.

    Returns (sampled index, probability assigned to that index).
    """
    vec = dy.inputVector(x)
    hidden = dy.tanh(self.W * vec + self.b_bias)
    probs = dy.softmax(self.U * hidden + self.d_bias).npvalue()
    # renormalize to guard against tiny float drift before sampling
    choice = np.random.choice(self.inp_dim, p=probs / probs.sum())
    return choice, probs[choice]
def predict(self, x):
    """Return the 20 highest-scoring class indices and the full softmax vector."""
    vec = dy.inputVector(x)
    logits = (self.U * dy.tanh(self.W * vec + self.b)) + self.d
    softmax = dy.softmax(logits).npvalue()
    top_positions = heapq.nlargest(20, range(len(softmax)), key=softmax.__getitem__)
    return top_positions, softmax
def add_input(self, input_vec):
    """One manual LSTM step: update self.c / self.h, record the output, return self."""
    xh = dynet.concatenate([input_vec, self.h])
    in_gate = dynet.logistic(self.W_i * xh + self.b_i)
    forget_gate = dynet.logistic(self.W_f * xh + self.b_f)
    cell_candidate = dynet.tanh(self.W_c * xh + self.b_c)
    out_gate = dynet.logistic(self.W_o * xh + self.b_o)
    next_c = dynet.cwise_multiply(forget_gate, self.c) + dynet.cwise_multiply(in_gate, cell_candidate)
    next_h = dynet.cwise_multiply(out_gate, dynet.tanh(next_c))
    self.c = next_c
    self.h = next_h
    self.outputs.append(next_h)
    return self
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    """MLP attention over source columns for one target state.

    Returns (context vector, alignment weights).
    """
    w1_att_src = dy.parameter(w1_att_src_p)  # loaded for graph parity; scores use the precomputed fixed component
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    hidden = dy.tanh(dy.colwise_add(fixed_attentional_component, w1_att_tgt * tgt_output_embedding))
    a_t = dy.transpose(hidden) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
def __call__(self, inputs):
    """Softmax over a one-hidden-layer MLP applied to concatenated embeddings."""
    embeddings = [self.E[idx] for idx in inputs]
    x = dy.concatenate(embeddings)
    hidden = dy.tanh((self.pW * x) + self.pB_1)
    return dy.softmax(self.pV * hidden + self.pB_2)
def __call__(self, input_expr):
    """Two-layer MLP: W2 * tanh(W1 * x + b1) + b2 (no output nonlinearity)."""
    weight1 = dy.parameter(self.W1)
    weight2 = dy.parameter(self.W2)
    bias1 = dy.parameter(self.b1)
    bias2 = dy.parameter(self.b2)
    hidden = dy.tanh(weight1 * input_expr + bias1)
    return weight2 * hidden + bias2
def get_gen_vocab_embedding(self, current_state_output, context_vector, w, b):
    """Score the generation vocabulary from the decoder state and attention context.

    Projects [state; context] through an affine layer + tanh, then scores every
    vocabulary embedding against the result.  Returns the score vector.
    """
    voc_lookup = dy.parameter(self.gentokenLookup)
    state = dy.concatenate([current_state_output, context_vector])
    g = dy.tanh(dy.affine_transform([b, w, state]))
    # one score per vocabulary row
    return dy.transpose(voc_lookup) * g
def predict_next_(self, state, *args, **kwargs):
    """Soft attention over the biencoder states, then a softmax over the vocabulary."""
    (R, bias, W_c, W__a, U__a, v__a) = self.cg_params
    # soft attention vector
    scores = [
        v__a * dy.tanh(W__a * state.output() + U__a * enc)
        for enc in self.biencoder
    ]
    alphas = dy.softmax(dy.concatenate(scores))
    context = dy.esum([
        enc * dy.pick(alphas, j) for j, enc in enumerate(self.biencoder)
    ])
    # softmax over vocabulary
    h_output = dy.tanh(W_c * dy.concatenate([state.output(), context]))
    return dy.softmax(R * h_output + bias)
def predict_next(self, scores=False, hidden=False):
    """Attention + readout for the current decoder state.

    Returns softmax probabilities by default, raw scores when `scores` is
    True, or the hidden readout vector when `hidden` is True.
    """
    (R, bias, W_c, W__a, U__a, v__a) = self.cg_params
    # soft attention vector
    att_scores = [v__a * dy.tanh(W__a * self.s.output() + U__a * enc) for enc in self.biencoder]
    alphas = dy.softmax(dy.concatenate(att_scores))
    context = dy.esum([enc * dy.pick(alphas, j) for j, enc in enumerate(self.biencoder)])
    # readout over [state; context]
    h_output = dy.tanh(W_c * dy.concatenate([self.s.output(), context]))
    if hidden:
        return h_output
    logits = R * h_output + bias
    return logits if scores else dy.softmax(logits)
def get_scores(self, mlp_dec_state):
    """Get scores given a current state.

    :param mlp_dec_state: An MlpSoftmaxDecoderState object.
    :returns: Scores over the vocabulary given this state.
    """
    combined = dy.concatenate([mlp_dec_state.rnn_state.output(), mlp_dec_state.context])
    h_t = dy.tanh(self.context_projector(combined))
    return self.vocab_projector(h_t)
def calc_attention(self, state):
    """Softmax attention scores over the precomputed input projection self.WI."""
    v_param = dy.parameter(self.pV)
    u_param = dy.parameter(self.pU)
    hidden = dy.tanh(dy.colwise_add(self.WI, v_param * state))
    return dy.softmax(dy.transpose(u_param * hidden))
def attend(self, input_mat, state, w1dt):
    """MLP attention: return the context vector over `input_mat` columns."""
    w2 = dy.parameter(self.attention_w2)
    v = dy.parameter(self.attention_v)
    # project the full decoder state stack
    state_proj = w2 * dy.concatenate(list(state.s()))
    weights = dy.softmax(dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, state_proj))))
    return input_mat * weights
def __calc_attn_score(self, W1_att_f, W1_att_e, w2_att, h_fs_matrix, h_e):
    """Unnormalized MLP attention scores of decoder state h_e over encoder matrix.

    Returns a column of scores, one per encoder position.
    """
    # NOTE(review): npvalue() forces evaluation of the graph just to read the
    # number of columns; if the source length is available to the caller it
    # would be cheaper to pass it in.
    num_cols = h_fs_matrix.npvalue().shape[1]
    h_e_matrix = dy.concatenate_cols([h_e for _ in range(num_cols)])
    layer_1 = dy.tanh(W1_att_f * h_fs_matrix + W1_att_e * h_e_matrix)
    return dy.transpose(layer_1) * w2_att
def calc_scores(words):
    """Score all classes for the word marked by the first 1 in `words`."""
    dy.renew_cg()
    # index of the first occurrence of 1 selects the embedding
    idx = words.index(1)
    emb = dy.lookup(W_emb, idx)
    hidden = dy.tanh(dy.parameter(W_h) * emb + dy.parameter(b_h))
    softmax_w = dy.parameter(W_sm)
    softmax_b = dy.parameter(b_sm)
    return softmax_w * hidden + softmax_b
def calc_loss(sents):
    """Batched NLL loss for an encoder-decoder over (src, tgt) sentence pairs.

    Returns (summed loss over the batch, number of non-padding target words).
    """
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    # transpose the source batch: one list of word ids per time step
    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs(
        [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()

    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    # BUG FIX: lengths must come from the target sentences, not the (src, tgt)
    # pairs — len(pair) is always 2, which truncated every target to one step.
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append(
            [sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s(
        [src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the current state into the
        current_state = current_state.add_input(
            dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        # zero the loss at padded positions
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1, ), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def calc_loss(sent): dy.renew_cg() # Transduce all batch elements with an LSTM src = sent[0] trg = sent[1] # initialize the LSTM init_state_src = LSTM_SRC_BUILDER.initial_state() # get the output of the first LSTM src_output = init_state_src.add_inputs([LOOKUP_SRC[x] for x in src])[-1].output() # Now compute mean and standard deviation of source hidden state. W_mean = dy.parameter(W_mean_p) V_mean = dy.parameter(V_mean_p) b_mean = dy.parameter(b_mean_p) W_var = dy.parameter(W_var_p) V_var = dy.parameter(V_var_p) b_var = dy.parameter(b_var_p) # The mean vector from the encoder. mu = mlp(src_output, W_mean, V_mean, b_mean) # This is the diagonal vector of the log co-variance matrix from the encoder # (regard this as log variance is easier for furture implementation) log_var = mlp(src_output, W_var, V_var, b_var) # Compute KL[N(u(x), sigma(x)) || N(0, I)] # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2])) - dy.exp(log_var)) z = reparameterize(mu, log_var) # now step through the output sentence all_losses = [] current_state = LSTM_TRG_BUILDER.initial_state().set_s([z, dy.tanh(z)]) prev_word = trg[0] W_sm = dy.parameter(W_sm_p) b_sm = dy.parameter(b_sm_p) for next_word in trg[1:]: # feed the current state into the current_state = current_state.add_input(LOOKUP_TRG[prev_word]) output_embedding = current_state.output() s = dy.affine_transform([b_sm, W_sm, output_embedding]) all_losses.append(dy.pickneglogsoftmax(s, next_word)) prev_word = next_word softmax_loss = dy.esum(all_losses) return kl_loss, softmax_loss
def synthesize(self, mgc, batch_size, sample=True, temperature=1.0):
    """Autoregressively synthesize audio samples conditioned on `mgc` frames.

    Args:
        mgc: conditioning feature matrix (frames x features).
        batch_size: number of samples generated per computation graph.
        sample: draw from the output distribution if True, else argmax.
        temperature: sampling temperature forwarded to _pick_sample.

    Returns:
        List of generated sample indices.
    """
    synth = []
    total_audio_len = mgc.shape[0] * len(self.upsample_w_s)
    # BUG FIX: use explicit floor division (safe on py2 and py3) and actually
    # increment num_batches for the trailing partial batch — the original
    # `num_batches + 1` was a no-op expression statement.
    num_batches = total_audio_len // batch_size
    if total_audio_len % batch_size != 0:
        num_batches += 1
    last_rnn_state = None
    last_sample = 127
    w_index = 0
    last_proc = 0
    for iBatch in range(num_batches):
        # rebuild the graph per batch, carrying the RNN state across as values
        dy.renew_cg()
        start = batch_size * iBatch
        stop = batch_size * (iBatch + 1)
        if stop >= total_audio_len:
            stop = total_audio_len - 1
        upsampled = self._upsample(mgc, start, stop)
        rnn = self.rnn.initial_state()
        if last_rnn_state is not None:
            rnn_state = [dy.inputVector(s) for s in last_rnn_state]
            rnn = rnn.set_s(rnn_state)
        out_list = []
        for index in range(stop - start):
            w_index += 1
            # progress display in 5% steps (integer percent, floor division)
            curr_proc = w_index * 100 // total_audio_len
            if curr_proc % 5 == 0 and curr_proc != last_proc:
                last_proc = curr_proc
                sys.stdout.write(' ' + str(curr_proc))
                sys.stdout.flush()
            # feed back the previous sample, embedded or scaled to [-1, 1]
            if self.OUTPUT_EMB_SIZE != 1:
                rnn_input = dy.concatenate([self.output_lookup[last_sample], upsampled[index]])
            else:
                rnn_input = dy.concatenate([dy.scalarInput(float(last_sample) / 127.0 - 1.0), upsampled[index]])
            rnn = rnn.add_input(rnn_input)
            rnn_output = rnn.output()
            hidden = rnn_output
            for w, b in zip(self.mlp_w, self.mlp_b):
                hidden = dy.tanh(w.expr(update=True) * hidden + b.expr(update=True))
            softmax_output = dy.softmax(
                self.softmax_w.expr(update=True) * hidden + self.softmax_b.expr(update=True))
            out_list.append(softmax_output)
            if sample:
                last_sample = self._pick_sample(softmax_output.npvalue(), temperature=temperature)
            else:
                last_sample = np.argmax(softmax_output.npvalue())
            synth.append(last_sample)
        # snapshot the RNN state as plain values for the next graph
        rnn_state = rnn.s()
        last_rnn_state = [s.value() for s in rnn_state]
    return synth
def generate(sent):
    """Greedily decode a translation of `sent` with a BiLSTM encoder + attention.

    Returns (target token list, attention matrix as a numpy array).
    """
    dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent
    # get the output of the first LSTM
    src_outputs = [dy.concatenate([x.output(), y.output()]) for x, y in LSTM_SRC.add_inputs([LOOKUP_SRC[word] for word in src])]
    src_output = src_outputs[-1]
    # gets the parameters for the attention
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    # the source-side attention projection is input-independent: compute once
    fixed_attentional_component = w1_att_src * src_output_matrix
    # generate until a eos tag or max is reached
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_word = sos_trg
    trg_sent = []
    attention_matrix = []
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)
    for i in range(MAX_SENT_SIZE):
        # feed the previous word into the lstm, calculate the most likely word, add it to the sentence
        current_state = current_state.add_input(LOOKUP_TRG[prev_word])
        output_embedding = current_state.output()
        att_output, alignment = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        attention_matrix.append(alignment)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        probs = (-dy.log_softmax(s)).value()
        # BUG FIX: `probs` holds *negative* log-probabilities, so the most
        # likely word is the minimum; argmax picked the LEAST likely word.
        next_word = np.argmin(probs)
        if next_word == eos_trg:
            break
        prev_word = next_word
        trg_sent.append(i2w_trg[next_word])
    return trg_sent, dy.concatenate_cols(attention_matrix).value()
def predict_logprobs(self, X, Y, structural=True, hidden_out=False):
    """
    Returns the log probabilities of the predictions for this model (batched version).

    @param X: the input indexes from which to predict (each xdatum is expected to be an iterable of integers)
    @param Y: a list of references indexes for which to extract the prob
    @param structural: switches between structural and lexical logprob evaluation
    @param hidden_out: declared for API compatibility; NOTE(review): the
           original implementation never used it and no hidden vectors are
           returned — confirm intended behavior with callers.
    @return the list of predicted logprobabilities for each of the provided ref y in Y
    """
    assert(len(X) == len(Y))
    assert(all(len(x) == self.input_length for x in X))

    # The three original branches (structural / tied-lexical / untied-lexical)
    # only differed in the output projection matrix; consolidate the shared
    # batched forward pass.
    dy.renew_cg()
    W = dy.parameter(self.hidden_weights)
    E = dy.parameter(self.input_embeddings)
    if structural:
        O = dy.parameter(self.action_weights)
    elif self.tied:
        # tied lexical model reuses the input embeddings as output weights
        O = E
    else:
        O = dy.parameter(self.output_embeddings)

    batched_X = zip(*X)  # transposes the X matrix
    embeddings = [dy.pick_batch(E, xcolumn) for xcolumn in batched_X]
    xdense = dy.concatenate(embeddings)
    preds = dy.pickneglogsoftmax_batch(O * dy.tanh(W * xdense), Y).value()
    # negate NLLs to obtain log probabilities
    return [-ypred for ypred in preds]
def __attention_mlp(self, H_f, h_e, W1_att_e, W1_att_f, w2_att):
    """MLP attention: context vector over encoder matrix H_f for state h_e."""
    # Calculate the alignment score vector
    hidden = dy.tanh(dy.colwise_add(W1_att_f * H_f, W1_att_e * h_e))
    scores = (w2_att * hidden)[0]
    alignment = dy.softmax(scores)
    return H_f * alignment
def calc_loss(sents):
    """Batched NLL loss for an encoder-decoder over (src, tgt) sentence pairs.

    Returns (summed loss over the batch, number of non-padding target words).
    """
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []

    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0

    # transpose the source batch: one list of word ids per time step
    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])

    #initialize the LSTM
    init_state_src = LSTM_SRC_BUILDER.initial_state()

    #get the output of the first LSTM
    src_output = init_state_src.add_inputs([dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])[-1].output()

    #now decode
    all_losses = []

    # Decoder
    #need to mask padding at end of sentence
    tgt_cws = []
    # BUG FIX: lengths must come from the target sentences, not the (src, tgt)
    # pairs — len(pair) is always 2, which truncated every target to one step.
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []

    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)

    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)

    for next_words, mask in zip(tgt_cws[1:], masks):
        #feed the current state into the
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        loss = (dy.pickneglogsoftmax_batch(s, next_words))
        # zero the loss at padded positions
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        mask_loss = loss * mask_expr
        all_losses.append(mask_loss)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def calc_attention(self, state):
    """Normalized attention over self.WI; also records the vector in self.attention_vecs."""
    v_param = dy.parameter(self.pV)
    u_param = dy.parameter(self.pU)
    hidden = dy.tanh(dy.colwise_add(self.WI, v_param * state))
    normalized = dy.softmax(dy.transpose(u_param * hidden))
    self.attention_vecs.append(normalized)
    return normalized
def mlp(rnn_ouput, params):
    """One-hidden-layer MLP: tanh hidden layer followed by a softmax
    output layer, with weights/biases taken from the params dict.

    Note: the parameter name `rnn_ouput` (sic) is kept for compatibility
    with callers that pass it by keyword.
    """
    hidden = dy.tanh(params["w1"] * rnn_ouput + params["b1"])
    return dy.softmax(params["w2"] * hidden + params["b2"])
def _attend(self, query, mask=None):
    """MLP attention over the pre-projected encoder context.

    query: decoder state, shape ((H), B).
    mask:  optional (keep, penalty) pair of shape ((T, 1), B); masked
           positions are pushed to a large negative score before softmax.
    Returns attention weights of shape ((T, 1), B).
    """
    # Project the decoder state once; broadcast over the T context columns.
    projected = self.decoder * query                                  # ((H,), B)
    activations = dy.tanh(dy.colwise_add(self.context_proj, projected))  # ((H, T), B)
    scores = dy.transpose(self.v * activations)                       # ((T, 1), B)
    if mask is not None:
        # Keep real positions; drive padded ones toward -inf.
        scores = dy.cmult(scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(scores)
def __attention_mlp_batch(self, H_f_batch, h_e_batch, W1_att_e, W1_att_f, w2_att):
    """Batched MLP attention: one context vector per batch element.

    H_f_batch: (2 * hidden_size, num_step, batch_size) encoder matrix.
    h_e_batch: (hidden_size, batch_size) decoder state.
    Returns a (2 * hidden_size, batch_size) context batch.
    """
    hidden = dy.tanh(dy.colwise_add(W1_att_f * H_f_batch, W1_att_e * h_e_batch))
    # (1, num_step, batch_size) -> (num_step, batch_size)
    scores = (w2_att * hidden)[0]
    weights = dy.softmax(scores)
    # Weighted sum of the encoder columns per batch element.
    return H_f_batch * weights
def truth_score(self, word_seq):
    """Score the gold segmentation: for each position, add the link score
    between the projected LSTM state and the next word's embedding, plus
    that word's unigram score against U."""
    wembs = [self.param_exprs['<bos>']] + [self.word_repr(w) for w in word_seq]
    hidden_states = self.params['lstm'].initial_state().transduce(wembs)
    total = dy.scalarInput(0.)
    # hidden_states[i] predicts wembs[i+1].
    for state, wemb in zip(hidden_states[:-1], wembs[1:]):
        proj = dy.tanh(self.param_exprs['pW'] * state + self.param_exprs['pb'])
        total = total + dy.dot_product(proj, wemb) + dy.dot_product(wemb, self.param_exprs['U'])
    return total
def calc_score_of_history(words, dropout=0.0):
    """Score one n-gram history: concatenate the word embeddings, apply a
    tanh hidden layer (with optional dropout), then the output affine."""
    embedded = dy.concatenate([W_emb[w] for w in words])
    hidden = dy.tanh(dy.affine_transform([b_h, W_h, embedded]))
    # Dropout is applied only when a nonzero rate is requested.
    if dropout != 0.0:
        hidden = dy.dropout(hidden, dropout)
    return dy.affine_transform([b_sm, W_sm, hidden])
def calc_score_of_histories(words, dropout=0.0):
    """Batched history scorer: embeds each history position across the
    batch, applies a tanh hidden layer (with optional dropout), then the
    output affine."""
    # Transpose from a list of histories to a list of per-position batches.
    words = np.transpose(words)
    embedded = dy.concatenate([dy.lookup_batch(W_emb, batch) for batch in words])
    hidden = dy.tanh(dy.affine_transform([b_h, W_h, embedded]))
    # Dropout is applied only when a nonzero rate is requested.
    if dropout != 0.0:
        hidden = dy.dropout(hidden, dropout)
    return dy.affine_transform([b_sm, W_sm, hidden])
def expr_for_tree(self, tree):
    """Recursively compose an expression for a binarized parse tree.

    Leaves are embedding lookups (index 0 for unknown labels); unary nodes
    pass their single leaf child through; binary nodes combine the children
    as tanh(W [e1; e2]).
    """
    if tree.isleaf():
        return self.E[self.w2i.get(tree.label, 0)]
    if len(tree.children) == 1:
        assert(tree.children[0].isleaf())
        return self.expr_for_tree(tree.children[0])
    assert(len(tree.children) == 2), tree.children[0]
    left = self.expr_for_tree(tree.children[0])
    right = self.expr_for_tree(tree.children[1])
    W = dy.parameter(self.W)
    return dy.tanh(W * dy.concatenate([left, right]))
def attend(input_mat, state, w1dt):
    """MLP attention returning the context vector for the decoder state.

    input_mat: (encoder_state x seqlen) source vectors as columns.
    w1dt:      (attdim x seqlen) pre-computed W1 * input_mat.
    Returns the (encoder_state,) weighted sum of source columns.
    """
    global attention_w2
    global attention_v
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    # Project the full decoder state (all layers and memory cells) once.
    w2dt = w2 * dy.concatenate(list(state.s()))
    # One unnormalized score per source position, as a row vector.
    scores = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
    weights = dy.softmax(scores)
    # Context = attention-weighted sum of the source columns.
    return input_mat * weights
def attend(input_vectors, state):
    """MLP attention over a list of encoder vectors; returns their
    attention-weighted sum."""
    global attention_w1
    global attention_w2
    global attention_v
    w1 = dy.parameter(attention_w1)
    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    # The decoder-side term is constant across positions; compute it once.
    w2dt = w2 * dy.concatenate(list(state.s()))
    scores = [v * dy.tanh(w1 * vec + w2dt) for vec in input_vectors]
    weights = dy.softmax(dy.concatenate(scores))
    # Weighted sum of the encoder vectors.
    return dy.esum([vec * wt for vec, wt in zip(input_vectors, weights)])
def attend2(blstm_outputs, s_prev, y_feedback, v_a, W_a, U_a, U_o, V_o, C_o):
    """Bahdanau-style attention producing the pre-output vector.

    W_a: hidden x hidden, U_a: hidden x 2*hidden, v_a: hidden; each score
    is a scalar. U_o: 2l x hidden, V_o: 2l x input, C_o: 2l x 2*hidden.
    Returns (attention output vector, alignment weights).
    """
    # One scalar alignment score per biLSTM state.
    scores = [v_a * pc.tanh(W_a * s_prev + U_a * h_j) for h_j in blstm_outputs]
    alphas = pc.softmax(pc.concatenate(scores))
    # Context c_i (2*hidden): expectation of the states under alphas.
    c_i = pc.esum([h_j * pc.pick(alphas, j) for j, h_j in enumerate(blstm_outputs)])
    attention_output_vector = U_o * s_prev + V_o * y_feedback + C_o * c_i
    return attention_output_vector, alphas
def build_tagging_graph(words):
    """Build the tagging graph for one sentence: per-word representations
    (with additive noise as regularization) -> word-level biLSTM -> a
    per-token MLP producing tag scores."""
    dy.renew_cg()
    # Parameters -> expressions.
    H = dy.parameter(pH)
    O = dy.parameter(pO)
    # Fresh initial states for the word- and char-level RNNs.
    f_init = fwdRNN.initial_state()
    b_init = bwdRNN.initial_state()
    cf_init = cFwdRNN.initial_state()
    cb_init = cBwdRNN.initial_state()
    # word_rep(...) yields a 128-dim vector per word; noise is optional
    # regularization.
    wembs = [dy.noise(word_rep(w, cf_init, cb_init), 0.2) for w in words]
    # Transduce in both directions, then pair forward/backward states.
    fw_exps = f_init.transduce(wembs)
    bw_exps = b_init.transduce(reversed(wembs))
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]
    # Feed each biLSTM state through the MLP.
    return [O * dy.tanh(H * x) for x in bi_exps]
# Multi-device XOR example: first layer lives on GPU:1, second on GPU:0,
# and the output layer on the CPU; activations are moved between devices
# explicitly with dy.to_device.
pW1 = m.add_parameters((HIDDEN_SIZE, 2), device="GPU:1")
pb1 = m.add_parameters(HIDDEN_SIZE, device="GPU:1")
pW2 = m.add_parameters((HIDDEN_SIZE, HIDDEN_SIZE), device="GPU:0")
pb2 = m.add_parameters(HIDDEN_SIZE, device="GPU:0")
pV = m.add_parameters((1, HIDDEN_SIZE), device="CPU")
pa = m.add_parameters(1, device="CPU")
# Optionally restore previously saved parameters from a text file.
if len(sys.argv) == 2:
    m.populate_from_textfile(sys.argv[1])
dy.renew_cg()
W1, b1, W2, b2, V, a = dy.parameter(pW1, pb1, pW2, pb2, pV, pa)
# Inputs are placed on the device of the layer that consumes them.
x = dy.vecInput(2, "GPU:1")
y = dy.scalarInput(0, "CPU")
h1 = dy.tanh((W1*x) + b1)
# Move the hidden activation to the next layer's device.
h1_gpu0 = dy.to_device(h1, "GPU:0")
h2 = dy.tanh((W2*h1_gpu0) + b2)
h2_cpu = dy.to_device(h2, "CPU")
# xsent selects the loss: sigmoid + binary log loss vs. linear + squared
# distance; T/F are the target values used for the two classes.
if xsent:
    y_pred = dy.logistic((V*h2_cpu) + a)
    loss = dy.binary_log_loss(y_pred, y)
    T = 1
    F = 0
else:
    y_pred = (V*h2_cpu) + a
    loss = dy.squared_distance(y_pred, y)
    T = 1
    F = -1
def calc_scores(words):
    """Deep CBOW scorer: sum the word embeddings, pass the result through
    each tanh hidden layer in turn, then apply the output affine."""
    dy.renew_cg()
    hidden = dy.esum([dy.lookup(W_emb, w) for w in words])
    for W_layer, b_layer in zip(W_h, b_h):
        hidden = dy.tanh(W_layer * hidden + b_layer)
    return W_sm * hidden + b_sm
def __init__(self, vocab, w2i, pos, rels, options):
    """Build the graph-based dependency parser model.

    vocab:   word-count mapping (size used for the word lookup table).
    w2i:     word -> index mapping; indices are shifted by 3 to make room
             for the special entries below.
    pos:     list of POS tags.
    rels:    list of relation labels.
    options: option object (or dict converted to one) holding flags and
             dimension/layer hyperparameters.
    """
    # Accept plain dicts for convenience.
    if isinstance(options, dict):
        options = _dict_to_obj(options, 'Values')
    self.model = ParameterCollection()
    random.seed(1)  # fixed seed for reproducibility
    self.trainer = AdamTrainer(self.model)
    # Selectable nonlinearities; 'tanh3' is tanh applied to x^3.
    self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify,
                        'tanh3': (lambda x: tanh(cmult(cmult(x, x), x)))}
    self.activation = self.activations[options.activation]
    self.blstm_flag = options.blstmFlag
    self.labels_flag = options.labelsFlag
    self.costaug_flag = options.costaugFlag
    self.bibi_flag = options.bibiFlag
    # Dimensions: LSTM hidden, word/POS/relation embeddings, LSTM layers.
    self.ldims = options.lstm_dims
    self.wdims = options.wembedding_dims
    self.pdims = options.pembedding_dims
    self.rdims = options.rembedding_dims
    self.layers = options.lstm_layers
    self.words_count = vocab
    # Shift indices by 3: 0 is implicit OOV, 1/2 are *PAD*/*INITIAL* below.
    self.vocab = {word: ind + 3 for word, ind in list(w2i.items())}
    self.pos = {word: ind + 3 for ind, word in enumerate(pos)}
    self.rels = {word: ind for ind, word in enumerate(rels)}
    self.irels = rels  # index -> relation label
    # Choose the encoder: stacked bi-LSTMs, plain LSTMs, or simple RNNs.
    if self.bibi_flag:
        self.builders = [LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                         LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]
        # Second bi-LSTM layer consumes the concatenated first-layer outputs.
        self.bbuilders = [LSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
                          LSTMBuilder(1, self.ldims * 2, self.ldims, self.model)]
    elif self.layers > 0:
        self.builders = \
            [LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model),
             LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model)]
    else:
        self.builders = [SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                         SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]
    self.hidden_units = options.hidden_units
    self.hidden2_units = options.hidden2_units
    # Reserved special indices.
    self.vocab['*PAD*'] = 1
    self.pos['*PAD*'] = 1
    self.vocab['*INITIAL*'] = 2
    self.pos['*INITIAL*'] = 2
    # Lookup tables (+3 rows for the special indices above).
    self.wlookup = self.model.add_lookup_parameters((len(vocab) + 3, self.wdims))
    self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
    self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))
    # Arc scorer MLP: separate head/modifier projections share a bias,
    # optional second hidden layer, scalar output score.
    self.hid_layer_foh = self.model.add_parameters((self.hidden_units, self.ldims * 2))
    self.hid_layer_fom = self.model.add_parameters((self.hidden_units, self.ldims * 2))
    self.hid_bias = self.model.add_parameters((self.hidden_units))
    self.hid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
    self.hid2_bias = self.model.add_parameters((self.hidden2_units))
    self.out_layer = self.model.add_parameters(
        (1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
    if self.labels_flag:
        # Label classifier MLP, mirroring the arc scorer but with one
        # output per relation label.
        self.rhid_layer_foh = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
        self.rhid_layer_fom = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
        self.rhid_bias = self.model.add_parameters((self.hidden_units))
        self.rhid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
        self.rhid2_bias = self.model.add_parameters((self.hidden2_units))
        self.rout_layer = self.model.add_parameters(
            (len(self.irels), self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
        self.rout_bias = self.model.add_parameters((len(self.irels)))
def calc_loss(sents):
    """Return (total batch loss, number of real target words) for a
    minibatch of (src, tgt) pairs, using a bi-LSTM encoder and an
    attentional decoder trained with teacher forcing. Loss at padded
    target positions is masked out.
    """
    dy.renew_cg()
    # Split the batch into parallel source / target lists.
    src_sents = [x[0] for x in sents]
    tgt_sents = [x[1] for x in sents]
    src_cws = []
    src_len = [len(sent) for sent in src_sents]
    max_src_len = np.max(src_len)
    num_words = 0
    # Transpose the source batch: one list of batch words per time step.
    for i in range(max_src_len):
        src_cws.append([sent[i] for sent in src_sents])
    # Bi-directional encoding: concatenate forward/backward outputs.
    src_outputs = [dy.concatenate([f.output(), b.output()])
                   for f, b in LSTM_SRC.add_inputs(
                       [dy.lookup_batch(LOOKUP_SRC, cws) for cws in src_cws])]
    src_output = src_outputs[-1]
    # Pre-compute the source-side attention component once per batch.
    src_output_matrix = dy.concatenate_cols(src_outputs)
    w1_att_src = dy.parameter(w1_att_src_p)
    fixed_attentional_component = w1_att_src * src_output_matrix
    # Decoder: mask padding at the end of each target sentence.
    all_losses = []
    tgt_cws = []
    # BUG FIX: lengths must come from the target sentences, not from the
    # (src, tgt) pairs in `sents` — len(pair) == 2 would truncate every
    # target to two decoding steps.
    tgt_len = [len(sent) for sent in tgt_sents]
    max_tgt_len = np.max(tgt_len)
    masks = []
    for i in range(max_tgt_len):
        tgt_cws.append([sent[i] if len(sent) > i else eos_trg for sent in tgt_sents])
        mask = [(1 if len(sent) > i else 0) for sent in tgt_sents]
        masks.append(mask)
        num_words += sum(mask)
    # Seed the decoder state from the final encoder output.
    current_state = LSTM_TRG_BUILDER.initial_state().set_s([src_output, dy.tanh(src_output)])
    prev_words = tgt_cws[0]
    W_sm = dy.parameter(W_sm_p)
    b_sm = dy.parameter(b_sm_p)
    W_m = dy.parameter(W_m_p)
    b_m = dy.parameter(b_m_p)
    # NOTE(review): masks are zipped from index 0 while targets start at
    # index 1, as in the original — looks off by one; confirm intent.
    for next_words, mask in zip(tgt_cws[1:], masks):
        # Teacher forcing: feed the gold previous words into the decoder.
        current_state = current_state.add_input(dy.lookup_batch(LOOKUP_TRG, prev_words))
        output_embedding = current_state.output()
        # Attend over the source, then mix with the decoder output.
        att_output, _ = calc_attention(src_output_matrix, output_embedding, fixed_attentional_component)
        middle_expr = dy.tanh(dy.affine_transform([b_m, W_m, dy.concatenate([output_embedding, att_output])]))
        s = dy.affine_transform([b_sm, W_sm, middle_expr])
        loss = dy.pickneglogsoftmax_batch(s, next_words)
        # Zero out the loss contribution of padded positions.
        mask_expr = dy.inputVector(mask)
        mask_expr = dy.reshape(mask_expr, (1,), len(sents))
        all_losses.append(loss * mask_expr)
        prev_words = next_words
    return dy.sum_batches(dy.esum(all_losses)), num_words
def _combine(self, attn, query):
    """Combine attention and query via the parent class, then squash the
    result with tanh."""
    return dy.tanh(super(LuongAttention, self)._combine(attn, query))
def _combine(self, attn, query):
    """Combine attention and query via the parent class, then squash the
    result with tanh."""
    return dy.tanh(super(ScaledDotProductAttention, self)._combine(attn, query))
def _combine(self, attn, query):
    """Combine attention and query via the parent class, then squash the
    ((H,), B) result with tanh."""
    return dy.tanh(super(DotProductAttention, self)._combine(attn, query))
def mlp(x, W, V, b):
    """Single-hidden-layer MLP with no output bias: V * tanh(W x + b)."""
    hidden = dy.tanh(W * x + b)
    return V * hidden
ITERATIONS = 2000 m = dy.Model() trainer = dy.SimpleSGDTrainer(m) W = m.add_parameters((HIDDEN_SIZE, 2)) b = m.add_parameters(HIDDEN_SIZE) V = m.add_parameters((1, HIDDEN_SIZE)) a = m.add_parameters(1) if len(sys.argv) == 2: m.populate_from_textfile(sys.argv[1]) x = dy.vecInput(2) y = dy.scalarInput(0) h = dy.tanh((W*x) + b) if xsent: y_pred = dy.logistic((V*h) + a) loss = dy.binary_log_loss(y_pred, y) T = 1 F = 0 else: y_pred = (V*h) + a loss = dy.squared_distance(y_pred, y) T = 1 F = -1 for iter in range(ITERATIONS): mloss = 0.0 for mi in range(4):